In [4]:
import numpy as np
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

def calculate_metrics(y_true, y_pred):
    """Calculate evaluation metrics"""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    return mse, rmse, mae

def naive_baseline(X_test, y_test):
    """Naive baseline: predict last known value"""
    # Assuming the target is the last feature of the last timestep
    last_values = X_test[:, -1, -1]  # Last timestep, last feature
    y_pred = last_values
    y_true = y_test
    
    if len(y_true.shape) > 1:
        y_true = y_true.flatten()
    
    return calculate_metrics(y_true, y_pred)

def seasonal_naive_baseline(X_test, y_test, season_length=24):
    """Seasonal naive: predict value from same time previous season"""
    # For weather data, might use daily cycle (24 hours) or weekly cycle
    y_true = y_test
    if len(y_true.shape) > 1:
        y_true = y_true.flatten()
    
    # Simple seasonal naive - use last available seasonal value
    y_pred = np.roll(y_true, season_length)[:len(y_true)]
    # For first season_length predictions, use naive approach
    y_pred[:season_length] = y_true[:season_length]
    
    return calculate_metrics(y_true, y_pred)

def moving_average_baseline(X_test, y_test, window=3):
    """Moving average baseline"""
    # Use last 'window' values from the sequence to predict next value
    y_true = y_test
    if len(y_true.shape) > 1:
        y_true = y_true.flatten()
    
    X_numpy = X_test
    
    # Assuming target variable is the last feature
    y_pred = []
    for i in range(len(X_numpy)):
        last_values = X_numpy[i, -window:, -1]  # Last 'window' timesteps, last feature
        pred = np.mean(last_values)
        y_pred.append(pred)
    
    y_pred = np.array(y_pred)
    return calculate_metrics(y_true, y_pred)

def linear_regression_baseline(X_train, y_train, X_test, y_test):
    """Linear regression baseline using flattened features"""
    # Flatten the sequence data
    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)
    
    y_train_flat = y_train.flatten()
    y_test_flat = y_test.flatten()
    
    # Train linear regression
    lr = LinearRegression()
    lr.fit(X_train_flat, y_train_flat)
    
    # Predict
    y_pred = lr.predict(X_test_flat)
    
    return calculate_metrics(y_test_flat, y_pred)

# def random_forest_baseline(X_train, y_train, X_test, y_test, n_estimators=100):
#     """Random Forest baseline using flattened features"""
#     # Flatten the sequence data
#     X_train_flat = X_train.reshape(X_train.shape[0], -1)
#     X_test_flat = X_test.reshape(X_test.shape[0], -1)
    
#     y_train_flat = y_train.flatten()
#     y_test_flat = y_test.flatten()
    
#     # Train random forest
#     rf = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
#     rf.fit(X_train_flat, y_train_flat)
    
#     # Predict
#     y_pred = rf.predict(X_test_flat)
    
#     return calculate_metrics(y_test_flat, y_pred)

def run_all_baselines(X_train, y_train, X_test, y_test):
    """Run all baseline models and compare results"""
    
    print("=" * 60)
    print("BASELINE MODEL COMPARISON")
    print("=" * 60)
    
    # Your CNN-GRU-LSTM results for comparison
    cnn_gru_lstm_results = {
        'MSE': 0.0951,
        'RMSE': 0.3084,
        'MAE': 0.2349
    }
    
    results = {}
    
    # 1. Naive Baseline
    print("\n1. Naive Baseline (Last Value)...")
    try:
        mse, rmse, mae = naive_baseline(X_test, y_test)
        results['Naive'] = {'MSE': mse, 'RMSE': rmse, 'MAE': mae}
        print(f"   MSE: {mse:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")
    except Exception as e:
        print(f"   Error: {e}")
    
    # 2. Moving Average
    print("\n2. Moving Average Baseline (window=3)...")
    try:
        mse, rmse, mae = moving_average_baseline(X_test, y_test, window=3)
        results['Moving Avg'] = {'MSE': mse, 'RMSE': rmse, 'MAE': mae}
        print(f"   MSE: {mse:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")
    except Exception as e:
        print(f"   Error: {e}")
    
    # 3. Seasonal Naive
    print("\n3. Seasonal Naive Baseline (24-hour cycle)...")
    try:
        mse, rmse, mae = seasonal_naive_baseline(X_test, y_test, season_length=24)
        results['Seasonal Naive'] = {'MSE': mse, 'RMSE': rmse, 'MAE': mae}
        print(f"   MSE: {mse:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")
    except Exception as e:
        print(f"   Error: {e}")
    
    # 4. Linear Regression
    print("\n4. Linear Regression Baseline...")
    try:
        mse, rmse, mae = linear_regression_baseline(X_train, y_train, X_test, y_test)
        results['Linear Regression'] = {'MSE': mse, 'RMSE': rmse, 'MAE': mae}
        print(f"   MSE: {mse:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")
    except Exception as e:
        print(f"   Error: {e}")
    
    # # 5. Random Forest
    # print("\n5. Random Forest Baseline...")
    # try:
    #     mse, rmse, mae = random_forest_baseline(X_train, y_train, X_test, y_test)
    #     results['Random Forest'] = {'MSE': mse, 'RMSE': rmse, 'MAE': mae}
    #     print(f"   MSE: {mse:.4f} | RMSE: {rmse:.4f} | MAE: {mae:.4f}")
    # except Exception as e:
    #     print(f"   Error: {e}")
    
    # Comparison Summary
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    
    print(f"{'Model':<20} {'MSE':<10} {'RMSE':<10} {'MAE':<10}")
    print("-" * 50)
    
    # Add CNN-GRU-LSTM results
    print(f"{'CNN-GRU-LSTM':<20} {cnn_gru_lstm_results['MSE']:<10.4f} {cnn_gru_lstm_results['RMSE']:<10.4f} {cnn_gru_lstm_results['MAE']:<10.4f}")
    
    # Add baseline results
    for model_name, metrics in results.items():
        print(f"{model_name:<20} {metrics['MSE']:<10.4f} {metrics['RMSE']:<10.4f} {metrics['MAE']:<10.4f}")
    
    # Calculate improvements
    print("\n" + "=" * 60)
    print("IMPROVEMENT ANALYSIS")
    print("=" * 60)
    
    for model_name, metrics in results.items():
        rmse_improvement = ((metrics['RMSE'] - cnn_gru_lstm_results['RMSE']) / metrics['RMSE']) * 100
        mae_improvement = ((metrics['MAE'] - cnn_gru_lstm_results['MAE']) / metrics['MAE']) * 100
        
        print(f"\nCNN-GRU-LSTM vs {model_name}:")
        print(f"  RMSE improvement: {rmse_improvement:+.2f}%")
        print(f"  MAE improvement: {mae_improvement:+.2f}%")
    
    return results

# Example usage:
if __name__ == "__main__":
    # Load your data from .npy files
    print("Loading data from .npy files...")
    
    try:
        # Load train data
        X_train = np.load('data/X_train.npy')
        y_train = np.load('data/y_train.npy')
        
        # Load test data  
        X_test = np.load('data/X_test.npy')
        y_test = np.load('data/y_test.npy')
        
        print(f"Train data shape: {X_train.shape}, {y_train.shape}")
        print(f"Test data shape: {X_test.shape}, {y_test.shape}")
        
        # Run baseline comparison
        baseline_results = run_all_baselines(X_train, y_train, X_test, y_test)
        
    except FileNotFoundError as e:
        print(f"Data files not found: {e}")
        print("Please modify the file paths to match your data location.")
        print("Common locations to check:")
        print("- data/X_train.npy, data/y_train.npy")
        print("- data/X_test.npy, data/y_test.npy") 
        print("- ./X_train.npy, ./y_train.npy (current directory)")
        print("- processed_data/X_train.npy, etc.")
        print("\nExpected data format:")
        print("- X_train: [samples, timesteps, features]")
        print("- y_train: [samples, 1] or [samples]")
        print("- X_test: [samples, timesteps, features]") 
        print("- y_test: [samples, 1] or [samples]")

Loading data from .npy files...
Train data shape: (22750, 7, 13), (22750,)
Test data shape: (4876, 7, 13), (4876,)
BASELINE MODEL COMPARISON

1. Naive Baseline (Last Value)...
   MSE: 0.8182 | RMSE: 0.9045 | MAE: 0.7579

2. Moving Average Baseline (window=3)...
   MSE: 0.8224 | RMSE: 0.9068 | MAE: 0.7592

3. Seasonal Naive Baseline (24-hour cycle)...
   MSE: 0.5641 | RMSE: 0.7511 | MAE: 0.5930

4. Linear Regression Baseline...
   MSE: 2.3434 | RMSE: 1.5308 | MAE: 0.3115

RESULTS SUMMARY
Model                MSE        RMSE       MAE       
--------------------------------------------------
CNN-GRU-LSTM         0.0951     0.3084     0.2349    
Naive                0.8182     0.9045     0.7579    
Moving Avg           0.8224     0.9068     0.7592    
Seasonal Naive       0.5641     0.7511     0.5930    
Linear Regression    2.3434     1.5308     0.3115    

IMPROVEMENT ANALYSIS

CNN-GRU-LSTM vs Naive:
  RMSE improvement: +65.90%
  MAE improvement: +69.01%

CNN-GRU-LSTM vs Moving Avg:
  R