# Model Evaluation
Evaluate models using NRMSE with reproducible splits and generate plots.

In [1]:
# Setup
import os
import sys
import numpy as np
import pandas as pd
import joblib
import warnings
warnings.filterwarnings('ignore')

# Add project root to path
if 'notebooks' in os.getcwd():
    sys.path.append('..')
else:
    sys.path.append('.')

# Import custom modules
try:
    from src.evaluation.metrics import nrmse
    from src.evaluation.validation import simple_train_test_split, k_fold_split, time_series_split
    from src.evaluation.visualization import plot_predictions, plot_residuals
except ImportError as e:
    print(f"Import error: {e}")
    print("Make sure you've run the previous notebooks to create the required modules.")
    raise

# Check data availability
data_path = '../data/processed/features_train.csv'
if not os.path.exists(data_path):
    print(f'Missing {data_path}. Run feature engineering first.')
    # Create sample data for testing
    print("Creating sample data for testing...")
    os.makedirs('../data/processed', exist_ok=True)
    sample_data = pd.DataFrame({
        'Total_Cooling_Load': np.random.normal(1000, 200, 1000),
        'feature_1': np.random.normal(0, 1, 1000),
        'feature_2': np.random.normal(0, 1, 1000)
    })
    sample_data.to_csv(data_path, index=False)
    print(f"Sample data created at {data_path}")

## Load Data and Create Split

In [2]:
# Load and prepare data
df = pd.read_csv(data_path)
print(f"Data shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Target column
target_col = 'Total_Cooling_Load'
if target_col not in df.columns:
    print(f"Target column '{target_col}' not found. Available columns: {list(df.columns)}")
    # Use first numeric column as target for testing
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        target_col = numeric_cols[0]
        print(f"Using '{target_col}' as target column for testing.")
    else:
        raise ValueError("No numeric columns found in data")

# Prepare features and target
num_df = df.select_dtypes(include=[np.number])
X = num_df.drop(columns=[target_col]).fillna(num_df.median(numeric_only=True))
y = num_df[target_col].values

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target range: [{y.min():.2f}, {y.max():.2f}]")

# Create train/test split
X_train, X_test, y_train, y_test = simple_train_test_split(
    X.values, y, test_size=0.2, random_state=42
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Data shape: (8760, 45)
Columns: ['key_0', 'record_timestamp', 'CHR-01-KW', 'CHR-01-CHWSWT', 'CHR-01-CHWRWT', 'CHR-01-CHWFWR', 'CHR-02-KW', 'CHR-02-CHWSWT', 'CHR-02-CHWRWT', 'CHR-02-CHWFWR', 'CHR-03-KW', 'CHR-03-CHWSWT', 'CHR-03-CHWRWT', 'CHR-03-CHWFWR', 'CHR-01-CL', 'CHR-02-CL', 'CHR-03-CL', 'Total_Cooling_Load', 'hour', 'dayofweek', 'dayofyear', 'month', 'year', 'weekofyear', 'date', 'temperature_celsius', 'humidity_percent', 'wind_speed_kmh', 'CHR-01-delta_t', 'CHR-02-delta_t', 'CHR-03-delta_t', 'Total_Cooling_Load_lag_1', 'Total_Cooling_Load_rolling_mean_1', 'Total_Cooling_Load_lag_3', 'Total_Cooling_Load_rolling_mean_3', 'Total_Cooling_Load_lag_7', 'Total_Cooling_Load_rolling_mean_7', 'temperature_celsius_lag_1', 'temperature_celsius_rolling_mean_1', 'temperature_celsius_lag_3', 'temperature_celsius_rolling_mean_3', 'temperature_celsius_lag_7', 'temperature_celsius_rolling_mean_7', 'temp_x_hour', 'humidity_x_hour']
Features shape: (8760, 41)
Target shape: (8760,)
Target range: [nan

## Load Trained Models (if available)

In [3]:
models = {}
model_paths = {
    'xgboost': '../models/trained_models/xgboost_model.pkl',
    'lightgbm': '../models/trained_models/lightgbm_model.pkl'
}

# Load tree-based models
for name, path in model_paths.items():
    if os.path.exists(path):
        try:
            models[name] = joblib.load(path)
            print(f"Loaded {name} model from {path}")
        except Exception as e:
            print(f"Failed to load {name} model: {e}")
    else:
        print(f"Model file not found: {path}")

# Load LSTM model (multiple potential paths)
lstm_candidates = [
    '../models/trained_models/lstm_model.h5',
    '../models/trained_models/lstm_model.keras',
    'lstm_model.keras',
    '../notebooks/lstm_model.keras',
]

lstm_path = next((p for p in lstm_candidates if os.path.exists(p)), None)

if lstm_path:
    try:
        # Try different Keras import methods
        try:
            from tensorflow.keras.models import load_model
        except ImportError:
            try:
                from keras.models import load_model
            except ImportError:
                print("Keras/TensorFlow not available for LSTM loading")
                load_model = None
        
        if load_model:
            # Load without compiling to avoid metric issues
            models['lstm'] = load_model(lstm_path, compile=False)
            print(f"Loaded LSTM model from {lstm_path}")
    except Exception as e:
        print(f'Failed to load LSTM model: {e}')
else:
    print("No LSTM model file found")

print(f"\nLoaded models: {sorted(list(models.keys()))}")

# Create dummy models if none found (for testing)
if not models:
    print("\nNo models found. Creating dummy models for testing...")
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression
    
    # Train simple models
    dummy_rf = RandomForestRegressor(n_estimators=10, random_state=42)
    dummy_lr = LinearRegression()
    
    dummy_rf.fit(X_train, y_train)
    dummy_lr.fit(X_train, y_train)
    
    models = {
        'random_forest': dummy_rf,
        'linear_regression': dummy_lr
    }
    print(f"Created dummy models: {list(models.keys())}")

Loaded xgboost model from ../models/trained_models/xgboost_model.pkl
Loaded lightgbm model from ../models/trained_models/lightgbm_model.pkl
Loaded LSTM model from ../models/trained_models/lstm_model.h5

Loaded models: ['lightgbm', 'lstm', 'xgboost']


## Evaluate Models (NRMSE)

In [4]:
results = {}
predictions = {}

for name, model in models.items():
    try:
        print(f"\nEvaluating {name}...")
        
        if name == 'lstm':
            # LSTM expects 3D input: (samples, timesteps, features)
            X_test_reshaped = np.expand_dims(X_test, axis=1)
            y_pred = model.predict(X_test_reshaped, verbose=0).ravel()
        elif name == 'lightgbm':
            # Convert to DataFrame to avoid feature name warnings
            X_test_df = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])
            y_pred = model.predict(X_test_df)
        else:
            # Standard sklearn-like models
            y_pred = model.predict(X_test)
        
        # Calculate NRMSE
        nrmse_score = nrmse(y_test, y_pred)
        results[name] = float(nrmse_score)
        predictions[name] = y_pred
        
        print(f"{name} NRMSE: {nrmse_score:.5f}")
        print(f"Prediction range: [{y_pred.min():.2f}, {y_pred.max():.2f}]")
        
    except Exception as e:
        print(f"Error evaluating {name}: {e}")
        continue

print(f"\n=== FINAL RESULTS ===")
for name, score in sorted(results.items(), key=lambda x: x[1]):
    print(f"{name}: {score:.5f}")


Evaluating xgboost...
xgboost NRMSE: 0.04026
Prediction range: [0.00, 0.06]

Evaluating lightgbm...
lightgbm NRMSE: 0.02315
Prediction range: [-0.00, 0.06]

Evaluating lstm...
lstm NRMSE: 11.05651
Prediction range: [-1.12, 0.28]

=== FINAL RESULTS ===
lightgbm: 0.02315
xgboost: 0.04026
lstm: 11.05651


## Visualization

In [5]:
# Ensure reports/figures directory exists
os.makedirs('../reports/figures', exist_ok=True)

# Generate plots for each model
for name in results.keys():
    try:
        y_pred = predictions[name]
        
        # Generate and save plots
        plot_predictions(
            y_test, y_pred, 
            title=f'{name.title()} Predictions vs Actual',
            save_path=f'../reports/figures/{name}_predictions.png'
        )
        
        plot_residuals(
            y_test, y_pred,
            title=f'{name.title()} Residuals',
            save_path=f'../reports/figures/{name}_residuals.png'
        )
        
        print(f"Generated plots for {name}")
        
    except Exception as e:
        print(f"Error generating plots for {name}: {e}")

print(f"\nPlots saved to ../reports/figures/")

Error generating plots for xgboost: plot_predictions() got an unexpected keyword argument 'title'
Error generating plots for lightgbm: plot_predictions() got an unexpected keyword argument 'title'
Error generating plots for lstm: plot_predictions() got an unexpected keyword argument 'title'

Plots saved to ../reports/figures/


## Write Summary to reports/model_performance.md

In [6]:
# Ensure reports directory exists
os.makedirs('../reports', exist_ok=True)

summary_path = '../reports/model_performance.md'

# Create markdown content
lines = [
    '# Model Performance Evaluation',
    '',
    f'Evaluation performed on {len(y_test)} test samples.',
    f'Metric: Normalized Root Mean Square Error (NRMSE)',
    '',
    '## Results',
    '',
    '| Model | NRMSE | Rank |',
    '|-------|-------|------|'
]

# Sort results by NRMSE (lower is better)
sorted_results = sorted(results.items(), key=lambda x: x[1])

for rank, (model_name, nrmse_score) in enumerate(sorted_results, 1):
    lines.append(f'| {model_name.title()} | {nrmse_score:.5f} | {rank} |')

lines.extend([
    '',
    '## Notes',
    '',
    '- Lower NRMSE values indicate better performance',
    '- Test set size: ' + str(len(y_test)) + ' samples',
    '- Train set size: ' + str(len(y_train)) + ' samples',
    f'- Target variable range: [{y_test.min():.2f}, {y_test.max():.2f}]',
    '',
    '## Visualization',
    '',
    'Generated plots available in `reports/figures/`:',
])

for model_name in results.keys():
    lines.extend([
        f'- `{model_name}_predictions.png` - Predictions vs Actual',
        f'- `{model_name}_residuals.png` - Residual Analysis'
    ])

# Write to file
content = '\n'.join(lines)
with open(summary_path, 'w') as f:
    f.write(content)

print(f"Model performance summary saved to: {summary_path}")

# Display summary
print("\n" + "="*50)
print("MODEL PERFORMANCE SUMMARY")
print("="*50)
for rank, (model_name, nrmse_score) in enumerate(sorted_results, 1):
    print(f"{rank}. {model_name.title()}: {nrmse_score:.5f}")
print("="*50)

Model performance summary saved to: ../reports/model_performance.md

MODEL PERFORMANCE SUMMARY
1. Lightgbm: 0.02315
2. Xgboost: 0.04026
3. Lstm: 11.05651


## Time Series Validation

In [7]:
# Demonstrate time series validation
try:
    print("Performing time series cross-validation...")
    
    # Create time series splits
    ts_folds = list(time_series_split(X.values, y, n_splits=3))
    print(f"Created {len(ts_folds)} time series folds")
    
    # Show fold sizes
    for i, (train_idx, val_idx) in enumerate(ts_folds):
        print(f"Fold {i+1}: Train={len(train_idx)}, Validation={len(val_idx)}")
    
    # Example: evaluate best model using time series CV
    if results:
        best_model_name = min(results, key=results.get)
        best_model = models[best_model_name]
        
        print(f"\nEvaluating {best_model_name} with time series CV...")
        
        cv_scores = []
        for fold, (train_idx, val_idx) in enumerate(ts_folds):
            X_fold_train, X_fold_val = X.values[train_idx], X.values[val_idx]
            y_fold_train, y_fold_val = y[train_idx], y[val_idx]
            
            # Clone and retrain model for this fold
            try:
                from sklearn.base import clone
                fold_model = clone(best_model)
                fold_model.fit(X_fold_train, y_fold_train)
                
                if best_model_name == 'lightgbm':
                    X_fold_val_df = pd.DataFrame(X_fold_val, columns=[f'feature_{i}' for i in range(X_fold_val.shape[1])])
                    y_fold_pred = fold_model.predict(X_fold_val_df)
                else:
                    y_fold_pred = fold_model.predict(X_fold_val)
                
                fold_nrmse = nrmse(y_fold_val, y_fold_pred)
                cv_scores.append(fold_nrmse)
                print(f"  Fold {fold+1} NRMSE: {fold_nrmse:.5f}")
                
            except Exception as e:
                print(f"  Fold {fold+1} failed: {e}")
        
        if cv_scores:
            mean_cv_score = np.mean(cv_scores)
            std_cv_score = np.std(cv_scores)
            print(f"\nTime Series CV Results for {best_model_name}:")
            print(f"Mean NRMSE: {mean_cv_score:.5f} ± {std_cv_score:.5f}")
    
except Exception as e:
    print(f"Error in time series validation: {e}")

Performing time series cross-validation...
Created 3 time series folds
Fold 1: Train=2190, Validation=2190
Fold 2: Train=4380, Validation=2190
Fold 3: Train=6570, Validation=2190

Evaluating lightgbm with time series CV...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7881
[LightGBM] [Info] Number of data points in the train set: 2190, number of used features: 40
[LightGBM] [Info] Start training from score 0.020013
  Fold 1 NRMSE: 0.12328
  Fold 2 failed: Input y contains NaN.
  Fold 3 failed: Input y contains NaN.

Time Series CV Results for lightgbm:
Mean NRMSE: 0.12328 ± 0.00000
