# Notebook 3: AI Modelling (FIXED Feature Engineering)

## Introduction

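This notebook loads the pre-processed data produced by Notebook 1, adds a properly implemented set of enhanced temporal features, and trains Random Forest (RF) and LSTM models for forecast horizons of 1, 3, 6, 12, and 24 hours.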

**CRITICAL FIX**: The previous version had incomplete feature engineering that would still cause straight-line predictions.

In [None]:
# Mount Google Drive
# Colab-only setup cell: mounts Drive, makes sure the project folder exists,
# and makes it the working directory for the rest of the notebook.
from google.colab import drive
import os

# Mount your Google Drive
drive.mount('/content/drive')

# Define your project folder in Google Drive
# NOTE(review): this path uses 'My Drive' (with a space) while the data-loading
# and model-saving cells use '/content/drive/MyDrive/...' — confirm both resolve
# to the same folder in the target Colab runtime.
your_project_path = '/content/drive/My Drive/AI_Sustainability_Project_lsa'

# Create the project directory if it doesn't exist
os.makedirs(your_project_path, exist_ok=True)
print(f"Project path set to: {your_project_path}")

# Change current working directory to your project path
# (IPython magic; the quotes are needed because the path contains a space)
%cd "{your_project_path}"

# Verify current working directory
# (shell escapes: print the working directory and list its contents)
!pwd
!ls

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the pre-processed hourly PM2.5 + weather CSV into `df`
input_data_path = '/content/drive/MyDrive/AI_Sustainability_Project_lsa/sensor_12178556_Singapore_pm25_weather_hourly_data_processed_final.csv'

print("--- Starting AI Modelling (Notebook 3 - FIXED) ---")
print(f"Loading pre-processed data from: {input_data_path}")

try:
    # The 'timestamp' column becomes the (parsed) index of the frame.
    df = pd.read_csv(
        input_data_path,
        index_col='timestamp',
        parse_dates=True,
    )
    # Quick sanity report; a missing 'pm25_value' column is treated as a load failure too.
    print(f"Data loaded successfully. Initial shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print(f"PM2.5 variance in original data: {df['pm25_value'].var():.4f}")
    print(f"PM2.5 range: {df['pm25_value'].min():.2f} to {df['pm25_value'].max():.2f}")
except Exception as load_error:
    # Nothing downstream can run without the data, so stop the notebook here.
    print(f"Error loading data: {load_error}")
    raise SystemExit("Failed to load data")

In [None]:
# FIXED FEATURE ENGINEERING - COMPLETE IMPLEMENTATION
def create_comprehensive_features(data_df):
    """
    Build the temporal feature matrix used by the PM2.5 forecasting models.

    Adds lag, difference, percentage-change, rolling, EMA, cyclical-time,
    interaction, turning-point, relative-position and hour-category features
    to a copy of ``data_df``, then drops every row that still contains a NaN.

    Every feature at row ``t`` is derived only from rows ``t`` and earlier.
    In particular the ``is_pm25_peak`` / ``is_pm25_valley`` flags are
    lag-confirmed: the flag at ``t`` is 1 when the reading at ``t-1`` was a
    local extremum relative to ``t-2`` and ``t``. Comparing against ``t+1``
    would leak the very value the 1-hour-ahead model is asked to predict and
    could not be computed at inference time.

    Args:
        data_df: DataFrame indexed by a DatetimeIndex (``.hour``,
            ``.dayofweek`` and ``.month`` are read from the index). Must
            contain ``pm25_value``; ``temp``, ``humidity``, ``wind_speed`` and
            ``precipitation`` are used when present.

    Returns:
        A new DataFrame holding the original columns followed by the
        engineered ones, restricted to rows without NaN. ``data_df`` itself is
        not modified.

    Raises:
        KeyError: if ``pm25_value`` is not a column of ``data_df``.
    """
    print("Creating comprehensive temporal features...")
    df_featured = data_df.copy()

    # Coerce the raw measurement columns to numeric dtypes; unparsable entries
    # become NaN and are removed by the dropna further down.
    base_columns = ['pm25_value', 'temp', 'humidity', 'wind_speed', 'precipitation']
    present_columns = [col for col in base_columns if col in df_featured.columns]
    for col in present_columns:
        df_featured[col] = pd.to_numeric(df_featured[col], errors='coerce')

    # The target series is read many times below; its values never change.
    pm25 = df_featured['pm25_value']
    weather_columns = [col for col in ('temp', 'humidity', 'wind_speed')
                       if col in df_featured.columns]

    # 1. CRITICAL: Lag features with diverse time horizons
    print("Adding lag features...")
    lags = [1, 2, 3, 6, 12, 24, 48, 72]
    for feature in present_columns:
        for lag in lags:
            df_featured[f'{feature}_lag_{lag}'] = df_featured[feature].shift(lag)

    # 2. CRITICAL: Difference and trend features (captures change patterns)
    print("Adding trend and difference features...")
    for hours in (1, 3, 6, 12, 24):
        df_featured[f'pm25_diff_{hours}h'] = pm25.diff(hours)

    # Rate of change (percentage change); a zero denominator yields +/-inf,
    # which the cleanup step turns into NaN.
    for hours in (1, 6, 24):
        df_featured[f'pm25_pct_change_{hours}h'] = pm25.pct_change(hours)

    # Weather trends (using same naming convention as evaluation)
    for col in weather_columns:
        df_featured[f'{col}_diff_6h'] = df_featured[col].diff(6)

    # 3. Rolling statistics with proper min_periods
    print("Adding rolling statistics...")
    windows = [3, 6, 12, 24, 48]
    for window in windows:
        # Require at least a third of the window (never fewer than 2 points,
        # so that the rolling std is defined).
        min_periods = max(2, window // 3)

        pm25_rolling = pm25.rolling(window=window, min_periods=min_periods)
        df_featured[f'pm25_mean_{window}h'] = pm25_rolling.mean()
        df_featured[f'pm25_std_{window}h'] = pm25_rolling.std()
        df_featured[f'pm25_min_{window}h'] = pm25_rolling.min()
        df_featured[f'pm25_max_{window}h'] = pm25_rolling.max()

        for col in weather_columns:
            df_featured[f'{col}_mean_{window}h'] = (
                df_featured[col].rolling(window=window, min_periods=min_periods).mean()
            )

    # 4. Volatility and variability measures
    print("Adding volatility features...")
    df_featured['pm25_volatility_12h'] = pm25.rolling(window=12, min_periods=6).std()
    df_featured['pm25_volatility_24h'] = pm25.rolling(window=24, min_periods=12).std()
    for col in ('temp', 'humidity'):
        if col in df_featured.columns:
            df_featured[f'{col}_volatility_12h'] = (
                df_featured[col].rolling(window=12, min_periods=6).std()
            )

    # 5. Exponential moving averages (trend following)
    print("Adding exponential moving averages...")
    df_featured['pm25_ema_6h'] = pm25.ewm(span=6, adjust=False).mean()
    df_featured['pm25_ema_24h'] = pm25.ewm(span=24, adjust=False).mean()
    if 'temp' in df_featured.columns:
        df_featured['temp_ema_12h'] = df_featured['temp'].ewm(span=12, adjust=False).mean()

    # 6. Enhanced cyclical encoding
    print("Adding cyclical time features...")
    df_featured['hour_sin'] = np.sin(2 * np.pi * df_featured.index.hour / 24)
    df_featured['hour_cos'] = np.cos(2 * np.pi * df_featured.index.hour / 24)
    df_featured['day_of_week_sin'] = np.sin(2 * np.pi * df_featured.index.dayofweek / 7)
    df_featured['day_of_week_cos'] = np.cos(2 * np.pi * df_featured.index.dayofweek / 7)
    df_featured['month_sin'] = np.sin(2 * np.pi * df_featured.index.month / 12)
    df_featured['month_cos'] = np.cos(2 * np.pi * df_featured.index.month / 12)

    # 7. Interaction features
    print("Adding interaction features...")
    if 'wind_speed' in df_featured.columns and 'humidity' in df_featured.columns:
        df_featured['wind_humidity_interaction'] = df_featured['wind_speed'] * df_featured['humidity']
    if 'temp' in df_featured.columns and 'humidity' in df_featured.columns:
        df_featured['temp_humidity_interaction'] = df_featured['temp'] * df_featured['humidity']
    if 'wind_speed' in df_featured.columns and 'temp' in df_featured.columns:
        df_featured['wind_temp_interaction'] = df_featured['wind_speed'] * df_featured['temp']

    # 8. Peak and valley detection (lag-confirmed, no look-ahead)
    print("Adding peak detection features...")
    # A turning point at t-1 can only be confirmed once the reading at t is
    # known, so the flag is placed on row t and uses rows t-2, t-1 and t only.
    previous = pm25.shift(1)
    before_previous = pm25.shift(2)
    df_featured['is_pm25_peak'] = ((previous > before_previous) & (previous > pm25)).astype(int)
    df_featured['is_pm25_valley'] = ((previous < before_previous) & (previous < pm25)).astype(int)

    # 9. Relative position features
    print("Adding relative position features...")
    # Position of the current reading inside the trailing 24h min/max band;
    # the 1e-8 term keeps the ratio finite when the band has zero width.
    pm25_24h_min = pm25.rolling(window=24, min_periods=12).min()
    pm25_24h_max = pm25.rolling(window=24, min_periods=12).max()
    df_featured['pm25_relative_position'] = (pm25 - pm25_24h_min) / (pm25_24h_max - pm25_24h_min + 1e-8)

    # 10. Hour category encoding (consistent naming)
    print("Adding categorical time features...")
    # pd.cut uses right-closed intervals here: [0, 6], (6, 12], (12, 18], (18, 24].
    hour_bins = [0, 6, 12, 18, 24]
    hour_labels = ['night', 'morning', 'afternoon', 'evening']
    df_featured['hour_category'] = pd.cut(
        df_featured.index.hour, bins=hour_bins, labels=hour_labels, include_lowest=True
    )

    # One-hot encode with consistent naming; the categorical dtype guarantees
    # all four hour_cat_* columns exist even if a category never occurs.
    hour_dummies = pd.get_dummies(df_featured['hour_category'], prefix='hour_cat', dtype=float)
    df_featured = pd.concat([df_featured, hour_dummies], axis=1)
    df_featured = df_featured.drop(columns='hour_category')

    # 11. Clean up infinite and missing values
    print("Cleaning data...")
    df_featured = df_featured.replace([np.inf, -np.inf], np.nan)

    # Bookkeeping for the summary printed below.
    initial_shape = df_featured.shape[0]
    initial_nans = df_featured.isnull().sum().sum()

    # Drop rows with NaNs (warm-up rows of the lags/rolling windows, gaps, infs)
    df_featured.dropna(inplace=True)

    # 12. CRITICAL: Ensure all columns are numeric
    print("Ensuring all features are numeric...")
    for col in df_featured.columns:
        if col != 'pm25_value':  # Keep target as is
            df_featured[col] = pd.to_numeric(df_featured[col], errors='coerce')

    # Final cleanup of any remaining NaNs introduced by conversion
    df_featured.dropna(inplace=True)
    final_final_shape = df_featured.shape[0]

    print(f"Feature engineering complete:")
    print(f"- Initial rows: {initial_shape}, Final rows: {final_final_shape}")
    print(f"- Rows dropped: {initial_shape - final_final_shape}")
    print(f"- Initial NaNs: {initial_nans}")
    print(f"- Features created: {len(df_featured.columns) - len(data_df.columns)}")
    print(f"- PM2.5 variance after features: {df_featured['pm25_value'].var():.4f}")

    # Verify all columns are numeric
    non_numeric = df_featured.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric:
        print(f"⚠️  WARNING: Non-numeric columns detected: {non_numeric}")
        for col in non_numeric:
            if col != 'pm25_value':
                df_featured[col] = pd.to_numeric(df_featured[col], errors='coerce')
        df_featured.dropna(inplace=True)
        print(f"✅ Converted to numeric. Final shape: {df_featured.shape}")
    else:
        print("✅ All features are numeric")

    return df_featured

In [None]:
# Run the feature pipeline on the loaded frame and report the resulting size
print("\n--- Creating Comprehensive Features ---")
df_featured = create_comprehensive_features(df)
print(f"\nFinal shape after feature engineering: {df_featured.shape}")
# shape[1] is the column count, i.e. target plus every feature column
print(f"Total features: {df_featured.shape[1]}")

In [None]:
# Chronological 80/20 split: the earliest rows train, the latest rows test
print("\n--- Performing Chronological Train/Test Split ---")
train_size = int(0.8 * len(df_featured))
# Independent copies so later in-place scaling never touches df_featured.
train_df, test_df = (
    df_featured.iloc[:train_size].copy(),
    df_featured.iloc[train_size:].copy(),
)

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
print(f"Train PM2.5 variance: {train_df['pm25_value'].var():.4f}")
print(f"Test PM2.5 variance: {test_df['pm25_value'].var():.4f}")

# Model inputs: every column except the raw target and any target-derived column
features_for_scaling = [
    col
    for col in train_df.columns
    if not (col == 'pm25_value' or 'target' in col)
]
print(f"Number of features for modeling: {len(features_for_scaling)}")

In [None]:
# Standardise the feature columns using statistics from the training split only
print("\n--- Scaling Features ---")
scaler_x = StandardScaler()  # StandardScaler often works better than MinMaxScaler for complex features
# Fit on train, then apply the same transform to both splits.
scaler_x.fit(train_df[features_for_scaling])
train_df[features_for_scaling] = scaler_x.transform(train_df[features_for_scaling])
test_df[features_for_scaling] = scaler_x.transform(test_df[features_for_scaling])

# Persist the fitted scaler so other notebooks can apply the identical transform.
joblib.dump(scaler_x, '/content/drive/MyDrive/AI_Sustainability_Project_lsa/scaler_x.pkl')
print("Features scaled and scaler saved.")

In [None]:
# Model training loop
# For each forecast horizon this cell builds the shifted target, tunes and saves
# a Random Forest, trains and saves an LSTM, and persists the target scaler and
# feature-name list alongside the models.
horizons = [1, 3, 6, 12, 24]
print(f"\n--- Training Models for Horizons: {horizons} ---")

for h in horizons:
    print(f"\n=== Processing Horizon: {h} hours ===")
    
    # Create target
    # The target is pm25_value h ROWS ahead, computed separately per split so
    # train targets never reach into the test period.
    # NOTE(review): this equals "h hours ahead" only if the index is a gap-free
    # hourly series after feature engineering dropped rows — confirm.
    train_df['target_h'] = train_df['pm25_value'].shift(-h)
    test_df['target_h'] = test_df['pm25_value'].shift(-h)
    
    # Drop NaN targets
    # (the last h rows of each split have no future value to predict)
    train_h = train_df.dropna(subset=['target_h'])
    test_h = test_df.dropna(subset=['target_h'])
    
    X_train = train_h[features_for_scaling]
    y_train = train_h['target_h']
    # X_test / y_test are prepared and cleaned below but not consumed further in this cell.
    X_test = test_h[features_for_scaling]
    y_test = test_h['target_h']
    
    print(f"Training shapes: X_train {X_train.shape}, y_train {y_train.shape}")
    print(f"Target variance: {y_train.var():.4f} (good if > 1.0)")
    print(f"Target range: {y_train.min():.2f} to {y_train.max():.2f}")
    
    # CRITICAL: Verify data types before training
    print("Verifying data types...")
    non_numeric_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()
    if non_numeric_features:
        print(f"⚠️  Converting non-numeric features: {non_numeric_features}")
        for col in non_numeric_features:
            X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
            X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
        # Drop any rows with NaN after conversion
        # and re-align the targets to the surviving rows by index label
        X_train.dropna(inplace=True)
        y_train = y_train.loc[X_train.index]
        X_test.dropna(inplace=True)
        y_test = y_test.loc[X_test.index]
        print(f"✅ After cleanup: X_train {X_train.shape}, y_train {y_train.shape}")
    
    # Ensure all data is float32 for TensorFlow
    X_train = X_train.astype(np.float32)
    y_train = y_train.astype(np.float32)
    X_test = X_test.astype(np.float32)
    y_test = y_test.astype(np.float32)
    
    # Target scaling for LSTM
    # (fit on the training target only; the Random Forest below uses the unscaled y_train)
    scaler_y = StandardScaler()
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).flatten().astype(np.float32)
    joblib.dump(scaler_y, f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/scaler_y_h{h}.pkl')
    
    # Save feature names for evaluation consistency
    feature_names = X_train.columns.tolist()
    joblib.dump(feature_names, f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/feature_names_h{h}.pkl')
    print(f"✅ Saved {len(feature_names)} feature names for evaluation consistency")
    
    # Cross-validation setup
    # (5 ordered folds, so validation data always comes after its training data)
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Random Forest
    print(f"\nTraining Random Forest...")
    # Search space sampled by RandomizedSearchCV (12 candidates out of the grid)
    rf_params = {
        'n_estimators': [150, 200, 300],
        'max_depth': [15, 20, 25, None],
        'min_samples_leaf': [1, 2, 3],
        'max_features': ['sqrt', 'log2', 0.8]
    }
    
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    rf_search = RandomizedSearchCV(
        rf, rf_params, cv=tscv, scoring='neg_mean_squared_error', 
        n_iter=12, verbose=0, random_state=42
    )
    rf_search.fit(X_train, y_train)
    
    joblib.dump(rf_search.best_estimator_, f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/rf_model_h{h}.pkl')
    # best_score_ is a negated MSE, hence the sign flip before the square root
    print(f"RF Best RMSE: {np.sqrt(-rf_search.best_score_):.4f}")
    print(f"RF Best params: {rf_search.best_params_}")
    
    # LSTM with improved architecture and error handling
    print(f"\nTraining LSTM...")
    try:
        # Ensure data is properly shaped and typed for LSTM
        # Each sample is a sequence of length 1: (samples, 1, n_features)
        X_train_lstm = X_train.values.astype(np.float32).reshape(X_train.shape[0], 1, X_train.shape[1])
        
        print(f"LSTM input shape: {X_train_lstm.shape}")
        print(f"LSTM target shape: {y_train_scaled.shape}")
        print(f"Data types: X={X_train_lstm.dtype}, y={y_train_scaled.dtype}")
        
        # Clear any previous models
        tf.keras.backend.clear_session()
        
        # Three stacked LSTM layers (128 -> 64 -> 32 units), each followed by
        # batch normalisation and dropout, then a small dense regression head.
        model_lstm = Sequential([
            LSTM(128, return_sequences=True, input_shape=(1, X_train.shape[1])),
            BatchNormalization(),
            Dropout(0.3),
            LSTM(64, return_sequences=True),
            BatchNormalization(), 
            Dropout(0.3),
            LSTM(32),
            BatchNormalization(),
            Dropout(0.2),
            Dense(16, activation='relu'),
            Dense(1)
        ])
        
        model_lstm.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
            loss='mse',
            metrics=['mae']
        )
        
        # Stop after 20 epochs without val_loss improvement (restoring the best
        # weights) and halve the learning rate after 10 stagnant epochs.
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6)
        ]
        
        # The network is trained on the standardised target (y_train_scaled);
        # validation_split holds out 20% of the supplied samples for val_loss.
        history = model_lstm.fit(
            X_train_lstm, y_train_scaled,
            epochs=200,
            batch_size=64,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=1
        )
        
        model_lstm.save(f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/lstm_model_h{h}.keras')
        print(f"✅ LSTM trained successfully. Best val_loss: {min(history.history['val_loss']):.4f}")
        
        # Clean up for next iteration
        del X_train_lstm, model_lstm, history
        tf.keras.backend.clear_session()
        
    except Exception as e:
        # Best-effort: an LSTM failure must not abort the remaining horizons;
        # the Random Forest for this horizon has already been saved above.
        print(f"❌ LSTM training failed for horizon {h}h: {e}")
        print(f"Skipping LSTM for this horizon and continuing...")
    
    # Clean up target column for next iteration
    # (also keeps 'target_h' out of the frames used after the loop)
    train_df.drop('target_h', axis=1, inplace=True, errors='ignore')
    test_df.drop('target_h', axis=1, inplace=True, errors='ignore')

In [None]:
# Persist the (feature-scaled) train and test frames for the evaluation notebook
train_df.to_csv('/content/drive/MyDrive/AI_Sustainability_Project_lsa/train_featured_data.csv')
test_df.to_csv('/content/drive/MyDrive/AI_Sustainability_Project_lsa/test_featured_data.csv')

# Closing summary, emitted as one newline-separated print
print(
    "\n=== AI Modelling Complete ===",
    "✅ Enhanced models with comprehensive temporal features trained",
    "✅ This SHOULD resolve the straight-line prediction issue",
    "✅ Models now have rich temporal context and variability",
    "\nNext: Run Notebook 4 for evaluation - expect realistic PM2.5 predictions!",
    sep="\n",
)