# Notebook 3: AI Modelling (Improved Feature Engineering)

## Introduction

This notebook loads the processed data produced by Notebook 1, adds enhanced features, and trains Random Forest (RF) and LSTM models for forecast horizons of 1, 3, 6, 12, and 24 hours.

Justification: The enhanced temporal features (trends, differences, and a richer set of lags) are intended to capture PM2.5 dynamics more faithfully and to avoid flat, straight-line predictions.

In [None]:
# Mount Google Drive
# This cell mounts Drive, makes sure the project folder exists, and switches the
# notebook's working directory to it. The statements are order-dependent: the
# folder can only be created/entered after the mount has succeeded.
from google.colab import drive
import os

# Mount your Google Drive
drive.mount('/content/drive')

# Define your project folder in Google Drive
# NOTE(review): the other cells in this notebook address the same project folder
# as '/content/drive/MyDrive/...' (no space) — confirm both spellings resolve to
# the same directory in the Colab runtime being used.
your_project_path = '/content/drive/My Drive/AI_Sustainability_Project_lsa'

# Create the project directory if it doesn't exist
os.makedirs(your_project_path, exist_ok=True)
print(f"Project path set to: {your_project_path}")

# Change current working directory to your project path
# (IPython magic; the quotes keep the space in the path as part of one argument)
%cd "{your_project_path}"

# Verify current working directory
# (shell escapes: print the directory and list its contents)
!pwd
!ls

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')  # Suppress SettingWithCopyWarning for cleaner output

In [None]:
# Location of the processed CSV written by Notebook 1 (on the mounted Drive)
input_data_path = '/content/drive/MyDrive/AI_Sustainability_Project_lsa/sensor_12178556_Singapore_pm25_weather_hourly_data_processed_final.csv'

print("--- Starting AI Modelling (Notebook 3) ---")
print(f"Loading pre-processed data from: {input_data_path}")

try:
    # The 'timestamp' column becomes the index and is parsed as dates
    df = pd.read_csv(input_data_path, parse_dates=True, index_col='timestamp')
    print(f"Data loaded successfully. Initial shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
except Exception as load_error:
    # Nothing downstream can run without the data, so report and stop the notebook
    print(f"Error loading data from {input_data_path}: {load_error}")
    print("Please ensure Notebook 1 has run successfully and the file exists at the specified path in Google Drive.")
    raise SystemExit("Failed to load pre-processed data. Aborting Notebook 3 execution.")

In [None]:
# --- IMPROVED FEATURE ENGINEERING ---
def add_enhanced_features(data_df):
    """
    Build the enhanced feature set used for PM2.5 forecasting.

    Every feature at row t is computed only from row t and earlier rows, so the
    result can be used as model input without leaking future observations.

    Args:
        data_df: DataFrame indexed by a DatetimeIndex (the index's ``hour``,
            ``dayofweek`` and ``month`` are read) that contains the columns
            'pm25_value', 'temp', 'humidity', 'wind_speed' and 'precipitation'.
            The input is not modified.

    Returns:
        A new DataFrame holding the original columns plus the engineered
        features, with +/-inf replaced by NaN and every row containing a NaN
        removed. Because rows are removed, consecutive rows of the result are
        not guaranteed to be evenly spaced in time.
    """
    df_featured = data_df.copy()

    print("Adding enhanced temporal features...")

    # 1. Basic lag features with better selection
    lags = [1, 2, 3, 6, 12, 24, 48, 72]  # More comprehensive lag structure
    features_to_lag = ['pm25_value', 'temp', 'humidity', 'wind_speed', 'precipitation']

    for feature in features_to_lag:
        for lag in lags:
            df_featured[f'{feature}_lag_{lag}'] = df_featured[feature].shift(lag)

    # 2. Trend and difference features (value now minus value k hours ago)
    for span in (3, 6, 12, 24):
        df_featured[f'pm25_trend_{span}h'] = df_featured['pm25_value'].diff(span)

    for weather_col in ('temp', 'humidity', 'wind_speed'):
        df_featured[f'{weather_col}_trend_6h'] = df_featured[weather_col].diff(6)

    # 3. Enhanced rolling statistics with multiple windows
    windows = [3, 6, 12, 24, 48]

    for window in windows:
        # Require at least half a window of observations before emitting a value
        min_periods = max(1, window // 2)

        pm25_roll = df_featured['pm25_value'].rolling(window=window, min_periods=min_periods)
        df_featured[f'pm25_mean_{window}h'] = pm25_roll.mean()
        df_featured[f'pm25_std_{window}h'] = pm25_roll.std()
        df_featured[f'pm25_min_{window}h'] = pm25_roll.min()
        df_featured[f'pm25_max_{window}h'] = pm25_roll.max()

        for weather_col in ('temp', 'humidity', 'wind_speed'):
            df_featured[f'{weather_col}_mean_{window}h'] = (
                df_featured[weather_col].rolling(window=window, min_periods=min_periods).mean()
            )

    # 4. Enhanced cyclical encoding
    df_featured['hour_sin'] = np.sin(2 * np.pi * df_featured.index.hour / 24)
    df_featured['hour_cos'] = np.cos(2 * np.pi * df_featured.index.hour / 24)
    df_featured['day_of_week_sin'] = np.sin(2 * np.pi * df_featured.index.dayofweek / 7)
    df_featured['day_of_week_cos'] = np.cos(2 * np.pi * df_featured.index.dayofweek / 7)
    df_featured['month_sin'] = np.sin(2 * np.pi * df_featured.index.month / 12)
    df_featured['month_cos'] = np.cos(2 * np.pi * df_featured.index.month / 12)

    # 5. Interaction features
    df_featured['wind_humidity_interaction'] = df_featured['wind_speed'] * df_featured['humidity']
    df_featured['temp_humidity_interaction'] = df_featured['temp'] * df_featured['humidity']
    df_featured['wind_temp_interaction'] = df_featured['wind_speed'] * df_featured['temp']

    # 6. Volatility and rate of change features
    df_featured['pm25_volatility_24h'] = df_featured['pm25_value'].rolling(window=24, min_periods=12).std()
    df_featured['pm25_roc_1h'] = df_featured['pm25_value'].pct_change(periods=1)
    df_featured['pm25_roc_6h'] = df_featured['pm25_value'].pct_change(periods=6)
    df_featured['pm25_roc_24h'] = df_featured['pm25_value'].pct_change(periods=24)

    # 7. Weather variability
    df_featured['temp_variability_12h'] = df_featured['temp'].rolling(window=12, min_periods=6).std()
    df_featured['humidity_variability_12h'] = df_featured['humidity'].rolling(window=12, min_periods=6).std()
    df_featured['wind_variability_12h'] = df_featured['wind_speed'].rolling(window=12, min_periods=6).std()

    # 8. Categorical time features
    # right=False gives half-open buckets [0,6) night, [6,12) morning,
    # [12,18) afternoon, [18,24) evening. With right-closed bins, hours 6, 12
    # and 18 would be assigned to the previous bucket.
    df_featured['hour_category'] = pd.cut(df_featured.index.hour,
                                         bins=[0, 6, 12, 18, 24],
                                         labels=['night', 'morning', 'afternoon', 'evening'],
                                         right=False)

    # One-hot encode hour categories
    hour_dummies = pd.get_dummies(df_featured['hour_category'], prefix='hour_cat')
    df_featured = pd.concat([df_featured, hour_dummies], axis=1)
    df_featured.drop('hour_category', axis=1, inplace=True)

    # 9. Peak detection features
    # A peak can only be confirmed one hour after it happened, so the flag at
    # row t says "the previous hour was a local maximum". Comparing against
    # shift(-1) would read the next hour's PM2.5, i.e. the value being forecast.
    previous_pm25 = df_featured['pm25_value'].shift(1)
    df_featured['is_pm25_local_peak'] = ((previous_pm25 > df_featured['pm25_value'].shift(2)) &
                                        (previous_pm25 > df_featured['pm25_value'])).astype(int)

    # 10. Exponential moving averages (better for trend following)
    df_featured['pm25_ema_6h'] = df_featured['pm25_value'].ewm(span=6).mean()
    df_featured['pm25_ema_24h'] = df_featured['pm25_value'].ewm(span=24).mean()
    df_featured['temp_ema_12h'] = df_featured['temp'].ewm(span=12).mean()

    # Turn infinities (e.g. pct_change against a zero reading) into NaN so the
    # dropna below removes those rows as well
    df_featured = df_featured.replace([np.inf, -np.inf], np.nan)

    # Drop rows with NaNs created by feature engineering
    initial_shape = df_featured.shape[0]
    df_featured.dropna(inplace=True)
    final_shape = df_featured.shape[0]

    print(f"Feature engineering complete. Dropped {initial_shape - final_shape} rows with NaN values.")
    print(f"Total features created: {len(df_featured.columns) - len(data_df.columns)}")

    return df_featured

In [None]:
# Run the feature pipeline over the full dataset before any train/test split
print("\n--- Adding Enhanced Features to the entire dataset ---")
df_featured = add_enhanced_features(df)
print(f"Shape after enhanced features and cleaning: {df_featured.shape}")
# Reports the total number of columns in the featured frame
print(f"New feature count: {df_featured.shape[1]}")

In [None]:
# --- Chronological Train/Test Split on the *featured* DataFrame ---
# The first 80% of rows (in stored order) form the training set, the rest the test set.
print("\n--- Performing Chronological Train/Test Split ---")
train_size = int(0.8 * len(df_featured))
train_df, test_df = (
    df_featured.iloc[:train_size].copy(),
    df_featured.iloc[train_size:].copy(),
)
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

# Model inputs: every column except the raw pm25_value and anything named like a target
features_for_scaling = [
    col for col in train_df.columns
    if not (col == 'pm25_value' or 'target' in col)
]
print(f"Number of features for modeling: {len(features_for_scaling)}")

In [None]:
# --- Feature Scaling ---
# The scaler learns its ranges from the training split only and is then applied
# unchanged to both splits.
print("\n--- Scaling Features (MinMaxScaler on Train) ---")
scaler_x = MinMaxScaler()
scaler_x.fit(train_df[features_for_scaling])
train_df[features_for_scaling] = scaler_x.transform(train_df[features_for_scaling])
test_df[features_for_scaling] = scaler_x.transform(test_df[features_for_scaling])

# Persist the fitted scaler next to the other project artifacts
joblib.dump(scaler_x, '/content/drive/MyDrive/AI_Sustainability_Project_lsa/scaler_x.pkl')
print("Features scaled. Scaler saved.")

In [None]:
# --- Model Training Loop for Multiple Horizons ---
# For each forecast horizon this cell builds the target, tunes and saves a
# Random Forest, then trains and saves an LSTM together with its target scaler.
horizons = [1, 3, 6, 12, 24]
print(f"\n--- Training Models for Horizons: {horizons} ---")


def _make_horizon_target(frame, hours):
    """
    Return the pm25_value found `hours` rows ahead, or NaN where that row is
    not exactly `hours` hours later.

    add_enhanced_features() drops rows containing NaN/inf, so the featured
    frames may have gaps in time. A plain row shift would then pair features
    with a PM2.5 value from a different horizon; those rows are masked to NaN
    here so the caller's dropna removes them.

    Args:
        frame: DataFrame with a datetime index and a 'pm25_value' column.
        hours: Positive forecast horizon in hours.

    Returns:
        A Series aligned to `frame` holding the horizon target (NaN where unavailable).
    """
    shifted = frame['pm25_value'].shift(-hours)
    stamps = frame.index.values
    is_exact = np.zeros(len(frame), dtype=bool)
    is_exact[:-hours] = (stamps[hours:] - stamps[:-hours]) == np.timedelta64(hours, 'h')
    return shifted.where(is_exact)


for h in horizons:
    print(f"\n--- Processing Horizon: {h} hours ---")

    # Release Keras global state left over from the previous horizon's model so
    # memory does not grow with every model built in this loop
    tf.keras.backend.clear_session()

    # Create target variable from the original (unscaled) pm25_value
    train_df['target_h'] = _make_horizon_target(train_df, h)
    test_df['target_h'] = _make_horizon_target(test_df, h)

    # Drop rows where the target is NaN (end of the split or a gap in time)
    train_h = train_df.dropna(subset=['target_h'])
    test_h = test_df.dropna(subset=['target_h'])

    X_train = train_h[features_for_scaling]
    y_train = train_h['target_h']
    X_test = test_h[features_for_scaling]
    y_test = test_h['target_h']

    print(f"Training data shape: X_train {X_train.shape}, y_train {y_train.shape}")
    print(f"Target variance: {y_train.var():.4f} (should be > 0.1 for meaningful predictions)")

    # Scale target for LSTM only (the Random Forest is trained on the raw target)
    scaler_y = MinMaxScaler()
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
    joblib.dump(scaler_y, f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/scaler_y_h{h}.pkl')
    print(f"Target scaled for LSTM (horizon {h}). Scaler saved.")

    # Time Series Split for Cross-Validation (folds keep chronological order)
    tscv = TimeSeriesSplit(n_splits=5)

    # Random Forest Regressor
    print(f"Training RandomForestRegressor for horizon {h}...")
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    param_dist_rf = {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 15, 20, None],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }
    rf_search = RandomizedSearchCV(rf, param_dist_rf, cv=tscv, scoring='neg_mean_squared_error', n_iter=10, verbose=0, random_state=42)
    rf_search.fit(X_train, y_train)
    joblib.dump(rf_search.best_estimator_, f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/rf_model_h{h}.pkl')
    print(f"RF trained and saved. Best params: {rf_search.best_params_}")

    # LSTM with improved architecture
    print(f"Training LSTM for horizon {h}...")
    # Shape (samples, 1 timestep, features); the explicit float32 cast guarantees
    # a numeric array even if some feature columns carry a non-float dtype
    X_train_lstm = X_train.to_numpy(dtype=np.float32).reshape(X_train.shape[0], 1, X_train.shape[1])

    # Enhanced LSTM architecture
    model_lstm = Sequential([
        LSTM(128, activation='relu', input_shape=(1, X_train.shape[1]), return_sequences=True),
        Dropout(0.3),
        LSTM(64, activation='relu', return_sequences=True),
        Dropout(0.3),
        LSTM(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])

    model_lstm.compile(optimizer='adam', loss='mse', metrics=['mae'])
    early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

    history = model_lstm.fit(
        X_train_lstm, y_train_scaled,
        epochs=150,
        batch_size=64,
        validation_split=0.2,
        callbacks=[early_stop],
        verbose=1
    )

    model_lstm.save(f'/content/drive/MyDrive/AI_Sustainability_Project_lsa/lstm_model_h{h}.keras')
    print(f"LSTM trained and saved. Best val_loss: {min(history.history['val_loss']):.4f}")

    # Clean up target column for next iteration
    train_df.drop('target_h', axis=1, inplace=True, errors='ignore')
    test_df.drop('target_h', axis=1, inplace=True, errors='ignore')

In [None]:
# Persist both featured splits so the next notebook can evaluate the saved models
for split_frame, split_csv_path in (
    (train_df, '/content/drive/MyDrive/AI_Sustainability_Project_lsa/train_featured_data.csv'),
    (test_df, '/content/drive/MyDrive/AI_Sustainability_Project_lsa/test_featured_data.csv'),
):
    split_frame.to_csv(split_csv_path)
print("\nTrain and Test featured data saved for evaluation/compression in later notebooks.")

# Closing status messages
print("\n--- AI Modelling Complete ---")
print("Enhanced models with improved temporal features trained and saved.")
print("This should resolve the straight-line prediction issue in Notebook 4.")