# Notebook 3: AI Modelling
## Introduction
# Loads data from Notebook 1, adds features, trains RF and LSTM for horizons 1,3,6,12,24h.
# Justification: RF for non-linear feature importance; LSTM for temporal sequences. Horizons align with real-time forecasting needs. TimeSeriesSplit prevents data leakage.


In [None]:
# Mount Google Drive
from google.colab import drive
import os

# Mount your Google Drive
drive.mount('/content/drive')

# Define your project folder in Google Drive
your_project_path = '/content/drive/My Drive/AI_Sustainability_Project_lsa'

# Create the project directory if it doesn't exist
os.makedirs(your_project_path, exist_ok=True)
print(f"Project path set to: {your_project_path}")

# Change current working directory to your project path
%cd "{your_project_path}"

# Verify current working directory
!pwd
!ls

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import warnings
# NOTE: called without a category or message filter, this silences *every*
# warning raised for the rest of the session (not only pandas'
# SettingWithCopyWarning), trading diagnostics for cleaner output.
warnings.filterwarnings('ignore')  # Suppress all warnings for cleaner output

# Mount Google Drive (essential for Colab to access saved files)
from google.colab import drive
drive.mount('/content/drive')

# Absolute Drive path of the processed dataset produced by Notebook 1
input_data_path = '/content/drive/MyDrive/AI_Sustainability_Project_lsa/sensor_12178556_Singapore_pm25_weather_hourly_data_processed_final.csv'

print("--- Starting AI Modelling (Notebook 3) ---")
print(f"Loading pre-processed data from: {input_data_path}")

try:
    # Read the CSV with 'timestamp' as the index and ask pandas to parse it as dates
    df = pd.read_csv(
        input_data_path,
        parse_dates=True,
        index_col='timestamp',
    )
    print(f"Data loaded successfully. Initial shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
except Exception as load_error:
    # Any failure here is fatal for the notebook: report it and stop execution.
    print(f"Error loading data from {input_data_path}: {load_error}")
    print("Please ensure Notebook 1 has run successfully and the file exists at the specified path in Google Drive.")
    raise SystemExit("Failed to load pre-processed data. Aborting Notebook 3 execution.")


# --- FIX: STEP 1: Feature Engineering on the entire DataFrame first ---
def add_features(data_df, lags=None, features_to_lag=None):
    """Return a copy of ``data_df`` with lag, rolling, cyclical and interaction features.

    The function is meant to run on the whole dataset (before any train/test
    split) so that lagged and rolling values stay continuous across the split
    boundary. The input frame is never modified.

    Added columns, in order:
      * ``{feature}_lag_{lag}`` for every feature in ``features_to_lag`` and
        every lag in ``lags`` (row shifts of the source column);
      * ``pm25_rolling_24`` / ``temp_rolling_24``: trailing 24-row means that
        include the current row (``min_periods=1``, so they never produce NaN
        by themselves);
      * ``hour_sin`` / ``hour_cos`` and ``day_of_week_sin`` / ``day_of_week_cos``:
        cyclical encodings derived from the index;
      * ``wind_humidity_interaction``: ``wind_speed * humidity``.

    Args:
        data_df: DataFrame whose index exposes ``hour`` and ``dayofweek``
            (e.g. a ``DatetimeIndex``) and that contains ``pm25_value``,
            ``temp``, ``humidity``, ``wind_speed`` plus every column named in
            ``features_to_lag``.
        lags: Iterable of row offsets to lag by. Defaults to
            ``(1, 3, 6, 12, 24, 48)``.
        features_to_lag: Iterable of column names to lag. Defaults to
            ``('pm25_value', 'temp', 'humidity', 'wind_speed', 'precipitation')``.

    Returns:
        A new DataFrame with the extra columns, with every row that contains
        any NaN removed (this drops the leading rows that have no lag history
        as well as rows that were already incomplete in the input).

    Raises:
        TypeError: If the index has no ``hour`` / ``dayofweek`` attributes,
            for instance because the timestamps were not parsed as dates.
        KeyError: If a required column is missing.
    """
    if lags is None:
        lags = (1, 3, 6, 12, 24, 48)
    if features_to_lag is None:
        features_to_lag = ('pm25_value', 'temp', 'humidity', 'wind_speed', 'precipitation')

    index = data_df.index
    if not (hasattr(index, 'hour') and hasattr(index, 'dayofweek')):
        raise TypeError(
            "add_features requires a datetime-like index exposing 'hour' and "
            f"'dayofweek'; got {type(index).__name__}"
        )

    df_featured = data_df.copy()  # work on a copy so the caller's frame is untouched

    # Lagged copies of each selected column
    for feature in features_to_lag:
        for lag in lags:
            df_featured[f'{feature}_lag_{lag}'] = df_featured[feature].shift(lag)

    # Trailing 24-row means (current row included)
    df_featured['pm25_rolling_24'] = df_featured['pm25_value'].rolling(window=24, min_periods=1).mean()
    df_featured['temp_rolling_24'] = df_featured['temp'].rolling(window=24, min_periods=1).mean()

    # Cyclical encodings so that hour 23 sits next to hour 0, and Sunday next to Monday
    df_featured['hour_sin'] = np.sin(2 * np.pi * df_featured.index.hour / 24)
    df_featured['hour_cos'] = np.cos(2 * np.pi * df_featured.index.hour / 24)
    df_featured['day_of_week_sin'] = np.sin(2 * np.pi * df_featured.index.dayofweek / 7)
    df_featured['day_of_week_cos'] = np.cos(2 * np.pi * df_featured.index.dayofweek / 7)

    df_featured['wind_humidity_interaction'] = df_featured['wind_speed'] * df_featured['humidity']

    # Drop rows containing NaNs (notably the leading rows without lag history)
    return df_featured.dropna()

print("\n--- Adding Features to the entire dataset --")
df_featured = add_features(df)
print(f"Shape after features and cleaning: {df_featured.shape}")


# --- STEP 2: chronological split of the *featured* frame (first 80% train, rest test) ---
print("\n--- Performing Chronological Train/Test Split --")
train_size = int(df_featured.shape[0] * 0.8)
train_df, test_df = (
    part.copy()
    for part in (df_featured.iloc[:train_size], df_featured.iloc[train_size:])
)
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")


# Every column is a model input except the raw pm25_value and anything named like a target
features_for_scaling = [
    name
    for name in train_df.columns
    if not (name == 'pm25_value' or 'target' in name)
]


# --- STEP 3: scaling - statistics come from the training rows only, then both sets are transformed ---
print("\n--- Scaling Features (MinMaxScaler on Train) ---")
scaler_x = MinMaxScaler()
scaler_x.fit(train_df[features_for_scaling])
train_df[features_for_scaling] = scaler_x.transform(train_df[features_for_scaling])
test_df[features_for_scaling] = scaler_x.transform(test_df[features_for_scaling])
# Persist the fitted scaler to the project folder on Drive
joblib.dump(scaler_x, '/content/drive/MyDrive/AI_Sustainability_Project_lsa/scaler_x.pkl')
print(f"Features scaled. Scaler saved to '/content/drive/MyDrive/AI_Sustainability_Project_lsa/scaler_x.pkl'.")


# --- Model Training Loop for Multiple Horizons ---
# For each forecast horizon h this loop:
#   1. builds the target as pm25_value shifted h rows into the future
#      (h hours, assuming the data is hourly as the input file name suggests),
#   2. fits and saves a per-horizon MinMaxScaler for the target (used by the LSTM only),
#   3. tunes a RandomForestRegressor with RandomizedSearchCV over TimeSeriesSplit
#      folds and saves the best estimator,
#   4. trains and saves a two-layer LSTM that receives one timestep per sample.
project_dir = '/content/drive/MyDrive/AI_Sustainability_Project_lsa'
horizons = [1, 3, 6, 12, 24]
print(f"\n--- Training Models for Horizons: {horizons} ---")

# Loop-invariant search configuration: built once rather than once per horizon.
tscv = TimeSeriesSplit(n_splits=5)  # ordered folds for cross-validation
param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20, None],
    'min_samples_leaf': [1, 2, 4]
}

for h in horizons:
    print(f"\n--- Processing Horizon: {h} hours ---")

    # Release the Keras global state left behind by the previous horizon's model
    # so memory use does not keep growing as models are built in this loop.
    tf.keras.backend.clear_session()

    # Create target variable by shifting the original pm25_value
    train_df['target_h'] = train_df['pm25_value'].shift(-h)
    test_df['target_h'] = test_df['pm25_value'].shift(-h)

    # Drop rows where the shifted target is now NaN (at the end of each dataframe)
    train_h = train_df.dropna(subset=['target_h'])
    test_h = test_df.dropna(subset=['target_h'])

    X_train = train_h[features_for_scaling]
    y_train = train_h['target_h']
    # The test split is prepared here but not consumed by this cell.
    X_test = test_h[features_for_scaling]
    y_test = test_h['target_h']

    # Scale target for LSTM only (the Random Forest is trained on the raw target)
    scaler_y = MinMaxScaler()
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
    joblib.dump(scaler_y, f'{project_dir}/scaler_y_h{h}.pkl')
    print(f"Target scaled for LSTM (horizon {h}). Scaler saved.")

    # Random Forest Regressor: 5 random parameter draws scored by negative MSE
    print(f"Training RandomForestRegressor for horizon {h}...")
    rf = RandomForestRegressor(random_state=42, n_jobs=-1)
    rf_search = RandomizedSearchCV(rf, param_dist_rf, cv=tscv, scoring='neg_mean_squared_error', n_iter=5, verbose=0, random_state=42)
    rf_search.fit(X_train, y_train)
    joblib.dump(rf_search.best_estimator_, f'{project_dir}/rf_model_h{h}.pkl')
    print(f"RF trained and saved. Best params: {rf_search.best_params_}")

    # LSTM: reshape to (samples, 1 timestep, features)
    print(f"Training LSTM for horizon {h}...")
    X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
    model_lstm = Sequential([
        LSTM(100, activation='relu', input_shape=(1, X_train.shape[1]), return_sequences=True),
        Dropout(0.2),
        LSTM(50, activation='relu'),
        Dropout(0.2),
        Dense(1)
    ])
    model_lstm.compile(optimizer='adam', loss='mse')
    # Stop once val_loss has not improved for 10 epochs and roll back to the best weights
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    history = model_lstm.fit(
        X_train_lstm, y_train_scaled,
        epochs=100,
        batch_size=32,
        validation_split=0.2, # Uses the last 20% of training data for validation
        callbacks=[early_stop],
        verbose=0
    )
    model_lstm.save(f'{project_dir}/lstm_model_h{h}.keras')
    print(f"LSTM trained and saved. Best val_loss: {min(history.history['val_loss']):.4f}")

# Save the final featured dataframes for evaluation in the next notebook.
# NOTE: both frames still carry the 'target_h' column written by the last loop
# iteration (horizon 24), including its trailing NaN rows.
train_df.to_csv(f'{project_dir}/train_featured_data.csv')
test_df.to_csv(f'{project_dir}/test_featured_data.csv')
print("\nTrain and Test featured data saved for evaluation/compression in later notebooks.")

print("\n--- AI Modelling Complete ---")
print("Trained models and scalers saved. Proceed to Notebook 4 for evaluation.")

In [None]:

# Save scalers and models; add GRU training
# NOTE(review): this cell uses names that none of the cells above define
# (train, scaler, models, layers, lookback, Xtr/ytr, Xva/yva, lstm, best_lstm).
# They presumably come from another version of the notebook - confirm they exist
# before running. Also, after the training loop above `h` is the integer 24, so
# `h.history` below only works if `h` has been rebound to a Keras History object.
import joblib

# Target scaler (fit on training pm25_value only)
scaler_y = MinMaxScaler()
scaler_y.fit(train[['pm25_value']].values)
joblib.dump(scaler, 'feature_scaler.pkl')
joblib.dump(scaler_y, 'target_scaler.pkl')

# Train GRU: a single 64-unit GRU layer followed by dropout and a linear output
gru = models.Sequential([
    layers.Input(shape=(lookback, Xtr.shape[-1])),
    layers.GRU(64, return_sequences=False),
    layers.Dropout(0.2),
    layers.Dense(1)
])
gru.compile(optimizer='adam', loss='mse')
h_gru = gru.fit(Xtr, ytr, validation_data=(Xva, yva), epochs=10, batch_size=64, verbose=0)

# Save horizon-specific models (assuming 1-step ahead with lookback window)
# NOTE(review): these are relative paths; if the working directory is the project
# folder, 'lstm_model_h24.keras' replaces the file saved by the training loop.
lstm.save('lstm_model_h24.keras')
gru.save('gru_model_h24.keras')

# Pick best based on validation loss (last-epoch value; 1e9 when none was recorded)
val_lstm = (h.history['val_loss'][-1] if 'val_loss' in h.history else 1e9)
val_gru = (h_gru.history['val_loss'][-1] if 'val_loss' in h_gru.history else 1e9)
(best_lstm if val_lstm <= val_gru else gru).save('best_lstm.keras')
print('Saved: feature_scaler.pkl, target_scaler.pkl, lstm_model_h24.keras, gru_model_h24.keras, best_lstm.keras')
