# Notebook 3: AI Modelling
## Introduction
# Loads data from Notebook 1, adds features, trains RF and LSTM for horizons 1,3,6,12,24h.
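# Justification: RF captures non-linear relationships and exposes feature importances; LSTM models temporal sequences. Horizons align with real-time forecasting needs. TimeSeriesSplit keeps each validation fold later in time than its training fold, which limits look-ahead leakage during the RF hyperparameter search.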


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load the processed PM2.5 + weather table. The first CSV column becomes the
# index and is parsed as dates (the .hour accessor below relies on that).
df = pd.read_csv(
    '/content/drive/MyDrive/sus-lsa/sensor_12178556_Singapore_pm25_weather_hourly_data_processed_capped.csv',
    index_col=0,
    parse_dates=True,
)

# Feature Engineering
# One lagged PM2.5 column and one lagged temperature column per lag. The shift
# is counted in rows (presumably hours, given the file name -- TODO confirm the
# index has no gaps). Of the weather columns, only 'temp' is lagged here.
# Column creation order is kept stable because it determines feature order.
lags = [1, 3, 6, 12, 24]
for lag in lags:
    shifted = df[['pm25_value', 'temp']].shift(lag)
    df[f'pm25_lag_{lag}'] = shifted['pm25_value']
    df[f'temp_lag_{lag}'] = shifted['temp']

# Rolling mean over the current and previous 23 rows of PM2.5
# (Justification: captures daily trends).
df['pm25_rolling_24'] = df['pm25_value'].rolling(window=24).mean()

# Sine encoding of the hour of day taken from the datetime index.
hour_of_day = df.index.hour
df['hour_sin'] = np.sin(2 * np.pi * hour_of_day / 24)

# Simple interaction term between humidity and temperature.
df['humidity_temp_interact'] = df['humidity'].mul(df['temp'])

# Shifting and the rolling window leave NaNs in the leading rows; drop every
# row that contains any NaN.
df.dropna(inplace=True)

# Scaling for features
# Every column except the raw PM2.5 reading is a model input.
features = [name for name in df.columns if name != 'pm25_value']

# Fit a min-max scaler on those columns, overwrite them in place with the
# scaled values, and persist the fitted scaler to Drive.
# NOTE(review): the scaler is fitted on all rows of df, including rows that the
# modeling code later holds out for validation -- confirm this is acceptable.
scaler_x = MinMaxScaler().fit(df[features])
df[features] = scaler_x.transform(df[features])
joblib.dump(scaler_x, '/content/drive/MyDrive/sus-lsa/scaler_x.pkl')

# Modeling loop
# For each forecast horizon (counted in rows ahead), tune a Random Forest and
# train a one-timestep LSTM on the scaled features. Both models and the
# per-horizon target scaler are saved to Drive, followed by the feature table.
output_dir = '/content/drive/MyDrive/sus-lsa'
horizons = [1, 3, 6, 12, 24]
compression_results = []  # To store results (nothing in this cell appends to it)
for h in horizons:
    # Target = PM2.5 value h rows ahead. It is built on a per-horizon frame
    # instead of being written into df, so the table saved at the end does not
    # carry a leftover 'target' column (with a NaN tail) from the last horizon.
    df_h = df.assign(target=df['pm25_value'].shift(-h)).dropna()
    X = df_h[features]

    # Scale the target to [0, 1]; the scaler is saved so predictions can be
    # inverse-transformed later.
    # NOTE(review): scaler_y is fitted on every row of df_h, including the tail
    # that Keras uses as validation data below -- confirm this is acceptable.
    scaler_y = MinMaxScaler()
    y = scaler_y.fit_transform(df_h[['target']])
    joblib.dump(scaler_y, f'{output_dir}/scaler_y_h{h}.pkl')
    tscv = TimeSeriesSplit(n_splits=5)  # Justification: Preserves temporal order.

    # RF with expanded params
    rf = RandomForestRegressor(random_state=42)
    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [5, 10, 15, 20, None],
        'min_samples_leaf': [1, 2, 4],
    }
    # random_state fixes which 10 candidates are sampled, so the search is
    # reproducible (the forest itself was already seeded).
    search = RandomizedSearchCV(
        rf,
        param_dist,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_iter=10,
        random_state=42,
    )
    search.fit(X, y.ravel())
    joblib.dump(search.best_estimator_, f'{output_dir}/rf_model_h{h}.pkl')

    # LSTM with implementation=1, validation_split, and y scaled
    # Each sample is presented as a sequence of length 1: (samples, 1, features).
    X_lstm = np.reshape(X.values, (X.shape[0], 1, X.shape[1]))
    # An explicit Input replaces input_shape= on the first layer, which newer
    # Keras versions warn about.
    model = Sequential([
        tf.keras.Input(shape=(1, X.shape[1])),
        LSTM(50, implementation=1),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse')
    # Stop after 5 epochs without val_loss improvement and roll back to the
    # best epoch's weights, so the saved model is the best one seen rather
    # than the last one trained.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_lstm, y, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])
    model.save(f'{output_dir}/lstm_model_h{h}.h5')

# Save the feature table (scaled features plus the raw pm25_value column) for
# the next notebooks.
df.to_csv(f'{output_dir}/featured_data.csv')

  super().__init__(**kwargs)


Epoch 1/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 0.2901 - val_loss: 0.0015
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0032 - val_loss: 6.8941e-04
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0025 - val_loss: 7.7678e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0028 - val_loss: 6.2688e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0026 - val_loss: 7.5059e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0026 - val_loss: 5.7756e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0029 - val_loss: 4.0442e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0020 - val_loss: 3.8912e-04
Epoch 9/50




Epoch 1/50


  super().__init__(**kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.3025 - val_loss: 0.0021
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0066 - val_loss: 0.0010
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0037 - val_loss: 7.4098e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0036 - val_loss: 5.0272e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0026 - val_loss: 4.2452e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0025 - val_loss: 3.7528e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0024 - val_loss: 3.4799e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0033 - val_loss: 3.5844e-04
Epoch 9/50
[1m101/101[0m 



Epoch 1/50


  super().__init__(**kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.2920 - val_loss: 0.0032
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0058 - val_loss: 8.6683e-04
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0045 - val_loss: 6.8997e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0036 - val_loss: 6.1062e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0036 - val_loss: 5.0118e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0027 - val_loss: 4.8419e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0027 - val_loss: 5.7713e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0034 - val_loss: 3.3875e-04
Epoch 9/50
[1m101/101



Epoch 1/50


  super().__init__(**kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.3423 - val_loss: 0.0012
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0040 - val_loss: 6.7129e-04
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0032 - val_loss: 6.0174e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0023 - val_loss: 3.9444e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0026 - val_loss: 4.6506e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0044 - val_loss: 2.7141e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0032 - val_loss: 2.7678e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0037 - val_loss: 2.5101e-04
Epoch 9/50
[1m101/101



Epoch 1/50


  super().__init__(**kwargs)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.4692 - val_loss: 0.0021
Epoch 2/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0049 - val_loss: 8.8560e-04
Epoch 3/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0032 - val_loss: 5.5685e-04
Epoch 4/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0032 - val_loss: 4.4574e-04
Epoch 5/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0038 - val_loss: 4.6709e-04
Epoch 6/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0039 - val_loss: 3.8511e-04
Epoch 7/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0019 - val_loss: 3.5772e-04
Epoch 8/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0025 - val_loss: 3.3622e-04
Epoch 9/50
[1m100/100

