# Notebook 3: AI Modelling
## Introduction
# Loads data from Notebook 1, adds features, trains RF and LSTM for horizons 1,3,6,12,24h.
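# Justification: RF captures non-linear relationships and exposes feature importances; LSTM models temporal sequences. The 1-24 h horizons cover short-term, near-real-time forecasting needs. TimeSeriesSplit keeps every validation fold later in time than its training fold, which avoids look-ahead leakage during cross-validation.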


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

# Load the processed hourly PM2.5 + weather table. The first CSV column is used
# as the index and parsed as dates (df.index.hour is read further down).
df = pd.read_csv(
    '/content/drive/MyDrive/sus-lsa/sensor_12178556_Singapore_pm25_weather_hourly_data_processed_capped.csv',
    index_col=0,
    parse_dates=True,
)

# Feature Engineering
# Lagged copies of PM2.5 and temperature, each shifted down by `lag` rows.
lags = [1, 3, 6, 12, 24]
for lag in lags:
    df[f'pm25_lag_{lag}'] = df['pm25_value'].shift(periods=lag)
    df[f'temp_lag_{lag}'] = df['temp'].shift(periods=lag)  # Other weather vars could be lagged the same way

# Rolling mean over a 24-row window. Justification: captures daily trends.
df['pm25_rolling_24'] = df['pm25_value'].rolling(window=24).mean()

# Sine encoding of the hour of day (period of 24).
hour_of_day = df.index.hour
df['hour_sin'] = np.sin(2 * np.pi * hour_of_day / 24)

# Interaction term: humidity multiplied by temperature.
df['humidity_temp_interact'] = df['humidity'].mul(df['temp'])

# Drop every row that contains a NaN (the lag and rolling features leave the
# leading rows undefined), reporting the table before and after.
print(f"Shape before cleaning: {df.shape}")
print(f"NaN counts before cleaning:\n{df.isna().sum()}")
df.dropna(inplace=True)
print(f"Shape after cleaning: {df.shape}")
print(f"NaN counts after cleaning:\n{df.isna().sum()}")

# Turn +/-inf into NaN and drop those rows as well, so no infinite value
# reaches the scaler or the models.
df.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)
df.dropna(inplace=True)
print(f"Final shape: {df.shape}")

# Scaling for features: every column except 'pm25_value' is min-max scaled in
# place, and the fitted scaler is persisted next to the data.
# NOTE(review): the scaler is fitted on all rows, including rows that are later
# used for validation -- confirm this is acceptable for the reported scores.
features = df.columns.drop('pm25_value').tolist()
scaler_x = MinMaxScaler().fit(df[features])
df[features] = scaler_x.transform(df[features])
joblib.dump(scaler_x, '/content/drive/MyDrive/sus-lsa/scaler_x.pkl')

# Modeling loop
# For every forecast horizon h (in rows; hours on this hourly table) the loop:
#   1. builds the target as 'pm25_value' shifted h rows into the future,
#   2. fits and saves a per-horizon MinMaxScaler for the target,
#   3. tunes a RandomForestRegressor with a randomized search over
#      time-ordered CV folds and saves the refitted best estimator,
#   4. trains a single-timestep LSTM with early stopping and saves it.
# All artifacts are written under /content/drive/MyDrive/sus-lsa/.
horizons = [1, 3, 6, 12, 24]
compression_results = []  # To store results (not populated in this cell)
for h in horizons:
    # shift(-h) pulls the value h rows ahead onto the current row; the last h
    # rows therefore have no target and are removed by dropna().
    df['target'] = df['pm25_value'].shift(-h)
    df_h = df.dropna()

    # Additional check for NaN values
    print(f"Horizon {h}: Shape before final check: {df_h.shape}")
    print(f"Horizon {h}: NaN counts: {df_h.isnull().sum().sum()}")

    X = df_h[features]
    scaler_y = MinMaxScaler()
    # Double brackets keep the target 2-D, as the scaler requires.
    y = scaler_y.fit_transform(df_h[['target']])

    # Verify no NaN in X and y; skip the horizon rather than train on bad data.
    if X.isnull().any().any() or np.isnan(y).any():
        print(f"Warning: NaN found in horizon {h} data!")
        continue

    joblib.dump(scaler_y, f'/content/drive/MyDrive/sus-lsa/scaler_y_h{h}.pkl')
    tscv = TimeSeriesSplit(n_splits=5)  # Justification: Preserves temporal order.

    # RF with expanded params
    rf = RandomForestRegressor(random_state=42)
    param_dist = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [5, 10, 15, 20, None],
        'min_samples_leaf': [1, 2, 4],
    }
    # random_state seeds the sampling of the 10 candidate settings, so the
    # search (and hence the saved model) is reproducible across runs; the
    # forest itself was already seeded.
    search = RandomizedSearchCV(
        rf,
        param_dist,
        cv=tscv,
        scoring='neg_mean_squared_error',
        n_iter=10,
        random_state=42,
    )
    search.fit(X, y.ravel())  # ravel(): the regressor expects a 1-D target
    joblib.dump(search.best_estimator_, f'/content/drive/MyDrive/sus-lsa/rf_model_h{h}.pkl')

    # LSTM with implementation=1, validation_split, and y scaled
    # Each row becomes a sequence of length 1: (samples, 1, n_features).
    X_lstm = np.reshape(X.values, (X.shape[0], 1, X.shape[1]))
    # The input shape is declared with an explicit Input object instead of
    # input_shape= on the first layer, which Keras warns about in Sequential
    # models.
    model = Sequential([
        tf.keras.Input(shape=(1, X.shape[1])),
        LSTM(50, implementation=1),
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse')
    # Monitor val_loss. restore_best_weights=True rolls the model back to the
    # epoch with the lowest val_loss; without it the weights that get saved
    # are those from `patience` epochs after the best one.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    # validation_split holds out the final 20% of the rows for validation.
    model.fit(X_lstm, y, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])
    model.save(f'/content/drive/MyDrive/sus-lsa/lstm_model_h{h}.h5')

# NOTE(review): df is written with its scaled feature columns and still carries
# the 'target' column assigned in the last loop iteration -- confirm that the
# downstream notebooks expect this layout.
df.to_csv('/content/drive/MyDrive/sus-lsa/featured_data.csv')  # Save for next notebooks

Shape before cleaning: (4049, 19)
NaN counts before cleaning:
pm25_value                 0
temp                       0
humidity                   0
wind_speed                 0
wind_dir                   0
precipitation              0
pm25_lag_1                 1
temp_lag_1                 1
pm25_lag_3                 3
temp_lag_3                 3
pm25_lag_6                 6
temp_lag_6                 6
pm25_lag_12               12
temp_lag_12               12
pm25_lag_24               24
temp_lag_24               24
pm25_rolling_24           23
hour_sin                   0
humidity_temp_interact     0
dtype: int64
Shape after cleaning: (4025, 19)
NaN counts after cleaning:
pm25_value                0
temp                      0
humidity                  0
wind_speed                0
wind_dir                  0
precipitation             0
pm25_lag_1                0
temp_lag_1                0
pm25_lag_3                0
temp_lag_3                0
pm25_lag_6                0
temp_l

  super().__init__(**kwargs)


Epoch 1/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - loss: 0.3828 - val_loss: 0.0017
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0052 - val_loss: 8.5742e-04
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0033 - val_loss: 5.6921e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0022 - val_loss: 4.5226e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0020 - val_loss: 4.2009e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0023 - val_loss: 4.1995e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0023 - val_loss: 4.2737e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0026 - val_loss: 3.9935e-04
Epoch 9/50




Horizon 3: Shape before final check: (4022, 20)
Horizon 3: NaN counts: 0
Epoch 1/50


  super().__init__(**kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.2130 - val_loss: 0.0012
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0041 - val_loss: 8.8623e-04
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0034 - val_loss: 7.3383e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0032 - val_loss: 6.7222e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0025 - val_loss: 5.2197e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0028 - val_loss: 4.7934e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0025 - val_loss: 4.1326e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0033 - val_loss: 3.5893e-04
Epoch 9/50
[1m101/101



Horizon 6: Shape before final check: (4019, 20)
Horizon 6: NaN counts: 0
Epoch 1/50


  super().__init__(**kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - loss: 0.3669 - val_loss: 0.0025
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0074 - val_loss: 0.0012
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0049 - val_loss: 7.5492e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0038 - val_loss: 5.8485e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0044 - val_loss: 6.2630e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0029 - val_loss: 5.2584e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0029 - val_loss: 5.8077e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0038 - val_loss: 4.2929e-04
Epoch 9/50
[1m101/101[0m 



Horizon 12: Shape before final check: (4013, 20)
Horizon 12: NaN counts: 0
Epoch 1/50


  super().__init__(**kwargs)


[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.3665 - val_loss: 0.0032
Epoch 2/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0065 - val_loss: 0.0013
Epoch 3/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0034 - val_loss: 8.1287e-04
Epoch 4/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0043 - val_loss: 7.2475e-04
Epoch 5/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0034 - val_loss: 5.8877e-04
Epoch 6/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0027 - val_loss: 4.6164e-04
Epoch 7/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0025 - val_loss: 5.1439e-04
Epoch 8/50
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0025 - val_loss: 3.7201e-04
Epoch 9/50
[1m101/101[0m 



Horizon 24: Shape before final check: (4001, 20)
Horizon 24: NaN counts: 0
Epoch 1/50


  super().__init__(**kwargs)


[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 0.3625 - val_loss: 0.0020
Epoch 2/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0058 - val_loss: 0.0013
Epoch 3/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0046 - val_loss: 0.0012
Epoch 4/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0032 - val_loss: 8.5049e-04
Epoch 5/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0030 - val_loss: 7.7072e-04
Epoch 6/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0026 - val_loss: 5.9895e-04
Epoch 7/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0035 - val_loss: 4.9776e-04
Epoch 8/50
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.0023 - val_loss: 4.7220e-04
Epoch 9/50
[1m100/100[0m [32

