# LSTM with Random Forest

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Attention, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Load Dataset
file_path = "/kaggle/input/beijing/Beijing.csv"  # Replace with your file path
df = pd.read_csv(file_path)
# df = df[:50000]
# Explore Dataset
print(df.head())

# Handle Missing Values
# `fillna(method=...)` is deprecated in pandas 2.x; `ffill`/`bfill` are the
# direct replacements and behave identically.
df.ffill(inplace=True)  # Forward-fill for missing values
df.bfill(inplace=True)  # Backward-fill as a fallback (leading NaNs)

# Encode Categorical Data
# One encoder per column so each mapping can still be inverted later.
wd_encoder = LabelEncoder()
df['wd'] = wd_encoder.fit_transform(df['wd'])  # Encode wind direction
station_encoder = LabelEncoder()
df['station'] = station_encoder.fit_transform(df['station'])
# Backward-compatible alias: previously a single encoder was re-fit and ended
# up holding the station mapping.
label_encoder = station_encoder

# Feature Engineering
df['datetime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
df.set_index('datetime', inplace=True)

# NOTE(review): the lag/rolling features and the sliding windows below operate
# on row order. If the CSV concatenates several stations, values will cross
# station boundaries -- confirm the file layout, or group by 'station' first.

# Lag Features (e.g., PM2.5 of previous hours)
lags = [1, 3, 6, 12, 24]
for lag in lags:
    df[f'PM2.5_lag_{lag}'] = df['PM2.5'].shift(lag)

# Rolling Statistics (3-row window ending at the current row)
df['PM2.5_roll_mean'] = df['PM2.5'].rolling(window=3).mean()
df['PM2.5_roll_std'] = df['PM2.5'].rolling(window=3).std()

# Drop Rows with NaNs from Lag Features
df.dropna(inplace=True)

# Select Features and Target
features = ['PM10', 'SO2', 'NO2', 'CO', 'O3', 'TEMP', 'PRES', 'DEWP', 'RAIN', 'wd', 'WSPM', 'station'] + \
           [f'PM2.5_lag_{lag}' for lag in lags] + ['PM2.5_roll_mean', 'PM2.5_roll_std']
target = 'PM2.5'

X = df[features]
y = df[target]

# Window / split geometry
timesteps = 24   # Use last 24 hours for prediction
test_size = 0.2  # Fraction of windows held out (chronologically last)

n_windows = len(X) - timesteps
if n_windows < 2:
    raise ValueError(
        f"Need more than {timesteps + 1} rows after preprocessing to build "
        f"train and test windows, got {len(X)}."
    )
# Same arithmetic as sklearn's train_test_split: test gets the ceiling.
n_train_windows = n_windows - int(np.ceil(test_size * n_windows))
if n_train_windows < 1:
    raise ValueError("Not enough windows to leave any training data.")
# Window k covers rows [k, k + timesteps) and targets row k + timesteps, so the
# training windows touch exactly the first `n_train_windows + timesteps` rows.
n_train_rows = n_train_windows + timesteps

# Normalize Data
# Fit the scalers on the training rows only so test-set min/max do not leak
# into preprocessing; then apply the same transform to every row.
scaler_X = MinMaxScaler()
scaler_X.fit(X.iloc[:n_train_rows])
X_scaled = scaler_X.transform(X)

y_values = y.values.reshape(-1, 1)
scaler_y = MinMaxScaler()
scaler_y.fit(y_values[:n_train_rows])
y_scaled = scaler_y.transform(y_values)

# Reshape for LSTM (Samples, Timesteps, Features)
# Each sample holds the `timesteps` rows preceding its target row.
X_lstm = np.array([X_scaled[i - timesteps:i] for i in range(timesteps, len(X_scaled))])
y_lstm = y_scaled[timesteps:]

# Train-Test Split
# Chronological split (equivalent to train_test_split(..., shuffle=False)),
# sliced explicitly so it shares the boundary the scalers were fit on.
X_train, X_test = X_lstm[:n_train_windows], X_lstm[n_train_windows:]
y_train, y_test = y_lstm[:n_train_windows], y_lstm[n_train_windows:]


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# LSTM Feature Extraction
def build_lstm_feature_model(timesteps, features):
    """Build and compile a single-layer LSTM regressor.

    Args:
        timesteps: Length of each input sequence.
        features: Number of feature columns per timestep.

    Returns:
        A compiled Keras ``Model`` mapping ``(timesteps, features)`` sequences
        to one linear output, using the Adam optimizer, MSE loss and an MAE
        metric.
    """
    sequence_in = Input(shape=(timesteps, features))

    # LSTM keeps only its final state, which feeds a small dense head.
    hidden = LSTM(128, return_sequences=False)(sequence_in)
    hidden = Dense(64, activation='relu')(hidden)
    prediction = Dense(1, activation='linear')(hidden)

    network = Model(sequence_in, prediction)
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Build and Train LSTM
lstm_model = build_lstm_feature_model(timesteps, X_train.shape[2])
# NOTE(review): the test split doubles as validation data here. It is only
# monitored (no callbacks act on it), but a separate validation split would be
# needed before adding early stopping or LR scheduling.
lstm_model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=32,
    verbose=1
)

# LSTM outputs used as a stacked feature
# `predict` returns the model's output (one column per sample), which is fed
# to the Random Forest alongside the raw features.
lstm_train_features = lstm_model.predict(X_train)
lstm_test_features = lstm_model.predict(X_test)

# Combine LSTM Outputs with Original Features
# Only the last timestep of each window is kept for the tabular features.
X_train_combined = np.hstack([lstm_train_features, X_train[:, -1, :]])
X_test_combined = np.hstack([lstm_test_features, X_test[:, -1, :]])

# Train Random Forest Regressor
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train_combined, y_train.ravel())

# Make Predictions
y_pred = rf.predict(X_test_combined)

# Rescale Predictions and Ground Truth back to original target units
y_pred_rescaled = scaler_y.inverse_transform(y_pred.reshape(-1, 1))
y_test_rescaled = scaler_y.inverse_transform(y_test)

# Evaluate Performance
r2 = r2_score(y_test_rescaled, y_pred_rescaled)
mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
mse = mean_squared_error(y_test_rescaled, y_pred_rescaled)
rmse = np.sqrt(mse)
# MAPE on the rescaled values (the previous expression referenced an undefined
# `y_true` and the still-scaled `y_pred`). Zero targets are excluded because
# the percentage error is undefined there.
nonzero_target = y_test_rescaled != 0
mape = np.mean(
    np.abs(
        (y_test_rescaled[nonzero_target] - y_pred_rescaled[nonzero_target])
        / y_test_rescaled[nonzero_target]
    )
) * 100

print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}")


## Results of the above model

R² Score: 0.9562 <br>
MAE: 9.8764 <br>
MSE: 312.4317 <br>
RMSE: 17.6757 <br>
MAPE: 18.2978