In [None]:
# === Imports ===
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# Attach Google Drive to the Colab runtime; the dataset CSVs are read from it.
from google.colab import drive

_DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(_DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the train/test CSVs from the mounted Drive folder.
# Both paths are derived from a single base directory so they stay consistent
# (no stray leading '//' on one of them) and can be changed in one place.
DATA_DIR = '/content/drive/MyDrive/Time_Series_Assessment/data'
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Remove the "No" column from both frames so it is not picked up as a feature.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell
# a second time raises KeyError because the column is already gone.
train = train.drop(columns=["No"], errors="ignore")
test = test.drop(columns=["No"], errors="ignore")

In [None]:
# Fill gaps in the pm2.5 target by linear interpolation, then discard any rows
# whose pm2.5 is still missing afterwards and renumber the index from zero.
train = (
    train
    .assign(**{"pm2.5": train["pm2.5"].interpolate(method="linear")})
    .dropna(subset=["pm2.5"])
    .reset_index(drop=True)
)

In [None]:
# Convert the "datetime" column of both frames with pd.to_datetime,
# then report the resulting frame shapes.
for _frame in (train, test):
    _frame["datetime"] = pd.to_datetime(_frame["datetime"])

print("Train shape after cleaning:", train.shape)
print("Test shape:", test.shape)

Train shape after cleaning: (30652, 11)
Test shape: (13148, 10)


In [None]:
# Standardize model inputs and the target with two separate scalers, so that
# predictions can later be mapped back to pm2.5 units via y_scaler alone.
_non_feature_cols = ("datetime", "pm2.5")
feature_cols = [col for col in train.columns if col not in _non_feature_cols]

# Inputs: every remaining column, fitted and transformed on the training frame
feature_scaler = StandardScaler()
_raw_features = train[feature_cols].values
X_data = feature_scaler.fit_transform(_raw_features)

# Target: scaled as a single-column 2-D array, then flattened back to 1-D
y_scaler = StandardScaler()
_raw_target = train[["pm2.5"]].values
y_data = y_scaler.fit_transform(_raw_target).flatten()

In [None]:
SEQ_LEN = 24  # number of consecutive rows the model sees per sample


def make_windows(features, targets, seq_len):
    """Slice aligned (window, target) pairs out of two row-aligned arrays.

    Each sample is ``seq_len`` consecutive rows of ``features``; its label is
    the entry of ``targets`` belonging to the LAST row of that window. This is
    the same alignment used at prediction time for the test set, where every
    window ends on the row being predicted. Labelling a window with the row
    *after* it would train the model on a different task than the one it is
    asked to perform at inference.

    Args:
        features: 2-D array, one row per time step.
        targets: 1-D array with one value per row of ``features``.
        seq_len: window length in rows.

    Returns:
        Tuple ``(X, y)`` of arrays with shapes
        ``(n_windows, seq_len, n_features)`` and ``(n_windows,)``, where
        ``n_windows = len(features) - seq_len + 1``.

    Raises:
        ValueError: if there are fewer than ``seq_len`` rows.
    """
    n_windows = len(features) - seq_len + 1
    if n_windows <= 0:
        raise ValueError(
            f"Need at least {seq_len} rows to build a window, got {len(features)}"
        )
    X = np.array([features[start:start + seq_len] for start in range(n_windows)])
    y = np.array([targets[start + seq_len - 1] for start in range(n_windows)])
    return X, y


X_seq, y_seq = make_windows(X_data, y_data, SEQ_LEN)

In [None]:
# Chronological 80/20 split: the first 80% of windows train the model,
# the remaining windows are held out for validation (no shuffling).
split_idx = int(len(X_seq) * 0.8)
X_train, X_val = np.split(X_seq, [split_idx])
y_train, y_val = np.split(y_seq, [split_idx])

print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)

X_train shape: (24502, 24, 9)
X_val shape: (6126, 24, 9)


In [None]:
# Build the LSTM model: two stacked LSTM layers with dropout after each,
# followed by a single linear output unit for the scaled pm2.5 value.
# The input shape is declared with an explicit Input object rather than an
# `input_shape=` argument on the first layer, which Keras warns against for
# Sequential models.
model = Sequential([
    tf.keras.Input(shape=(SEQ_LEN, len(feature_cols))),
    LSTM(64, return_sequences=True),  # emit the full sequence for the next LSTM
    Dropout(0.2),
    LSTM(32),                         # only the final state goes to the head
    Dropout(0.2),
    Dense(1),
])

def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions for one batch.

    Both tensors are flattened to 1-D before subtracting. The model output has
    shape (batch, 1) while the targets are supplied as (batch,); subtracting
    them directly broadcasts to a (batch, batch) matrix of all pairwise
    differences, which makes the metric disagree with sqrt(mse loss).

    Args:
        y_true: target values for the batch.
        y_pred: model predictions for the batch.

    Returns:
        Scalar tensor holding the batch RMSE (in scaled-target units).
    """
    y_pred = tf.reshape(y_pred, [-1])
    # Match dtypes so the subtraction is valid whatever dtype the targets have
    y_true = tf.cast(tf.reshape(y_true, [-1]), y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

# Optimize MSE on the scaled target; rmse is reported alongside it per batch
model.compile(optimizer=Adam(learning_rate=0.001), loss="mse", metrics=[rmse])

  super().__init__(**kwargs)


In [None]:
# Train with early stopping on the validation loss. Training loss can keep
# falling while validation loss rises; restore_best_weights makes sure the
# model used afterwards is the one from the best validation epoch rather than
# whatever the last epoch happened to produce.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1,
)

Epoch 1/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 37ms/step - loss: 0.5759 - rmse: 1.0994 - val_loss: 0.7176 - val_rmse: 1.0021
Epoch 2/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 37ms/step - loss: 0.4347 - rmse: 1.1585 - val_loss: 0.7586 - val_rmse: 1.0556
Epoch 3/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 36ms/step - loss: 0.4032 - rmse: 1.1780 - val_loss: 0.8456 - val_rmse: 1.0835
Epoch 4/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 36ms/step - loss: 0.3714 - rmse: 1.1930 - val_loss: 0.7291 - val_rmse: 1.0618
Epoch 5/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 36ms/step - loss: 0.3333 - rmse: 1.1968 - val_loss: 0.8084 - val_rmse: 1.1099
Epoch 6/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 36ms/step - loss: 0.3178 - rmse: 1.2199 - val_loss: 0.8261 - val_rmse: 1.1010
Epoch 7/10
[1m383/383[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [None]:
# Validation set: predict, then map both predictions and targets back to the
# original pm2.5 scale so the RMSE is reported in the target's own units.
y_val_pred_scaled = model.predict(X_val).flatten()
y_val_pred = y_scaler.inverse_transform(y_val_pred_scaled.reshape(-1, 1)).flatten()
y_val_true = y_scaler.inverse_transform(y_val.reshape(-1, 1)).flatten()

# mean_squared_error and np come from the notebook's import cell
rmse_val = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
print("Validation RMSE on original scale:", rmse_val)


[1m192/192[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step
Validation RMSE on original scale: 83.77214409315386


In [None]:
# Prepare test sequences: one SEQ_LEN-row window per test row, ending on that row.
# The model was trained on standardized inputs, so the test features must pass
# through the scaler that was fitted on the training data (transform only,
# never re-fit on test data).
test_data = feature_scaler.transform(test[feature_cols].values)

# Left-pad with SEQ_LEN-1 zero rows so the earliest test rows still get a
# full-length window. In standardized space, zero is each feature's training mean.
front_pad = np.zeros((SEQ_LEN - 1, test_data.shape[1]))
padded = np.vstack([front_pad, test_data])

# Window i covers padded rows i .. i+SEQ_LEN-1, i.e. test rows i-SEQ_LEN+1 .. i
X_test = np.array(
    [padded[i:i + SEQ_LEN] for i in range(len(test_data))],
    dtype=np.float32,
)

# Run the model on the test windows and undo the target scaling
y_test_pred_scaled = model.predict(X_test).flatten()
y_test_pred = y_scaler.inverse_transform(
    y_test_pred_scaled.reshape(-1,1)
).flatten()

# Assemble the submission table: one formatted timestamp id per prediction.
# NOTE: "%-H" (hour without zero padding) is a platform-specific strftime flag.
row_ids = test["datetime"].dt.strftime("%Y-%m-%d %-H:%M:%S")
submission = pd.DataFrame({"row ID": row_ids, "pm2.5": y_test_pred})

# Write the CSV to Drive without the index column and confirm its shape
submission_path = "/content/drive/MyDrive/Time_Series_Assessment/submission.csv"
submission.to_csv(submission_path, index=False)
print("submission saved:", submission.shape)


[1m411/411[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step
✅ submission saved: (13148, 2)
