<a href="https://colab.research.google.com/github/Theekshana-se/Air-QualityPredict-DeepLearningProject/blob/main/IT21252990_GRU2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from keras.models import Sequential
from keras.layers import GRU, Dense, Dropout

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Load your cleaned dataset
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Deep-Learning-Project-main/DataSet/airData.csv',
    encoding='ISO-8859-1'
)

# Select useful columns only
df = df[['sampling_date', 'state', 'location', 'so2', 'no2', 'rspm']]
df['sampling_date'] = pd.to_datetime(df['sampling_date'], errors='coerce')

# Drop rows missing key values
df = df.dropna(subset=['sampling_date', 'location', 'so2', 'no2', 'rspm'])

# Sort for time-series consistency
df = df.sort_values(['location', 'sampling_date'])
df.reset_index(drop=True, inplace=True)

  df = pd.read_csv(
  df['sampling_date'] = pd.to_datetime(df['sampling_date'], errors='coerce')


In [4]:
le = LabelEncoder()
df['city_id'] = le.fit_transform(df['location'])
print(df[['location', 'city_id']].head(10))



     location  city_id
0  ANKLESHWAR        0
1  ANKLESHWAR        0
2  ANKLESHWAR        0
3  ANKLESHWAR        0
4  ANKLESHWAR        0
5  ANKLESHWAR        0
6  ANKLESHWAR        0
7  ANKLESHWAR        0
8  ANKLESHWAR        0
9  ANKLESHWAR        0


In [5]:
scaler = MinMaxScaler()
df[['so2', 'no2', 'rspm']] = scaler.fit_transform(df[['so2', 'no2', 'rspm']])


In [6]:
def create_sequences_for_all_cities(data, time_steps=10):
    X_all, y_all = [], []
    grouped = data.groupby('city_id')

    for city_id, group in grouped:
        group = group.sort_values('sampling_date')
        features = group[['so2', 'no2', 'rspm', 'city_id']].values
        for i in range(len(features) - time_steps):
            X_all.append(features[i:i+time_steps])
            y_all.append(features[i+time_steps, 2])  # target = RSPM
    return np.array(X_all), np.array(y_all)

X, y = create_sequences_for_all_cities(df, 10)

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (362191, 10, 4)
Shape of y: (362191,)


In [7]:
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

print("Train:", X_train.shape, " Test:", X_test.shape)


Train: (289752, 10, 4)  Test: (72439, 10, 4)


In [8]:
model = Sequential()
model.add(GRU(128, input_shape=(10, 4), return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')
model.summary()


  super().__init__(**kwargs)


In [None]:
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=64,
    validation_data=(X_test, y_test),
    verbose=1
)


Epoch 1/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 21ms/step - loss: 0.0817 - val_loss: 1.9285e-04
Epoch 2/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 21ms/step - loss: 1.2744e-04 - val_loss: 8.1308e-05
Epoch 3/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 21ms/step - loss: 1.0381e-04 - val_loss: 9.3143e-05
Epoch 4/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 21ms/step - loss: 7.9363e-05 - val_loss: 4.5628e-05
Epoch 5/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 21ms/step - loss: 7.3546e-05 - val_loss: 5.1834e-05
Epoch 6/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 21ms/step - loss: 7.3097e-05 - val_loss: 6.9185e-05
Epoch 7/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 21ms/step - loss: 7.6812e-05 - val_loss: 2.9736e-05
Epoch 8/50
[1m4528/4528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.title('Multi-City GRU Training vs Validation Loss')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R² Score: {r2:.4f}")


In [None]:
city_id = le.transform(['Mumbai'])[0]
city_df = df[df['city_id'] == city_id].copy()
city_df = city_df.sort_values('sampling_date')

last_seq = city_df[['so2', 'no2', 'rspm', 'city_id']].values[-10:]
future_preds = []

for _ in range(7):  # predict next 7 days
    pred = model.predict(last_seq.reshape(1, 10, 4))
    future_preds.append(pred[0,0])
    new_row = np.array([[0, 0, pred[0,0], city_id]])  # placeholders for so2,no2
    last_seq = np.concatenate((last_seq[1:], new_row), axis=0)


In [None]:
future_preds_real = scaler.inverse_transform(
    np.concatenate((np.zeros((len(future_preds), 2)), np.array(future_preds).reshape(-1,1)), axis=1)
)[:, 2]

print("Next 7 days RSPM predictions for Mumbai:")
print(future_preds_real)

In [None]:
last_date = city_df['sampling_date'].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=7)

plt.figure(figsize=(10,5))
plt.plot(future_dates, future_preds_real, marker='o', color='orange', linewidth=2)
plt.title(f'Predicted RSPM Levels for Next 7 Days — {le.inverse_transform([city_id])[0]}')
plt.xlabel('Date')
plt.ylabel('Predicted RSPM (µg/m³)')
plt.grid(True, linestyle='--', alpha=0.6)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()