In [None]:
# 1️⃣ Imports
import os
import sys
import pandas as pd
import numpy as np
import joblib, pickle, zipfile
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet

In [None]:
# 2️⃣ Settings and Folder Setup

DATA_PATH = "/content/cleaned_air_quality.csv"   # path to your dataset
MODEL_DIR = "/content/models"
MIN_ROWS_TO_TRAIN = 50   # skip tiny stations
np.random.seed(42)

# Ensure model folder and working directory
os.chdir("/content")
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)
print("Model folder ready at:", MODEL_DIR)

Model folder ready at: /content/models


In [None]:
# 3️⃣ Load and Prepare Dataset

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset not found at {DATA_PATH} — upload it first.")

data = pd.read_csv(DATA_PATH)
print("Loaded dataset shape:", data.shape)
print("Columns:", list(data.columns))

# Confirm required columns
if "City" not in data.columns:
    raise KeyError("Column 'City' not found in dataset.")
TARGET = "PM2.5"
if TARGET not in data.columns:
    raise KeyError(f"Target column '{TARGET}' not found in dataset.")

# Handle Date column
if "Date" in data.columns:
    data["Date"] = pd.to_datetime(data["Date"], errors="coerce")
else:
    data = data.reset_index().rename(columns={"index": "Date"})
    data["Date"] = pd.date_range(start="2020-01-01", periods=len(data))

# Drop missing and sort
data[TARGET] = pd.to_numeric(data[TARGET], errors="coerce")
data = data.dropna(subset=[TARGET]).sort_values(by=["City", "Date"]).reset_index(drop=True)
print("After dropna:", data.shape)

Loaded dataset shape: (29531, 16)
Columns: ['City', 'Date', 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene', 'AQI', 'AQI_Bucket']
After dropna: (29531, 16)


In [None]:
# 4️⃣ Utility Functions

def evaluate_model(actual, predicted):
    try:
        mae = mean_absolute_error(actual, predicted)
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        return float(mae), float(rmse)
    except Exception:
        return (np.nan, np.nan)

def train_models_for_series(series):
    results, models = {}, {}

    if len(series) < 10:
        return models, results

    # Split
    n = len(series)
    split = int(0.8 * n)
    train, test = series.iloc[:split].copy(), series.iloc[split:].copy()
    actual = test.values

    # ARIMA
    try:
        arima_model = ARIMA(train, order=(2, 1, 2))
        arima_fit = arima_model.fit()
        pred = arima_fit.forecast(steps=len(test))
        models["ARIMA"] = arima_fit
        results["ARIMA"] = evaluate_model(actual, pred)
    except Exception:
        results["ARIMA"] = (np.nan, np.nan)

    # Prophet
    try:
        dfp = pd.DataFrame({"ds": pd.date_range(start="2020-01-01", periods=len(series)), "y": series.values})
        splitp = int(0.8 * len(dfp))
        trainp, testp = dfp.iloc[:splitp], dfp.iloc[splitp:]
        prophet_model = Prophet()
        prophet_model.fit(trainp)
        future = prophet_model.make_future_dataframe(periods=len(testp))
        forecast = prophet_model.predict(future)
        pred = forecast["yhat"].iloc[-len(testp):].values
        models["Prophet"] = prophet_model
        results["Prophet"] = evaluate_model(actual, pred)
    except Exception:
        results["Prophet"] = (np.nan, np.nan)

    # LSTM
    try:
        scaler = MinMaxScaler()
        arr = scaler.fit_transform(series.values.reshape(-1, 1))

        def create_ds(a, lag=5):
            X, Y = [], []
            for i in range(lag, len(a)):
                X.append(a[i - lag:i, 0])
                Y.append(a[i, 0])
            return np.array(X), np.array(Y)

        lag = 5
        X, Y = create_ds(arr, lag)
        if len(X) > 0:
            X = X.reshape((X.shape[0], X.shape[1], 1))
            split_l = int(0.8 * len(X))
            X_train, X_test = X[:split_l], X[split_l:]
            y_train, y_test = Y[:split_l], Y[split_l:]

            model_lstm = Sequential([LSTM(50, activation="relu", input_shape=(X_train.shape[1], 1)), Dense(1)])
            model_lstm.compile(optimizer="adam", loss="mse")
            model_lstm.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)

            y_pred = model_lstm.predict(X_test)
            y_pred_inv = scaler.inverse_transform(y_pred)
            y_test_inv = scaler.inverse_transform(y_test.reshape(-1, 1))

            models["LSTM"] = model_lstm
            results["LSTM"] = evaluate_model(y_test_inv.flatten(), y_pred_inv.flatten())
        else:
            results["LSTM"] = (np.nan, np.nan)
    except Exception:
        results["LSTM"] = (np.nan, np.nan)

    # XGBoost
    try:
        df_lag = pd.DataFrame({TARGET: series.values})
        for i in range(1, 4):
            df_lag[f"lag_{i}"] = df_lag[TARGET].shift(i)
        df_lag = df_lag.dropna().reset_index(drop=True)
        if len(df_lag) >= 10:
            X = df_lag[[f"lag_{i}" for i in range(1, 4)]].values
            y = df_lag[TARGET].values
            split_x = int(0.8 * len(X))
            X_train, X_test = X[:split_x], X[split_x:]
            y_train, y_test = y[:split_x], y[split_x:]

            xmodel = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=150, verbosity=0)
            xmodel.fit(X_train, y_train)
            pred = xmodel.predict(X_test)
            models["XGBoost"] = xmodel
            results["XGBoost"] = evaluate_model(y_test, pred)
        else:
            results["XGBoost"] = (np.nan, np.nan)
    except Exception:
        results["XGBoost"] = (np.nan, np.nan)

    return models, results


In [None]:
# 5️⃣ Train All Cities + Overall

summary_results, saved_files = {}, []

# Overall training
try:
    overall_series = data.groupby("Date")[TARGET].mean().dropna()
    print(f"Training Overall series (len={len(overall_series)})")
    models, metrics = train_models_for_series(overall_series)

    if metrics:
        rmse_map = {k: metrics[k][1] for k in metrics}
        valid = {k: v for k, v in rmse_map.items() if not np.isnan(v)}
        chosen = min(valid, key=valid.get) if valid else next(iter(models), None)
        if chosen:
            save_base = os.path.join(MODEL_DIR, "Overall_best_model")
            if chosen == "LSTM":
                models[chosen].save(save_base + ".h5")
            else:
                joblib.dump(models[chosen], save_base + ".pkl")
            saved_files.append(save_base)
        summary_results["Overall"] = metrics
        print("Overall metrics:", metrics)
except Exception as e:
    print("Overall training error:", e)

# Per city
cities = sorted(data["City"].dropna().unique())
print("Cities to train:", len(cities))

for city in cities:
    city_df = data[data["City"] == city].sort_values("Date")
    if len(city_df) < MIN_ROWS_TO_TRAIN:
        print(f"Skipping {city} ({len(city_df)} rows)")
        continue
    print(f"\nTraining for {city}")
    series = city_df[TARGET].reset_index(drop=True)
    models, metrics = train_models_for_series(series)
    summary_results[city] = metrics

    rmse_map = {k: metrics[k][1] for k in metrics if not np.isnan(metrics[k][1])}
    chosen = min(rmse_map, key=rmse_map.get) if rmse_map else next(iter(models), None)

    if chosen:
        save_base = os.path.join(MODEL_DIR, f"{city.replace(' ', '_')}_best_model")
        if chosen == "LSTM":
            models[chosen].save(save_base + ".h5")
        else:
            joblib.dump(models[chosen], save_base + ".pkl")
        saved_files.append(save_base)
        print(f"✅ Saved {chosen} for {city}")


Training Overall series (len=2009)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
Overall metrics: {'ARIMA': (19.001027313759092, 23.432213715771976), 'Prophet': (nan, nan), 'LSTM': (6.167171189379751, 8.815359773742715), 'XGBoost': (5.817822052399624, 8.514096858871405)}
Cities to train: 26

Training for Ahmedabad


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step




✅ Saved LSTM for Ahmedabad

Training for Aizawl


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
✅ Saved ARIMA for Aizawl

Training for Amaravati


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step




✅ Saved LSTM for Amaravati

Training for Amritsar


  super().__init__(**kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step




✅ Saved LSTM for Amritsar

Training for Bengaluru


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step




✅ Saved LSTM for Bengaluru

Training for Bhopal


  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step




✅ Saved LSTM for Bhopal

Training for Brajrajnagar


  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


✅ Saved LSTM for Brajrajnagar

Training for Chandigarh


  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
✅ Saved XGBoost for Chandigarh

Training for Chennai


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


  warn('Non-invertible starting MA parameters found.'


✅ Saved LSTM for Chennai

Training for Coimbatore


  super().__init__(**kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
✅ Saved XGBoost for Coimbatore

Training for Delhi


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step




✅ Saved LSTM for Delhi

Training for Ernakulam


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step
✅ Saved ARIMA for Ernakulam

Training for Gurugram


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step




✅ Saved LSTM for Gurugram

Training for Guwahati


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step




✅ Saved LSTM for Guwahati

Training for Hyderabad


  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step




✅ Saved LSTM for Hyderabad

Training for Jaipur


  super().__init__(**kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


✅ Saved LSTM for Jaipur

Training for Jorapokhar


  super().__init__(**kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step




✅ Saved LSTM for Jorapokhar

Training for Kochi


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
✅ Saved ARIMA for Kochi

Training for Kolkata


  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
✅ Saved XGBoost for Kolkata

Training for Lucknow


  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step




✅ Saved LSTM for Lucknow

Training for Mumbai


  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step




✅ Saved LSTM for Mumbai

Training for Patna


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step




✅ Saved LSTM for Patna

Training for Shillong


  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step




✅ Saved LSTM for Shillong

Training for Talcher


  super().__init__(**kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
✅ Saved XGBoost for Talcher

Training for Thiruvananthapuram


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step




✅ Saved LSTM for Thiruvananthapuram

Training for Visakhapatnam


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  super().__init__(**kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step




✅ Saved LSTM for Visakhapatnam


In [None]:
# 6️⃣ Zip and Download Models

print("\n✅ Training finished. Total models saved:", len(saved_files))

zip_path = "/content/models.zip"
with zipfile.ZipFile(zip_path, "w") as z:
    for f in saved_files:
        for ext in [".pkl", ".h5"]:
            if os.path.exists(f + ext):
                z.write(f + ext, arcname=os.path.basename(f + ext))

print("\nZipped models saved to:", zip_path)
print("Listing files in /content/models/:")
for fname in sorted(os.listdir(MODEL_DIR)):
    print(" -", fname)

try:
    from google.colab import files
    files.download(zip_path)
except Exception:
    print("If not in Colab, download manually from /content/models.zip")



✅ Training finished. Total models saved: 27

Zipped models saved to: /content/models.zip
Listing files in /content/models/:
 - Ahmedabad_best_model.h5
 - Aizawl_best_model.pkl
 - Amaravati_best_model.h5
 - Amritsar_best_model.h5
 - Amritsar_best_model.pkl
 - Bengaluru_best_model.h5
 - Bhopal_best_model.h5
 - Bhopal_best_model.pkl
 - Brajrajnagar_best_model.h5
 - Chandigarh_best_model.h5
 - Chandigarh_best_model.pkl
 - Chennai_best_model.h5
 - Coimbatore_best_model.pkl
 - Delhi_best_model.h5
 - Ernakulam_best_model.pkl
 - Gurugram_best_model.h5
 - Guwahati_best_model.h5
 - Hyderabad_best_model.h5
 - Hyderabad_best_model.pkl
 - Jaipur_best_model.h5
 - Jorapokhar_best_model.h5
 - Kochi_best_model.pkl
 - Kolkata_best_model.pkl
 - Lucknow_best_model.h5
 - Mumbai_best_model.h5
 - Overall_best_model.h5
 - Overall_best_model.pkl
 - Patna_best_model.h5
 - Shillong_best_model.h5
 - Talcher_best_model.h5
 - Talcher_best_model.pkl
 - Thiruvananthapuram_best_model.h5
 - Visakhapatnam_best_model.h5

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>