In [1]:
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Keep the notebook output readable by silencing library warnings.
# NOTE(review): this hides *all* warnings (including deprecations) — consider a narrower filter.
warnings.filterwarnings("ignore")

# Seed handed to every RandomForestClassifier as `random_state`.
RANDOM_STATE = 42
# Path tried first by load_csv_interactive() before it falls back to an upload.
DEFAULT_PATH = '/mnt/data/final_dataset.csv'

def load_csv_interactive(default_path=DEFAULT_PATH):
    """Load the dataset from ``default_path`` or, failing that, from a Colab upload.

    Parameters
    ----------
    default_path : str
        CSV path that is tried first.

    Returns
    -------
    tuple of (pandas.DataFrame, str)
        The parsed frame and the path / file name it was read from.

    Raises
    ------
    FileNotFoundError
        If ``default_path`` does not exist and ``google.colab`` cannot be
        imported, so no interactive upload is possible.
    ValueError
        If the upload dialog returns no file.
    """
    if os.path.exists(default_path):
        print(f"Found file at {default_path} — loading...")
        return pd.read_csv(default_path), default_path

    # Imported lazily: the module is only importable inside a Colab runtime.
    try:
        from google.colab import files
    except ImportError as exc:
        raise FileNotFoundError(
            f"{default_path} not found and google.colab is unavailable "
            "for an interactive upload."
        ) from exc

    uploaded = files.upload()
    # An empty mapping would otherwise surface as an opaque IndexError.
    if not uploaded:
        raise ValueError("No file was uploaded.")

    # Only the first uploaded file is used, as before.
    fname = next(iter(uploaded))
    return pd.read_csv(fname), fname

# Read the raw data and report where it came from.
df_raw, data_source = load_csv_interactive()
print("Source:", data_source)
print("Shape:", df_raw.shape)
display(df_raw.head())

# Assemble a timestamp from the Year / Month / Date columns (Date is passed as
# the day component), attach it to a fresh copy of the raw frame and order the
# rows chronologically with a clean 0..n-1 index.
date_parts = df_raw[["Year", "Month", "Date"]].rename(
    columns={"Year": "year", "Month": "month", "Date": "day"}
)
df_model = (
    df_raw.assign(Datetime=pd.to_datetime(date_parts))
    .sort_values("Datetime")
    .reset_index(drop=True)
)
display(df_model.head())

def categorize_aqi(aqi):
    """Map a numeric AQI value to its category label.

    Upper bounds are inclusive: ``<= 50`` "Good", ``<= 100`` "Satisfactory",
    ``<= 200`` "Moderate", ``<= 300`` "Poor", ``<= 400`` "Very Poor" and
    anything above 400 "Severe".

    Parameters
    ----------
    aqi : scalar number
        The AQI reading to classify.

    Returns
    -------
    str or None
        The category label, or ``None`` when the reading is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # reading would fall through to the final branch and be labelled "Severe".
    if pd.isna(aqi):
        return None

    bands = (
        (50, "Good"),
        (100, "Satisfactory"),
        (200, "Moderate"),
        (300, "Poor"),
        (400, "Very Poor"),
    )
    for upper_bound, label in bands:
        if aqi <= upper_bound:
            return label
    return "Severe"

# Label every row with the category of its AQI value.
df_model["AQI_Category"] = df_model["AQI"].apply(categorize_aqi)
# A bare expression that is not the last statement of a cell is evaluated and
# thrown away, so the preview has to go through display() to be rendered.
display(df_model[["Datetime", "AQI", "AQI_Category"]].head(10))


pollutants = ["PM2.5", "PM10", "NO2", "SO2", "CO", "Ozone"]

# Lagged readings: the value found 1, 2 and 7 rows earlier for each pollutant.
# NOTE(review): shifting by rows equals shifting by days only if the sorted
# frame has exactly one row per consecutive day — confirm there are no gaps.
for col in pollutants:
    for lag in (1, 2, 7):
        df_model[f"{col}_lag{lag}"] = df_model[col].shift(lag)

# Trailing means over 3 and 7 rows; each window ends on (and includes) the
# current row.
for col in pollutants:
    for window in (3, 7):
        df_model[f"{col}_roll{window}"] = df_model[col].rolling(window).mean()

# shift() and rolling() leave NaN in the leading rows; dropna() removes every
# row that has a missing value in any column.
df_model = df_model.dropna().reset_index(drop=True)
display(df_model.head())

def month_to_season(m):
    """Encode a month number as a season index.

    Returns 0 for months 12, 1, 2; 1 for months 3-5; 2 for months 6-9; and 3
    for any other value (months 10 and 11 among valid month numbers).
    """
    season_table = (
        ((12, 1, 2), 0),
        ((3, 4, 5), 1),
        ((6, 7, 8, 9), 2),
    )
    for months, season_code in season_table:
        if m in months:
            return season_code
    # Anything not matched above falls into the last bucket.
    return 3

# Season index derived from the month number.
df_model["Season"] = df_model["Month"].apply(month_to_season)

# Cyclical month encoding: place the month on a 12-step circle so that
# December and January end up next to each other.
month_angle = 2 * np.pi * df_model["Month"] / 12.0
df_model["Month_sin"] = np.sin(month_angle)
df_model["Month_cos"] = np.cos(month_angle)

# 1 when Days is 6 or higher, else 0.
# NOTE(review): assumes Days is a 1-7 day-of-week code with 6/7 = Sat/Sun — confirm.
df_model["Is_Weekend"] = df_model["Days"].ge(6).astype(int)

preview_cols = ["Datetime", "Month", "Season", "Month_sin", "Month_cos", "Is_Weekend"]
display(df_model[preview_cols].head())


# Columns withheld from the model: the target label, the AQI value it is
# derived from, the Datetime stamp and the raw Date / Month / Year fields.
exclude_cols = ["AQI_Category", "Datetime", "Date", "Month", "Year", "AQI"]

# Every remaining column of df_model becomes a feature, in column order.
# NOTE(review): this keeps the same-day pollutant readings as predictors of the
# same-day AQI category — confirm that is intended for a "forecast" model.
feature_cols = [c for c in df_model.columns if c not in exclude_cols]

print("Total Features Used:", len(feature_cols))

X = df_model[feature_cols].values
y = df_model["AQI_Category"].values

# Chronological split by calendar year: <= 2022 train, 2023 validation, 2024 test.
train_mask = df_model["Year"] <= 2022
val_mask   = df_model["Year"] == 2023
test_mask  = df_model["Year"] == 2024

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val     = X[val_mask], y[val_mask]
X_test, y_test   = X[test_mask], y[test_mask]

# A bare tuple in the middle of a cell is never rendered, so print the split
# sizes explicitly.
print("Train / Val / Test shapes:", X_train.shape, X_val.shape, X_test.shape)

# Exhaustive search over a small random-forest grid, scored by accuracy on the
# validation split.
# Start below any reachable accuracy: with the old 0.0 start and the strict ">"
# test, a run in which every candidate scored 0 left best_params as None and
# the final fit crashed when indexing it.
best_acc = float("-inf")
best_params = None

n_estimators_list = [300, 500, 800]
max_depth_list = [None, 12, 20]
min_samples_leaf_list = [1, 2, 4]

for n in n_estimators_list:
    for d in max_depth_list:
        for leaf in min_samples_leaf_list:
            rf = RandomForestClassifier(
                n_estimators=n,
                max_depth=d,
                min_samples_leaf=leaf,
                n_jobs=-1,
                class_weight="balanced",
                random_state=RANDOM_STATE
            )
            rf.fit(X_train, y_train)
            y_val_pred = rf.predict(X_val)
            acc = accuracy_score(y_val, y_val_pred)

            # Strict ">" keeps the first combination among ties.
            if acc > best_acc:
                best_acc = acc
                best_params = {"n_estimators": n, "max_depth": d, "min_samples_leaf": leaf}

# The bare `best_acc, best_params` tuple was discarded mid-cell; print it.
print("Best validation accuracy:", best_acc)
print("Best parameters:", best_params)



# Refit on every row up to and including 2023 (training + validation years)
# using the hyper-parameters selected by the grid search.
trainval_mask = df_model["Year"] <= 2023
X_trainval, y_trainval = X[trainval_mask], y[trainval_mask]

# best_params holds exactly the n_estimators / max_depth / min_samples_leaf
# keys, so it can be unpacked straight into the constructor.
rf_final = RandomForestClassifier(
    **best_params,
    n_jobs=-1,
    class_weight="balanced",
    random_state=RANDOM_STATE,
)
rf_final.fit(X_trainval, y_trainval)

# Score the held-out 2024 rows.
y_test_pred = rf_final.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

print("True Forecast Accuracy (2024):", test_acc)

# No `labels` argument is given, so the matrix rows/columns follow the same
# label order as the classification report printed below it.
print("\nCONFUSION MATRIX:")
print(confusion_matrix(y_test, y_test_pred))

print("\nCLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred))

# Keep only the days in the two worst categories.
event_df = df_model[df_model["AQI_Category"].isin(["Very Poor", "Severe"])]

# Number of such days per (Year, AQI_Category) pair; the count column is
# named "Days".
event_summary = (
    event_df.groupby(["Year","AQI_Category"])
    .size()
    .reset_index(name="Days")
)

# A bare name in the middle of a cell renders nothing; display() shows the table.
display(event_summary)

# Attach the model's predictions to the 2024 test rows.
test_analysis_df = df_model[test_mask].copy()
test_analysis_df["Predicted"] = y_test_pred

# Restrict to days whose true category is one of the two worst.
extreme_test = test_analysis_df[
    test_analysis_df["AQI_Category"].isin(["Very Poor","Severe"])
]

# confusion_matrix(..., labels=["Very Poor", "Severe"]) silently drops every
# sample whose *prediction* is outside those two labels, so extreme days that
# the model called e.g. "Poor" disappeared from the table (and the result was
# discarded mid-cell anyway). A cross-tabulation keeps one column per predicted
# category, which makes the missed extreme days visible.
extreme_confusion = pd.crosstab(
    extreme_test["AQI_Category"],
    extreme_test["Predicted"],
    rownames=["Actual"],
    colnames=["Predicted"],
)
display(extreme_confusion)


# Feed the last row of the feature matrix to the final model as a 1-row batch.
# NOTE(review): X[-1] holds the features of the last day already in df_model
# (same-day pollutant readings included) and the model was fitted against the
# same-row category, so this looks like a re-prediction of that day rather than
# a forecast for the following day — confirm the intent.
last_features = X[-1][np.newaxis, :]
tomorrow_pred_rf = rf_final.predict(last_features)[0]
tomorrow_pred_rf


Saving final_dataset.csv to final_dataset.csv
Source: final_dataset.csv
Shape: (1461, 12)


Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263
3,4,1,2021,0,1,89.55,132.08,153.98,10.42,1.01,49.19,207
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149


Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,Datetime
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462,2021-01-01
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482,2021-01-02
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263,2021-01-03
3,4,1,2021,0,1,89.55,132.08,153.98,10.42,1.01,49.19,207,2021-01-04
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149,2021-01-05


Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,...,PM10_roll3,PM10_roll7,NO2_roll3,NO2_roll7,SO2_roll3,SO2_roll7,CO_roll3,CO_roll7,Ozone_roll3,Ozone_roll7
0,8,1,2021,0,5,140.05,184.29,102.61,10.34,0.79,...,165.203333,212.031429,114.73,120.661429,10.146667,9.524286,0.9,1.192857,45.376667,42.131429
1,9,1,2021,0,6,144.01,192.43,108.43,10.41,0.85,...,169.3,159.242857,103.303333,128.601429,10.186667,10.271429,0.79,0.942857,44.63,45.817143
2,10,1,2021,1,7,131.57,180.38,93.31,9.92,0.69,...,185.7,150.862857,101.45,117.51,10.223333,10.127143,0.776667,0.841429,45.75,46.912857
3,11,1,2021,0,1,135.92,208.99,105.07,10.49,0.75,...,193.933333,161.85,102.27,110.522857,10.273333,10.137143,0.763333,0.804286,49.75,47.751429
4,12,1,2021,0,2,172.48,220.74,111.52,10.21,0.91,...,203.37,185.45,103.3,108.931429,10.206667,10.21,0.783333,0.842857,49.57,46.724286


Unnamed: 0,Datetime,Month,Season,Month_sin,Month_cos,Is_Weekend
0,2021-01-08,1,0,0.5,0.866025,0
1,2021-01-09,1,0,0.5,0.866025,1
2,2021-01-10,1,0,0.5,0.866025,1
3,2021-01-11,1,0,0.5,0.866025,0
4,2021-01-12,1,0,0.5,0.866025,0


Total Features Used: 42
True Forecast Accuracy (2024): 0.7896174863387978

CONFUSION MATRIX:
[[ 8  0  0  1  0  0]
 [ 0 99 18  7  0  1]
 [ 0  7 96  0  0  2]
 [11  3  0 55  0  0]
 [ 0  0  0  0  3  5]
 [ 0  0 20  0  2 28]]

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

        Good       0.42      0.89      0.57         9
    Moderate       0.91      0.79      0.85       125
        Poor       0.72      0.91      0.80       105
Satisfactory       0.87      0.80      0.83        69
      Severe       0.60      0.38      0.46         8
   Very Poor       0.78      0.56      0.65        50

    accuracy                           0.79       366
   macro avg       0.72      0.72      0.69       366
weighted avg       0.81      0.79      0.79       366



'Poor'