In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import os
import joblib
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import warnings

# Install a catch-all "ignore" filter: no category, message or module is
# given, so warnings of every kind are suppressed from this point on.
# NOTE(review): this also hides deprecation and data-quality warnings raised
# by the libraries used below — consider narrowing it to a specific category.
warnings.filterwarnings("ignore")

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
def classify_quirky_aqi_category(pm25_value):
    """Map a PM2.5 reading onto one of six playful air-quality labels.

    Bands are checked in ascending order and every upper bound is inclusive,
    so a reading of exactly 30 still belongs to the first band.

    Args:
        pm25_value: Numeric PM2.5 reading (the unit is not stated in this
            notebook).

    Returns:
        str: The label of the first band whose upper bound is greater than or
        equal to ``pm25_value``; the "Code Red" label when the reading
        exceeds every bound.

    Note:
        NaN compares False against every bound, so a missing reading also
        lands in the final "Code Red" band. Negative readings fall into the
        first band.
    """
    # (inclusive upper bound, label) pairs, lowest band first.
    banded_labels = (
        (30, "Crystal Clear Skies ☀️"),
        (60, "Light Haze ☁️"),
        (90, "Urban Fog 🏙️"),
        (120, "Smog Alert 🏭"),
        (250, "Pea Soup Air 🍲"),
    )
    for ceiling, label in banded_labels:
        if pm25_value <= ceiling:
            return label
    # Nothing matched: the reading is above the highest bound (or is NaN).
    return "Code Red Atmosphere 🚨"

In [3]:
# Load the feature-engineered table. The 'time' column becomes the index and
# parse_dates=True asks pandas to parse that index as dates, which the
# date-based train/test split further down relies on.
df = pd.read_csv(
    "Featured_Engineered_Dataset.csv",
    parse_dates=True,
    index_col='time',
)
# Trailing expression: the notebook renders the first five rows.
df.head()

Unnamed: 0_level_0,pm2_5,temperature_2m,relativehumidity_2m,precipitation,windspeed_10m,hour,day_of_week,month,pm2_5_lag_1h,pm2_5_lag_24h,pm2_5_lag_2h,pm2_5_lag_3h,pm2_5_lag_6h,pm2_5_lag_12h,pm2_5_lag_48h,city_chennai,city_delhi,city_mumbai
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2022-01-03 00:00:00,17.6,23.0,71,0.0,8.3,0,0,1,17.6,17.6,17.6,17.6,17.6,17.6,17.6,False,False,True
2022-01-03 01:00:00,17.6,22.2,75,0.0,6.0,1,0,1,17.6,17.6,17.6,17.6,17.6,17.6,17.6,False,False,True
2022-01-03 02:00:00,17.6,21.6,79,0.0,5.2,2,0,1,17.6,17.6,17.6,17.6,17.6,17.6,17.6,False,False,True
2022-01-03 03:00:00,17.6,21.5,79,0.0,6.1,3,0,1,17.6,17.6,17.6,17.6,17.6,17.6,17.6,False,False,True
2022-01-03 04:00:00,17.6,21.6,78,0.0,8.1,4,0,1,17.6,17.6,17.6,17.6,17.6,17.6,17.6,False,False,True


In [4]:
# Build the classification target: label every PM2.5 reading with its band.
df['aqi_category'] = df['pm2_5'].apply(classify_quirky_aqi_category)

# Learn the label -> integer code assignment, then encode the column with it.
label_encoder = LabelEncoder().fit(df['aqi_category'])
df['aqi_category_encoded'] = label_encoder.transform(df['aqi_category'])

# Human-readable lookup of label -> code, built from the fitted classes.
quirky_category_mapping = {
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("Quirky Category Mapping:", quirky_category_mapping)

Quirky Category Mapping: {'Code Red Atmosphere 🚨': 0, 'Crystal Clear Skies ☀️': 1, 'Light Haze ☁️': 2, 'Pea Soup Air 🍲': 3, 'Smog Alert 🏭': 4, 'Urban Fog 🏙️': 5}


In [5]:
# Target: the integer-encoded category.
y = df['aqi_category_encoded']
# Features: every remaining column. The raw PM2.5 reading is dropped together
# with the two columns derived from it, since the target is computed from it.
X = df.drop(columns=['pm2_5', 'aqi_category', 'aqi_category_encoded'])

In [6]:
# Chronological hold-out: rows strictly before split_date are used for
# training, rows on or after it for testing.
split_date = '2025-01-01'
X_train, X_test = X.loc[X.index < split_date], X.loc[X.index >= split_date]
y_train, y_test = y.loc[y.index < split_date], y.loc[y.index >= split_date]
print(f"Train set: {X_train.shape[0]}, Test set: {X_test.shape[0]}")

Train set: 105024, Test set: 28512


In [7]:
# Random-forest classifier fitted on the training window only.
# class_weight='balanced' is requested because the categories are far from
# evenly represented; random_state is pinned so the fit is repeatable.
model_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=5,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
model_clf.fit(X_train, y_train)

In [8]:
# Predict integer category codes for the hold-out period.
y_pred = model_clf.predict(X_test)

# Translate codes back into readable labels so the report rows are named.
y_test_category = label_encoder.inverse_transform(y_test)
y_pred_category = label_encoder.inverse_transform(y_pred)

# Overall accuracy is computed on the integer codes.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Set Accuracy: {accuracy*100:.2f}%")

# Per-category precision / recall / F1, computed on the readable labels.
report = classification_report(
    y_true=y_test_category,
    y_pred=y_pred_category,
    zero_division=0,
)
print(report)

Test Set Accuracy: 93.29%
                        precision    recall  f1-score   support

 Code Red Atmosphere 🚨       0.76      0.72      0.74        36
Crystal Clear Skies ☀️       0.98      0.97      0.98     16395
         Light Haze ☁️       0.91      0.89      0.90      7069
        Pea Soup Air 🍲       0.88      0.86      0.87       665
          Smog Alert 🏭       0.76      0.86      0.81      1232
          Urban Fog 🏙️       0.82      0.88      0.85      3115

              accuracy                           0.93     28512
             macro avg       0.85      0.87      0.86     28512
          weighted avg       0.94      0.93      0.93     28512



In [9]:
# Output file names for the persisted artefacts (relative paths).
MODEL_CLF_FILE = 'classification_model.joblib'
MAPPING_FILE = 'quirky_category_mapping.joblib'

# Persist the fitted classifier.
joblib.dump(value=model_clf, filename=MODEL_CLF_FILE)
# Persist the label -> code dictionary. Kept as the trailing expression so the
# notebook displays the list of file names that joblib returns.
joblib.dump(value=quirky_category_mapping, filename=MAPPING_FILE)

['quirky_category_mapping.joblib']