In [None]:
import os
import sys
import warnings
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown


In [None]:
DEFAULT_PATH = '/mnt/data/final_dataset.csv'

def load_csv_interactive(default_path=DEFAULT_PATH):
    """Load the dataset CSV from ``default_path`` or, failing that, via a Colab upload.

    Parameters
    ----------
    default_path : str
        Location tried first. If the file exists but cannot be parsed, the
        failure is printed and the function falls back to the upload prompt.

    Returns
    -------
    tuple
        ``(DataFrame, source)`` where ``source`` is ``default_path`` or the
        name of the uploaded file.

    Raises
    ------
    RuntimeError
        If neither the default path nor the Colab upload yields a readable
        CSV (including when ``google.colab`` cannot be imported). The original
        exception is chained as ``__cause__``.
    """
    if os.path.exists(default_path):
        print(f"Found file at {default_path} — loading...")
        try:
            df = pd.read_csv(default_path)
        except Exception as e:
            # Deliberate best-effort: report the problem and fall through to the upload path.
            print(f"Failed to load {default_path}: {e}")
        else:
            print("Loaded successfully from default path.")
            return df, default_path
    try:
        # Imported lazily so the function can still be defined and used outside Colab.
        from google.colab import files
        print("Please upload your Delhi Air Quality CSV file using the file picker...")
        uploaded = files.upload()
        if not uploaded:
            raise FileNotFoundError('No file uploaded')
        # Only the first uploaded file is read.
        fname = next(iter(uploaded))
        df = pd.read_csv(fname)
        print(f"Loaded '{fname}' successfully.")
        return df, fname
    except Exception as e:
        # Report the path that was actually tried, not the hard-coded default.
        raise RuntimeError(
            f'Could not load file — either place it at {default_path} or use Colab file upload.'
        ) from e

# Load the dataset; on failure, print a short message and re-raise so the
# notebook stops here with the full traceback.
try:
    df_raw, data_source = load_csv_interactive()
except Exception as load_error:
    print(f'Error loading data: {load_error}')
    raise

# Quick summary of what was loaded (column list capped at the first 50 names).
print(f'\nData source: {data_source}')
print(f'Shape: {df_raw.shape}')
print(f'Columns: {list(df_raw.columns)[:50]}')
print('\nSample rows:')
display(df_raw.head())


Please upload your Delhi Air Quality CSV file using the file picker...


Saving final_dataset.csv to final_dataset.csv
Loaded 'final_dataset.csv' successfully.

Data source: final_dataset.csv
Shape: (1461, 12)
Columns: ['Date', 'Month', 'Year', 'Holidays_Count', 'Days', 'PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'Ozone', 'AQI']

Sample rows:


Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263
3,4,1,2021,0,1,89.55,132.08,153.98,10.42,1.01,49.19,207
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Build the modelling frame from the raw data: add a Datetime column assembled
# from the Year / Month / Date (day-of-month) columns, then order rows
# chronologically with a fresh 0..n-1 index. df_raw itself is not modified.
df_model = (
    df_raw
    .assign(
        Datetime=lambda frame: pd.to_datetime(
            {"year": frame["Year"], "month": frame["Month"], "day": frame["Date"]}
        )
    )
    .sort_values("Datetime")
    .reset_index(drop=True)
)

df_model.head()

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,Ozone,AQI,Datetime
0,1,1,2021,0,5,408.8,442.42,160.61,12.95,2.77,43.19,462,2021-01-01
1,2,1,2021,0,6,404.04,561.95,52.85,5.18,2.6,16.43,482,2021-01-02
2,3,1,2021,1,7,225.07,239.04,170.95,10.93,1.4,44.29,263,2021-01-03
3,4,1,2021,0,1,89.55,132.08,153.98,10.42,1.01,49.19,207,2021-01-04
4,5,1,2021,0,2,54.06,55.54,122.66,9.7,0.64,48.88,149,2021-01-05


In [None]:

def categorize_aqi(aqi):
    """Map a numeric AQI value to its category label.

    Bands use inclusive upper bounds: <=50 "Good", <=100 "Satisfactory",
    <=200 "Moderate", <=300 "Poor", <=400 "Very Poor", anything above "Severe".

    Parameters
    ----------
    aqi : scalar number, or a missing value (None / NaN / pd.NA)

    Returns
    -------
    str or None
        The category label, or ``None`` when ``aqi`` is missing. Without this
        guard a NaN fails every ``<=`` comparison and would be labelled
        "Severe"; returning ``None`` lets pandas treat the label as missing.
    """
    if pd.isna(aqi):
        return None

    # (inclusive upper bound, label), in ascending order.
    bands = (
        (50, "Good"),
        (100, "Satisfactory"),
        (200, "Moderate"),
        (300, "Poor"),
        (400, "Very Poor"),
    )
    for upper_bound, label in bands:
        if aqi <= upper_bound:
            return label
    return "Severe"

# Label every row with the category of its own (same-day) AQI value.
df_model["AQI_Category"] = df_model["AQI"].map(categorize_aqi)

# Spot-check the mapping on the first ten rows.
df_model.loc[:, ["AQI", "AQI_Category"]].head(10)

Unnamed: 0,AQI,AQI_Category
0,462,Severe
1,482,Severe
2,263,Poor
3,207,Poor
4,149,Moderate
5,252,Poor
6,288,Poor
7,248,Poor
8,326,Very Poor
9,284,Poor


In [None]:
pollutants = ["PM2.5", "PM10", "NO2", "SO2", "CO", "Ozone", "AQI"]

# Lagged copies of each series: the value 1, 2 and 7 rows earlier.
# Rows are already in chronological order, so a row offset is a day offset
# only if the data has no missing dates.
for col in pollutants:
    series = df_model[col]
    for offset in (1, 2, 7):
        df_model[f"{col}_lag{offset}"] = series.shift(offset)

# Trailing means over 3 and 7 rows. rolling(w) ends at the current row, so
# each mean includes the current row's own value.
for col in pollutants:
    series = df_model[col]
    for window in (3, 7):
        df_model[f"{col}_roll{window}"] = series.rolling(window).mean()

# shift/rolling leave NaNs in the leading rows; drop every row that has a
# missing value in any column and renumber the index.
df_model = df_model.dropna().reset_index(drop=True)

df_model.head()

Unnamed: 0,Date,Month,Year,Holidays_Count,Days,PM2.5,PM10,NO2,SO2,CO,...,NO2_roll3,NO2_roll7,SO2_roll3,SO2_roll7,CO_roll3,CO_roll7,Ozone_roll3,Ozone_roll7,AQI_roll3,AQI_roll7
0,8,1,2021,0,5,140.05,184.29,102.61,10.34,0.79,...,114.73,120.661429,10.146667,9.524286,0.9,1.192857,45.376667,42.131429,262.666667,269.857143
1,9,1,2021,0,6,144.01,192.43,108.43,10.41,0.85,...,103.303333,128.601429,10.186667,10.271429,0.79,0.942857,44.63,45.817143,287.333333,247.571429
2,10,1,2021,1,7,131.57,180.38,93.31,9.92,0.69,...,101.45,117.51,10.223333,10.127143,0.776667,0.841429,45.75,46.912857,286.0,250.571429
3,11,1,2021,0,1,135.92,208.99,105.07,10.49,0.75,...,102.27,110.522857,10.273333,10.137143,0.763333,0.804286,49.75,47.751429,297.0,261.142857
4,12,1,2021,0,2,172.48,220.74,111.52,10.21,0.91,...,103.3,108.931429,10.206667,10.21,0.783333,0.842857,49.57,46.724286,298.0,286.857143


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Grid search over the number of PCA components and the number of neighbours.
#
# Two leakage problems are avoided here:
#   * the scaler and PCA are fitted on training-period rows only, never on the
#     2024 test year;
#   * (n_components, k) are chosen on a validation year held out from the
#     training period, not on the test set, so the later test score stays an
#     honest estimate.
exclude_cols = ["AQI_Category", "Datetime", "Date", "Month", "Year"]
feature_cols = [c for c in df_model.columns if c not in exclude_cols]
# NOTE(review): feature_cols still contains "AQI" and the AQI lag/rolling
# columns, while the label AQI_Category is computed from the same row's AQI.
# Confirm that feeding the label's source column to the model is intended.

X = df_model[feature_cols].values
y = df_model["AQI_Category"].values

train_mask = df_model["Year"] <= 2023
test_mask  = df_model["Year"] == 2024

# Split the training period chronologically: the most recent training year is
# the validation set, everything before it is used for fitting.
years = df_model["Year"].to_numpy()
train_rows = train_mask.to_numpy()
if not train_rows.any():
    raise ValueError("No training rows found (Year <= 2023).")
val_year = years[train_rows].max()
fit_rows = train_rows & (years < val_year)
val_rows = train_rows & (years == val_year)
if not fit_rows.any():
    raise ValueError("Need at least two distinct training years to hold one out for validation.")

# Scaling statistics come from the fitting rows only, then are applied to all rows.
scaler = StandardScaler().fit(X[fit_rows])
X_scaled = scaler.transform(X)

best_acc = 0.0   # best *validation* accuracy seen so far
best_k = None
best_pc = None

for pc in range(5, 21):
    pca_temp = PCA(n_components=pc).fit(X_scaled[fit_rows])
    X_fit_temp = pca_temp.transform(X_scaled[fit_rows])
    X_val_temp = pca_temp.transform(X_scaled[val_rows])

    for k in [3, 5, 7, 9, 11, 13, 15]:
        knn_temp = KNeighborsClassifier(n_neighbors=k)
        knn_temp.fit(X_fit_temp, y[fit_rows])

        y_val_pred = knn_temp.predict(X_val_temp)
        acc = accuracy_score(y[val_rows], y_val_pred)

        # Strict '>' keeps the first (smallest pc, then smallest k) setting on ties.
        if acc > best_acc:
            best_acc = acc
            best_k = k
            best_pc = pc

(best_acc, best_pc, best_k)


(0.6830601092896175, 11, 11)

In [None]:
from sklearn.decomposition import PCA

# Final preprocessing with the selected number of components (best_pc).
# The scaler and PCA are fitted on the training years only and then applied to
# every row, so no statistics from the 2024 test year leak into the model.
exclude_cols = ["AQI_Category", "Datetime", "Date", "Month", "Year"]
feature_cols = [c for c in df_model.columns if c not in exclude_cols]
# NOTE(review): "AQI" and AQI-derived columns remain in feature_cols although
# AQI_Category is computed from the same row's AQI. Confirm this is intended.

X = df_model[feature_cols].values
y = df_model["AQI_Category"].values

train_mask = df_model["Year"] <= 2023
test_mask  = df_model["Year"] == 2024

# Plain boolean arrays for indexing the NumPy matrices below.
train_rows = train_mask.to_numpy()
test_rows = test_mask.to_numpy()

scaler = StandardScaler().fit(X[train_rows])
X_scaled = scaler.transform(X)

pca = PCA(n_components=best_pc).fit(X_scaled[train_rows])
# X_pca still has one row per row of df_model (train and test alike).
X_pca = pca.transform(X_scaled)

X_train, y_train = X_pca[train_rows], y[train_rows]
X_test, y_test = X_pca[test_rows], y[test_rows]

len(X_train), len(X_test)


(1088, 366)

In [None]:
# Train the final classifier with the selected neighbour count; fit() returns
# the estimator itself, so construction and fitting can be chained.
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)

# Predicted category for every test row.
y_pred = knn.predict(X_test)

# First ten (actual, predicted) pairs as a quick sanity check.
[(actual, predicted) for actual, predicted in zip(y_test[:10], y_pred[:10])]

[('Poor', 'Poor'),
 ('Poor', 'Poor'),
 ('Poor', 'Poor'),
 ('Very Poor', 'Poor'),
 ('Poor', 'Poor'),
 ('Poor', 'Poor'),
 ('Very Poor', 'Poor'),
 ('Very Poor', 'Poor'),
 ('Poor', 'Poor'),
 ('Moderate', 'Poor')]

In [None]:
# Print each evaluation section as a heading followed by the metric computed on
# the test labels and predictions. No explicit label list is passed to either
# metric function.
report_sections = (
    ("CONFUSION MATRIX:", confusion_matrix),
    ("\nCLASSIFICATION REPORT:", classification_report),
)
for heading, metric_fn in report_sections:
    print(heading)
    print(metric_fn(y_test, y_pred))

CONFUSION MATRIX:
[[ 1  0  0  8  0  0]
 [ 0 89 17 17  0  2]
 [ 0 27 78  0  0  0]
 [ 0  2  1 66  0  0]
 [ 0  0  2  0  1  5]
 [ 0  1 34  0  0 15]]

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

        Good       1.00      0.11      0.20         9
    Moderate       0.75      0.71      0.73       125
        Poor       0.59      0.74      0.66       105
Satisfactory       0.73      0.96      0.82        69
      Severe       1.00      0.12      0.22         8
   Very Poor       0.68      0.30      0.42        50

    accuracy                           0.68       366
   macro avg       0.79      0.49      0.51       366
weighted avg       0.70      0.68      0.66       366



In [None]:
# Take the final row of the PCA-transformed matrix as a single-sample 2-D input.
# NOTE(review): the classifier was fitted on each row's features against that
# same row's AQI_Category, so this is the predicted category for the last
# row's own date. Confirm whether a next-day (shifted) target was intended
# before presenting this as a forecast for tomorrow.
last_row = X_pca[-1][None, :]

# predict() returns one label per input row; unpack the single result.
(tomorrow_pred,) = knn.predict(last_row)
tomorrow_pred

'Poor'