In [1]:
import pandas as pd

In [2]:
# Load the raw pollution readings from the repository-relative CSV file.
df = pd.read_csv(filepath_or_buffer="data/dataset_pollution.csv")

In [3]:
# Preview the first ten rows as a quick sanity check on the load.
df.head(10)

Unnamed: 0,country,state,city,station,last_update,latitude,longitude,pollutant_id,pollutant_min,pollutant_max,pollutant_avg
0,India,Bihar,Motihari,"Gandak Colony, Motihari - BSPCB",19-01-2026 08:00,26.63086,84.90051,SO2,3.0,3.0,3.0
1,India,Bihar,Motihari,"Gandak Colony, Motihari - BSPCB",19-01-2026 08:00,26.63086,84.90051,OZONE,3.0,9.0,4.0
2,India,Bihar,Munger,"Town Hall, Munger - BSPCB",19-01-2026 08:00,25.376776,86.471523,PM2.5,30.0,120.0,74.0
3,India,Bihar,Munger,"Town Hall, Munger - BSPCB",19-01-2026 08:00,25.376776,86.471523,NH3,4.0,8.0,6.0
4,India,Bihar,Munger,"Town Hall, Munger - BSPCB",19-01-2026 08:00,25.376776,86.471523,OZONE,13.0,30.0,17.0
5,India,Bihar,Muzaffarpur,"Buddha Colony, Muzaffarpur - BSPCB",19-01-2026 08:00,26.11442,85.39813,PM2.5,76.0,300.0,179.0
6,India,Bihar,Muzaffarpur,"Buddha Colony, Muzaffarpur - BSPCB",19-01-2026 08:00,26.11442,85.39813,SO2,3.0,6.0,5.0
7,India,Bihar,Muzaffarpur,"Buddha Colony, Muzaffarpur - BSPCB",19-01-2026 08:00,26.11442,85.39813,CO,28.0,139.0,48.0
8,India,Bihar,Patna,"DRM Office Danapur, Patna - BSPCB",19-01-2026 08:00,25.586562,85.043586,PM2.5,71.0,330.0,185.0
9,India,Bihar,Patna,"DRM Office Danapur, Patna - BSPCB",19-01-2026 08:00,25.586562,85.043586,PM10,91.0,414.0,180.0


In [4]:
# List the column labels available in the loaded frame.
df.columns

Index(['country', 'state', 'city', 'station', 'last_update', 'latitude',
       'longitude', 'pollutant_id', 'pollutant_min', 'pollutant_max',
       'pollutant_avg'],
      dtype='object')

In [5]:
# Bucket the readings by city; the actual aggregation is done in a later cell.
grouped_city = df.groupby(by="city")

In [6]:
# Add up every pollutant_avg reading per city. Rows for all stations and all
# pollutant types of a city are pooled into one total.
# NOTE(review): a pooled sum grows with the number of stations/pollutants a city
# reports, so totals are not directly comparable between cities - confirm that a
# sum (rather than a mean or a per-pollutant value) is what is intended.
# NOTE(review): pandas' sum skips NaN and yields 0.0 for an all-NaN group, so the
# 0.0 totals may be cities without any valid reading rather than clean air - confirm.
res = grouped_city["pollutant_avg"].agg("sum")

In [7]:
# Display the per-city totals (a Series indexed by city name).
res

city
Agartala            0.0
Agra             1584.0
Ahmedabad        2210.0
Ahmednagar        260.0
Aizawl            129.0
                  ...  
Virar             253.0
Visakhapatnam     669.0
Vrindavan         229.0
Yadgir            203.0
Yamuna Nagar      407.0
Name: pollutant_avg, Length: 253, dtype: float64

In [8]:
# Promote the per-city totals Series to a one-column DataFrame so that label
# columns can be attached to it in the following cells.
modified_df = res.to_frame()

In [9]:
# Display the one-column frame of per-city totals.
modified_df

Unnamed: 0_level_0,pollutant_avg
city,Unnamed: 1_level_1
Agartala,0.0
Agra,1584.0
Ahmedabad,2210.0
Ahmednagar,260.0
Aizawl,129.0
...,...
Virar,253.0
Visakhapatnam,669.0
Vrindavan,229.0
Yadgir,203.0


In [10]:
# Show the cities ordered from lowest to highest total. The sorted frame is only
# displayed here; it is not assigned, so modified_df itself keeps its original order.
modified_df.sort_values(by="pollutant_avg")

Unnamed: 0_level_0,pollutant_avg
city,Unnamed: 1_level_1
Agartala,0.0
Chhapra,0.0
Damoh,0.0
Thanjavur,0.0
Nagaon,0.0
...,...
Bengaluru,3223.0
Kolkata,3872.0
Ghaziabad,4003.0
Mumbai,5419.0


In [11]:
def aqi_level(value):
    """Map a numeric pollution value to a coarse air-quality label.

    Bands (upper bounds are inclusive):
        value <= 50    -> "Good"
        value <= 100   -> "Moderate"
        value <= 200   -> "Poor"
        anything above -> "Severe"

    A NaN input yields "Unknown". NaN compares false against every threshold,
    so without the explicit guard a missing reading would fall through to the
    last branch and be reported as "Severe".

    Parameters
    ----------
    value : int or float
        Pollution value to classify.

    Returns
    -------
    str
        One of "Good", "Moderate", "Poor", "Severe" or "Unknown".
    """
    # NaN is the only value that is not equal to itself. This check needs no
    # import and works for plain Python floats as well as NumPy floats.
    if value != value:
        return "Unknown"
    if value <= 50:
        return "Good"
    elif value <= 100:
        return "Moderate"
    elif value <= 200:
        return "Poor"
    else:
        return "Severe"

In [12]:
# Label each city's total with its air-quality band.
modified_df["AQI Category"] = modified_df["pollutant_avg"].apply(aqi_level)

In [13]:
# Display the frame with the new "AQI Category" column.
modified_df

Unnamed: 0_level_0,pollutant_avg,AQI Category
city,Unnamed: 1_level_1,Unnamed: 2_level_1
Agartala,0.0,Good
Agra,1584.0,Severe
Ahmedabad,2210.0,Severe
Ahmednagar,260.0,Severe
Aizawl,129.0,Poor
...,...,...
Virar,253.0,Severe
Visakhapatnam,669.0,Severe
Vrindavan,229.0,Severe
Yadgir,203.0,Severe


In [14]:
def safety_level(value):
    """Classify a numeric pollution value as "Safe" or "Unsafe".

    Values strictly below 200 are "Safe"; 200 and above are "Unsafe".

    A NaN input yields "Unknown". NaN compares false against the threshold,
    so without the explicit guard a missing reading would be reported as
    "Unsafe" as if it had been measured.

    NOTE(review): aqi_level in this notebook puts exactly 200 in the "Poor"
    band (value <= 200) while this function uses a strict value < 200, so a
    value of exactly 200 is "Poor" yet "Unsafe" - confirm which boundary is
    intended.

    Parameters
    ----------
    value : int or float
        Pollution value to classify.

    Returns
    -------
    str
        "Safe", "Unsafe" or "Unknown".
    """
    # NaN is the only value that is not equal to itself.
    if value != value:
        return "Unknown"
    if value < 200:
        return "Safe"
    else:
        return "Unsafe"

In [15]:
# Attach the binary Safe/Unsafe label derived from each city's total.
modified_df["Safety"] = modified_df["pollutant_avg"].apply(safety_level)

In [16]:
# Display the frame with both rule-based label columns.
modified_df

Unnamed: 0_level_0,pollutant_avg,AQI Category,Safety
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Agartala,0.0,Good,Safe
Agra,1584.0,Severe,Unsafe
Ahmedabad,2210.0,Severe,Unsafe
Ahmednagar,260.0,Severe,Unsafe
Aizawl,129.0,Poor,Safe
...,...,...,...
Virar,253.0,Severe,Unsafe
Visakhapatnam,669.0,Severe,Unsafe
Vrindavan,229.0,Severe,Unsafe
Yadgir,203.0,Severe,Unsafe


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [18]:
# Feature matrix: the per-city total. Selecting with a list of column names
# keeps the result a one-column DataFrame rather than a Series.
X = modified_df.loc[:, ["pollutant_avg"]]

In [19]:
# Encode the AQI band strings as integer class ids. The fitted encoder is kept
# so that predictions can be mapped back to band names later.
le_aqi = LabelEncoder().fit(modified_df["AQI Category"])
y_aqi = le_aqi.transform(modified_df["AQI Category"])

In [20]:
# Encode the Safe/Unsafe strings as integer class ids. The fitted encoder is
# kept so that predictions can be mapped back to label names later.
le_safe = LabelEncoder().fit(modified_df["Safety"])
y_safe = le_safe.transform(modified_df["Safety"])

In [21]:
# Hold out 20% of the cities for evaluating the AQI-band models; the fixed seed
# makes the split repeatable.
# NOTE(review): the split is not stratified although the AQI classes are heavily
# imbalanced (see the value counts at the end of the notebook) - consider
# passing stratify=y_aqi.
aqi_split = train_test_split(X, y_aqi, random_state=42, test_size=0.2)
X_train_aqi, X_test_aqi, y_train_aqi, y_test_aqi = aqi_split

In [22]:
# Hold out 20% of the cities for evaluating the Safety models, using the same
# seed and test fraction as the AQI split.
safe_split = train_test_split(X, y_safe, random_state=42, test_size=0.2)
X_train_safe, X_test_safe, y_train_safe, y_test_safe = safe_split

In [23]:
# Standardise the single feature. The statistics are learned from the training
# rows only and then applied to the held-out rows.
# NOTE(review): one scaler object is fitted twice, so after this cell it holds
# the statistics of X_train_safe. A later cell reuses it for the AQI model as
# well; that is only consistent while both splits select the same rows (same X,
# test_size and random_state) - confirm before changing either split.
# NOTE(review): the inputs are overwritten with their scaled versions, so running
# this cell a second time re-fits the scaler on already-scaled data.
scaler = StandardScaler()

scaler.fit(X_train_aqi)
X_train_aqi = scaler.transform(X_train_aqi)
X_test_aqi = scaler.transform(X_test_aqi)

scaler.fit(X_train_safe)
X_train_safe = scaler.transform(X_train_safe)
X_test_safe = scaler.transform(X_test_safe)

In [24]:
# Fit a 5-nearest-neighbour classifier on the scaled training rows, then predict
# the AQI band of the held-out rows.
knn_aqi = KNeighborsClassifier(n_neighbors=5).fit(X_train_aqi, y_train_aqi)

y_pred_knn_aqi = knn_aqi.predict(X_test_aqi)

In [25]:
# Fit a 5-nearest-neighbour classifier on the scaled training rows, then predict
# the Safe/Unsafe label of the held-out rows.
knn_safe = KNeighborsClassifier(n_neighbors=5).fit(X_train_safe, y_train_safe)

y_pred_knn_safe = knn_safe.predict(X_test_safe)

In [26]:
# Compute the k-NN metrics for both targets first, then print them in the
# order AQI -> Safety. zero_division=0 reports 0.0 for classes that were never
# predicted.
knn_aqi_accuracy = accuracy_score(y_test_aqi, y_pred_knn_aqi)
knn_aqi_report = classification_report(
    y_test_aqi,
    y_pred_knn_aqi,
    target_names=le_aqi.classes_,
    zero_division=0,
)
knn_safe_accuracy = accuracy_score(y_test_safe, y_pred_knn_safe)
knn_safe_report = classification_report(
    y_test_safe,
    y_pred_knn_safe,
    target_names=le_safe.classes_,
    zero_division=0,
)

print("k-NN AQI Accuracy:", knn_aqi_accuracy)
print(knn_aqi_report)
print("k-NN Safety Accuracy:", knn_safe_accuracy)
print(knn_safe_report)

k-NN AQI Accuracy: 0.9607843137254902
              precision    recall  f1-score   support

        Good       1.00      1.00      1.00         1
    Moderate       0.00      0.00      0.00         2
        Poor       0.82      1.00      0.90         9
      Severe       1.00      1.00      1.00        39

    accuracy                           0.96        51
   macro avg       0.70      0.75      0.72        51
weighted avg       0.93      0.96      0.94        51

k-NN Safety Accuracy: 0.9803921568627451
              precision    recall  f1-score   support

        Safe       0.92      1.00      0.96        11
      Unsafe       1.00      0.97      0.99        40

    accuracy                           0.98        51
   macro avg       0.96      0.99      0.97        51
weighted avg       0.98      0.98      0.98        51



In [27]:
# Logistic-regression baselines for both targets, trained on the same scaled
# splits as the k-NN models.
# Both estimators share one iteration cap so that their solver settings cannot
# drift apart.
# NOTE(review): the recorded output shows both models predicting only the
# majority class (recall 0.00 for every other class). Options worth trying:
# class_weight="balanced", a weaker penalty (larger C), or a log transform of
# the feature, whose totals span 0 to several thousand - verify by re-running.
LR_MAX_ITER = 1000

lr_aqi = LogisticRegression(max_iter=LR_MAX_ITER)
lr_aqi.fit(X_train_aqi, y_train_aqi)

y_pred_lr_aqi = lr_aqi.predict(X_test_aqi)

lr_safe = LogisticRegression(max_iter=LR_MAX_ITER)
lr_safe.fit(X_train_safe, y_train_safe)

y_pred_lr_safe = lr_safe.predict(X_test_safe)

In [28]:
# Compute the logistic-regression metrics for both targets first, then print
# them in the order AQI -> Safety. zero_division=0 reports 0.0 for classes that
# were never predicted.
lr_aqi_accuracy = accuracy_score(y_test_aqi, y_pred_lr_aqi)
lr_aqi_report = classification_report(
    y_test_aqi,
    y_pred_lr_aqi,
    target_names=le_aqi.classes_,
    zero_division=0,
)
lr_safe_accuracy = accuracy_score(y_test_safe, y_pred_lr_safe)
lr_safe_report = classification_report(
    y_test_safe,
    y_pred_lr_safe,
    target_names=le_safe.classes_,
    zero_division=0,
)

print("Logistic Regression AQI Accuracy:", lr_aqi_accuracy)
print(lr_aqi_report)
print("Logistic Regression Safety Accuracy:", lr_safe_accuracy)
print(lr_safe_report)

Logistic Regression AQI Accuracy: 0.7647058823529411
              precision    recall  f1-score   support

        Good       0.00      0.00      0.00         1
    Moderate       0.00      0.00      0.00         2
        Poor       0.00      0.00      0.00         9
      Severe       0.76      1.00      0.87        39

    accuracy                           0.76        51
   macro avg       0.19      0.25      0.22        51
weighted avg       0.58      0.76      0.66        51

Logistic Regression Safety Accuracy: 0.7843137254901961
              precision    recall  f1-score   support

        Safe       0.00      0.00      0.00        11
      Unsafe       0.78      1.00      0.88        40

    accuracy                           0.78        51
   macro avg       0.39      0.50      0.44        51
weighted avg       0.62      0.78      0.69        51



In [29]:
# Score every city (training and held-out rows alike) with the fitted models
# and store the decoded labels next to the rule-based ones. The feature matrix
# is scaled once and shared by both predictions.
X_all_scaled = scaler.transform(X)

aqi_class_ids = knn_aqi.predict(X_all_scaled)
modified_df["Predicted_AQI_kNN"] = le_aqi.inverse_transform(aqi_class_ids)

safety_class_ids = lr_safe.predict(X_all_scaled)
modified_df["Predicted_Safety_LR"] = le_safe.inverse_transform(safety_class_ids)

In [30]:
# Display the rule-based labels side by side with the model predictions.
modified_df

Unnamed: 0_level_0,pollutant_avg,AQI Category,Safety,Predicted_AQI_kNN,Predicted_Safety_LR
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Agartala,0.0,Good,Safe,Good,Unsafe
Agra,1584.0,Severe,Unsafe,Severe,Unsafe
Ahmedabad,2210.0,Severe,Unsafe,Severe,Unsafe
Ahmednagar,260.0,Severe,Unsafe,Severe,Unsafe
Aizawl,129.0,Poor,Safe,Poor,Unsafe
...,...,...,...,...,...
Virar,253.0,Severe,Unsafe,Severe,Unsafe
Visakhapatnam,669.0,Severe,Unsafe,Severe,Unsafe
Vrindavan,229.0,Severe,Unsafe,Severe,Unsafe
Yadgir,203.0,Severe,Unsafe,Severe,Unsafe


In [32]:
# Count how many cities carry each rule-based Safety label.
modified_df["Safety"].value_counts()

Safety
Unsafe    206
Safe       47
Name: count, dtype: int64

In [33]:
# Count the logistic-regression Safety predictions, for comparison with the
# rule-based Safety counts.
modified_df["Predicted_Safety_LR"].value_counts()

Predicted_Safety_LR
Unsafe    253
Name: count, dtype: int64

In [34]:
# Count how many cities fall into each rule-based AQI band.
modified_df["AQI Category"].value_counts()

AQI Category
Severe      205
Poor         32
Good         13
Moderate      3
Name: count, dtype: int64