In [1]:
import pandas as pd
# Load the preprocessed AQI table; the path is relative to this notebook's location.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# row index from an earlier export — confirm whether it should be kept as a column.
aqi_csv_path = '../../data/preprocessed_AQI_data.csv'
df = pd.read_csv(aqi_csv_path)

# Display the first five rows as the cell output.
df.head(5)

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category,lat,lng
0,130,10126,51,2,1,0,36,0,0,0,51,2,44.7444,44.2031
1,22,10140,41,0,1,0,5,0,1,0,41,0,-5.29,-44.49
2,75,10163,66,2,1,0,39,0,2,0,66,2,37.1667,15.1833
3,123,10185,34,0,1,0,34,0,0,0,20,0,53.0167,20.8833
4,166,10243,54,2,1,0,14,0,11,0,54,2,16.1005,-88.8074


In [2]:
from sklearn.model_selection import train_test_split

# Reorder every row once with a fixed seed so the shuffle is reproducible.
df_shuffled = df.sample(frac=1, random_state=19)

# Target: the overall AQI category. Features: every remaining column.
y = df_shuffled["AQI Category"]
X = df_shuffled.drop(columns=["AQI Category"])

# NOTE(review): test_size=0.8 trains on 20% of the rows and tests on 80% — confirm
# this is intended rather than the more common 80/20 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=22)

# Sanity-check the resulting partition sizes (features first, then labels).
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Translate each column so its minimum becomes 1, making every feature value > 0
# (some columns, e.g. lat/lng, contain negative values in the raw table).
X_shifted = X.sub(X.min()).add(1)

# Same split parameters as the initial split, now applied to the shifted features.
X_train, X_test, y_train, y_test = train_test_split(
    X_shifted,
    y,
    test_size=0.8,
    random_state=22,
)

# Fit a Multinomial Naive Bayes model with default settings and report per-class metrics.
nb_classifier = MultinomialNB().fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.68      0.70      5332
           1       0.02      0.83      0.04        36
           2       0.63      0.37      0.46      4679
           3       0.35      0.32      0.34       633
           4       0.10      0.28      0.15       604
           5       0.13      0.11      0.12       100

    accuracy                           0.51     11384
   macro avg       0.33      0.43      0.30     11384
weighted avg       0.62      0.51      0.55     11384



In [None]:
from sklearn.naive_bayes import ComplementNB

Complement Naive Bayes (CNB) is an adaptation of Multinomial Naive Bayes (MNB) in which the model weights for each class are computed from the complement of that class, i.e. from all samples that do not belong to it. This makes CNB well suited to imbalanced data sets, and it often outperforms MNB on text classification tasks.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Split first so that everything below works on the partition created in this cell,
# instead of relying on X_train / X_test left over from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X_shifted, y, test_size=0.8, random_state=22)

# Fit the min-max scaler on the training rows only, then apply it to the test rows.
# These scaled arrays are not used by the baseline fit below; the later
# hyper-parameter search and evaluation cells consume them.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline Complement Naive Bayes with default settings on the shifted, unscaled features.
cnb_classifier = ComplementNB()
cnb_classifier.fit(X_train, y_train)

y_pred = cnb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.76      0.68      5332
           1       0.00      0.00      0.00        36
           2       0.58      0.18      0.28      4679
           3       0.17      0.92      0.29       633
           4       0.00      0.00      0.00       604
           5       0.00      0.00      0.00       100

    accuracy                           0.48     11384
   macro avg       0.23      0.31      0.21     11384
weighted avg       0.54      0.48      0.45     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB

# Hyper-parameter search space for ComplementNB.
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 1.5, 2.0],  # smoothing parameter
    fit_prior=[True, False],
)

# Exhaustive search over the grid with 5-fold cross-validation on the scaled training data.
cnb_classifier = ComplementNB()
grid_search = GridSearchCV(cnb_classifier, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Report the winning parameter combination.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)


Best Parameters: {'alpha': 0.1, 'fit_prior': True}


In [None]:
# Take the estimator selected by the grid search and score it on the held-out,
# scaled test features.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

tuned_report = classification_report(y_test, y_pred)
print(tuned_report)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5332
           1       0.25      0.06      0.09        36
           2       0.80      0.86      0.83      4679
           3       0.31      0.33      0.32       633
           4       0.00      0.00      0.00       604
           5       0.00      0.00      0.00       100

    accuracy                           0.84     11384
   macro avg       0.38      0.38      0.37     11384
weighted avg       0.79      0.84      0.81     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Create the train/test partition first so the scaler below is fit on rows defined
# in this cell rather than on variables inherited from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(X_shifted, y, test_size=0.8, random_state=22)

# Fit the min-max scaler on the training rows only; reuse it to transform the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Final Complement Naive Bayes model using the parameters reported by the grid search
# (alpha=0.1, fit_prior=True), trained and evaluated on the scaled features.
cnb_classifier = ComplementNB(alpha=0.1, fit_prior=True)
cnb_classifier.fit(X_train_scaled, y_train)
y_pred = cnb_classifier.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5332
           1       0.25      0.06      0.09        36
           2       0.80      0.86      0.83      4679
           3       0.31      0.33      0.32       633
           4       0.00      0.00      0.00       604
           5       0.00      0.00      0.00       100

    accuracy                           0.84     11384
   macro avg       0.38      0.38      0.37     11384
weighted avg       0.79      0.84      0.81     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import accuracy_score
# Overall accuracy of the most recent predictions. Arguments follow the documented
# (y_true, y_pred) order, matching the classification_report calls in this notebook.
acc = accuracy_score(y_test, y_pred)
print(f'The accuracy score : {acc}')

The accuracy score : 0.8411806043569923
