In [87]:
import pandas as pd

df = pd.read_csv("../../data/preprocessed_AQI_data.csv")
df.head()

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,PM2.5 AQI Value,PM2.5 AQI Category,lat,lng
0,130,10126,51,2,1,0,36,0,0,0,51,2,44.7444,44.2031
1,22,10140,41,0,1,0,5,0,1,0,41,0,-5.29,-44.49
2,75,10163,66,2,1,0,39,0,2,0,66,2,37.1667,15.1833
3,123,10185,34,0,1,0,34,0,0,0,20,0,53.0167,20.8833
4,166,10243,54,2,1,0,14,0,11,0,54,2,16.1005,-88.8074


In [88]:
from sklearn.model_selection import train_test_split

# shuffle the data before splitting
df_shuffled = df.sample(frac=1, random_state=19)
X = df_shuffled.drop(["AQI Category"], axis=1)
y = df_shuffled["AQI Category"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=22
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(2845, 13) (11384, 13)
(2845,) (11384,)


In [89]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

In [90]:
X_shifted = X - X.min() + 1  # make sure all points are > 0

X_train, X_test, y_train, y_test = train_test_split(
    X_shifted, y, test_size=0.8, random_state=22
)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.68      0.70      5301
           1       0.03      0.76      0.05        41
           2       0.64      0.35      0.46      4741
           3       0.21      0.28      0.24       618
           4       0.10      0.24      0.14       588
           5       0.04      0.20      0.06        95

    accuracy                           0.49     11384
   macro avg       0.29      0.42      0.28     11384
weighted avg       0.62      0.49      0.54     11384



In [91]:
from sklearn.naive_bayes import ComplementNB

Complement Naive Bayes: It is an adaptation of Multinomial NB where the complement of each class is used to calculate the model weights. So, this is suitable for imbalanced data sets and often outperforms the MNB on text classification tasks.

In [92]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(
    X_shifted, y, test_size=0.8, random_state=22
)

cnb_classifier = ComplementNB()
cnb_classifier.fit(X_train, y_train)

y_pred = cnb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.75      0.67      5301
           1       0.00      0.00      0.00        41
           2       0.62      0.13      0.22      4741
           3       0.15      0.95      0.26       618
           4       0.00      0.00      0.00       588
           5       0.00      0.00      0.00        95

    accuracy                           0.45     11384
   macro avg       0.23      0.30      0.19     11384
weighted avg       0.55      0.45      0.42     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [93]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB

# hyperparams tuning
param_grid = {
    "alpha": [0.1, 0.5, 1.0, 1.5, 2.0],  # smoothing param alpha
    "fit_prior": [True, False],
}

cnb_classifier = ComplementNB()
grid_search = GridSearchCV(estimator=cnb_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'alpha': 0.1, 'fit_prior': True}


In [94]:
best_model = grid_search.best_estimator_


y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      5301
           1       0.00      0.00      0.00        41
           2       0.80      0.83      0.82      4741
           3       0.33      0.39      0.36       618
           4       0.00      0.00      0.00       588
           5       0.50      0.03      0.06        95

    accuracy                           0.83     11384
   macro avg       0.43      0.38      0.37     11384
weighted avg       0.79      0.83      0.81     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [95]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(
    X_shifted, y, test_size=0.8, random_state=22
)
cnb_classifier = ComplementNB(alpha=0.1, fit_prior=True)
cnb_classifier.fit(X_train_scaled, y_train)
y_pred = cnb_classifier.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96      5301
           1       0.00      0.00      0.00        41
           2       0.80      0.83      0.82      4741
           3       0.33      0.39      0.36       618
           4       0.00      0.00      0.00       588
           5       0.50      0.03      0.06        95

    accuracy                           0.83     11384
   macro avg       0.43      0.38      0.37     11384
weighted avg       0.79      0.83      0.81     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [96]:
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_pred, y_test)
print(f"The accuracy score : {acc}")

The accuracy score : 0.8338896697118763


In [97]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (
    make_scorer,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)

# here we define different scorers metrics
scorer_f1 = make_scorer(f1_score, average="weighted")
scorer_precision = make_scorer(precision_score, average="weighted", zero_division=1)
scorer_recall = make_scorer(recall_score, average="weighted")
scorer_accuracy = make_scorer(accuracy_score)

# Cwe save them in a dict for easier accessing
scorers = {
    "f1": scorer_f1,
    "precision": scorer_precision,
    "accuracy": scorer_accuracy,
    "recall": scorer_recall,
}
scores_results: dict = {}
for scorer_name, scorer in scorers.items():
    scores = cross_val_score(cnb_classifier, X_train, y_train, cv=7, scoring=scorer)
    print(f"{scorer_name.capitalize()} scores:", scores)
    scores_results[scorer_name] = round(scores.mean(), 4)
    print(f"{scorer_name.capitalize()} mean:", round(scores.mean(), 4), "\n")

F1 scores: [0.40965853 0.42782654 0.42375549 0.44858511 0.44583046 0.41600882
 0.41148262]
F1 mean: 0.4262 

Precision scores: [0.60305118 0.58202182 0.55356814 0.58971916 0.63104935 0.61502466
 0.60450048]
Precision mean: 0.597 

Accuracy scores: [0.47174447 0.45945946 0.44471744 0.49261084 0.4679803  0.44334975
 0.45566502]
Accuracy mean: 0.4622 

Recall scores: [0.47174447 0.45945946 0.44471744 0.49261084 0.4679803  0.44334975
 0.45566502]
Recall mean: 0.4622 



In [98]:
acc_sc = make_scorer(accuracy_score)
scoores = cross_val_score(cnb_classifier, X_train, y_train, cv=7, scoring=acc_sc)
scoores

array([0.47174447, 0.45945946, 0.44471744, 0.49261084, 0.4679803 ,
       0.44334975, 0.45566502])

In [99]:
import pandas as pd

scores_df = pd.DataFrame.from_dict(scores_results, orient="index", columns=["Scores"])
scores_df

Unnamed: 0,Scores
f1,0.4262
precision,0.597
accuracy,0.4622
recall,0.4622
