In [27]:
import pandas as pd
df = pd.read_csv('../../data/preprocessed_AQI_data.csv')
df.head()

Unnamed: 0,Country,AQI Category,CO AQI Value,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,PM2.5 AQI Value,PM2.5 AQI Category
0,134,2,1,36,0,0,51,2
1,23,0,1,5,0,1,41,0
2,77,2,1,39,0,2,66,2
3,126,0,1,34,0,0,20,0
4,176,2,1,14,0,11,54,2


In [28]:
from sklearn.model_selection import train_test_split

# shuffle the data before splitting
df_shuffled = df.sample(frac=1, random_state=20)  
X = df_shuffled.drop(["AQI Category"], axis=1)
y = df_shuffled["AQI Category"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=22
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)


(2845, 7) (11384, 7)
(2845,) (11384,)


In [29]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

In [30]:
X_shifted = X - X.min() + 1  #to make sure all points are > 0

X_train, X_test, y_train, y_test = train_test_split(X_shifted, y, test_size=0.8, random_state=22)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.70      0.73      5327
           1       0.63      0.57      0.60        42
           2       0.69      0.46      0.55      4704
           3       0.19      0.57      0.29       618
           4       0.17      0.32      0.22       594
           5       0.06      0.25      0.10        99

    accuracy                           0.57     11384
   macro avg       0.42      0.48      0.42     11384
weighted avg       0.67      0.57      0.60     11384



In [31]:
from sklearn.naive_bayes import ComplementNB

Complement Naive Bayes: It is an adaptation of Multinomial NB where the complement of each class is used to calculate the model weights. So, this is suitable for imbalanced data sets and often outperforms the MNB on text classification tasks.

In [32]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(X_shifted, y, test_size=0.8, random_state=22)

cnb_classifier = ComplementNB()
cnb_classifier.fit(X_train, y_train)

y_pred = cnb_classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.78      0.71      5327
           1       0.00      0.00      0.00        42
           2       0.66      0.07      0.13      4704
           3       0.13      0.97      0.23       618
           4       0.00      0.00      0.00       594
           5       0.00      0.00      0.00        99

    accuracy                           0.45     11384
   macro avg       0.24      0.30      0.18     11384
weighted avg       0.58      0.45      0.40     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB

# hyperparams tuning
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  #smoothing param alpha
    'fit_prior': [True, False],  
}

cnb_classifier = ComplementNB()
grid_search = GridSearchCV(estimator=cnb_classifier, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_
print("Best Parameters:", best_params)


Best Parameters: {'alpha': 1.5, 'fit_prior': True}


In [34]:
best_model = grid_search.best_estimator_


y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.87      1.00      0.93      5327
           1       0.00      0.00      0.00        42
           2       0.78      0.75      0.77      4704
           3       0.32      0.40      0.35       618
           4       0.00      0.00      0.00       594
           5       0.00      0.00      0.00        99

    accuracy                           0.80     11384
   macro avg       0.33      0.36      0.34     11384
weighted avg       0.75      0.80      0.77     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [35]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train, X_test, y_train, y_test = train_test_split(X_shifted, y, test_size=0.8, random_state=22)
cnb_classifier = ComplementNB(alpha=0.1, fit_prior=True)
cnb_classifier.fit(X_train_scaled, y_train)
y_pred = cnb_classifier.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      5327
           1       0.00      0.00      0.00        42
           2       0.78      0.75      0.77      4704
           3       0.31      0.40      0.35       618
           4       0.00      0.00      0.00       594
           5       0.00      0.00      0.00        99

    accuracy                           0.80     11384
   macro avg       0.33      0.36      0.34     11384
weighted avg       0.75      0.80      0.77     11384



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
from sklearn.metrics import accuracy_score
acc  = accuracy_score(y_pred, y_test)
print(f'The accuracy score : {acc}')

The accuracy score : 0.7969079409697821


In [37]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score, accuracy_score

# here we define different scorers metrics
scorer_f1 = make_scorer(f1_score, average='weighted')
scorer_precision = make_scorer(precision_score, average='weighted', zero_division=1)
scorer_recall = make_scorer(recall_score, average='weighted')
scorer_accuracy = make_scorer(accuracy_score)

# we save them in a dict for easier accessing
scorers = {'f1': scorer_f1, 'precision': scorer_precision, 'accuracy': scorer_accuracy, 'recall': scorer_recall}
scores_results : dict = {}
for scorer_name, scorer in scorers.items():
    scores = cross_val_score(cnb_classifier, X_train, y_train, cv=7, scoring=scorer)
    print(f"{scorer_name.capitalize()} scores:", scores)
    scores_results[scorer_name] = round(scores.mean(), 4)
    print(f"{scorer_name.capitalize()} mean:", round(scores.mean(), 4),'\n')

F1 scores: [0.40470334 0.38666046 0.37422171 0.42144858 0.40055009 0.41906709
 0.39955283]
F1 mean: 0.4009 

Precision scores: [0.59826145 0.58083462 0.64476458 0.68830024 0.65865429 0.67064869
 0.58020296]
Precision mean: 0.6317 

Accuracy scores: [0.44226044 0.45208845 0.42997543 0.47044335 0.45812808 0.48768473
 0.45812808]
Accuracy mean: 0.457 

Recall scores: [0.44226044 0.45208845 0.42997543 0.47044335 0.45812808 0.48768473
 0.45812808]
Recall mean: 0.457 



In [38]:
acc_sc= make_scorer(accuracy_score)
scoores = cross_val_score(cnb_classifier, X_train, y_train, cv=7, scoring=acc_sc)
scoores

array([0.44226044, 0.45208845, 0.42997543, 0.47044335, 0.45812808,
       0.48768473, 0.45812808])

In [39]:
import pandas as pd
scores_df = pd.DataFrame.from_dict(scores_results, orient='index', columns=['Scores'])
scores_df

Unnamed: 0,Scores
f1,0.4009
precision,0.6317
accuracy,0.457
recall,0.457
