In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the AQI CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="../../data/preprocessed_AQI_data.csv")
df.head(n=5)

Unnamed: 0,Country,AQI Category,CO AQI Value,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,PM2.5 AQI Value,PM2.5 AQI Category
0,134,2,1,36,0,0,51,2
1,23,0,1,5,0,1,41,0
2,77,2,1,39,0,2,66,2
3,126,0,1,34,0,0,20,0
4,176,2,1,14,0,11,54,2


In [3]:
from sklearn.model_selection import train_test_split

# Seeded full-row permutation, then separate the target column from the features.
df_shuffled = df.sample(frac=1, random_state=19)
y = df_shuffled["AQI Category"]
X = df_shuffled.drop(columns=["AQI Category"])

# NOTE(review): test_size=0.8 leaves only 20% of the rows for training —
# confirm this is intentional and not a swapped train/test proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=22, test_size=0.8
)

# Report the (train, test) shapes: features first, then targets.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(2845, 7) (11384, 7)
(2845,) (11384,)


In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mm = MinMaxScaler()  # not used by the visible cells; kept so the name stays defined
ss = StandardScaler()

# Split the raw features first so the scaler never sees the held-out rows.
# test_size and random_state are unchanged, so the same rows land in each split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=22
)

# Learn the scaling statistics from the training rows only, then apply that
# same transform to the test rows. Fitting on all of X (as before) leaks
# test-set statistics into preprocessing.
X_train = ss.fit_transform(X_train_raw)
X_test = ss.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; kept so any
# code that still reads `X_scaled` continues to work.
X_scaled = ss.transform(X)

In [5]:
# Baseline: LDA with default settings, fitted on the training split and
# scored per class on the held-out split.
lda_classifier = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = lda_classifier.predict(X_test)

print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      5301
           1       1.00      0.85      0.92        41
           2       1.00      0.97      0.98      4741
           3       0.94      0.96      0.95       618
           4       0.97      0.96      0.96       588
           5       0.91      0.84      0.87        95

    accuracy                           0.98     11384
   macro avg       0.96      0.93      0.95     11384
weighted avg       0.98      0.98      0.98     11384



In [6]:
from sklearn.model_selection import GridSearchCV

# Fresh, unfitted estimator used as the template for the hyperparameter search.
lda_classifier = LinearDiscriminantAnalysis()

# Shrinkage is only supported by the "lsqr" and "eigen" solvers; combining it
# with "svd" makes the fit fail for that candidate. Use one sub-grid per valid
# family so every candidate can actually be fitted. "shrinkage" is listed
# explicitly for "svd" so `best_params_` always carries both keys.
param_grid = [
    {"solver": ["svd"], "shrinkage": [None]},
    {"solver": ["lsqr", "eigen"], "shrinkage": ["auto", None]},
]

In [7]:
import warnings

warnings.filterwarnings("ignore")

In [8]:
# Cross-validated (cv=5) search over every candidate in `param_grid`,
# fitted on the training split only.
grid_search = GridSearchCV(
    estimator=lda_classifier,
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X=X_train, y=y_train)

In [9]:
# Parameter combination that the search selected as best.
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

Best Parameters: {'shrinkage': None, 'solver': 'svd'}


In [10]:
# Build the tuned model directly from the search result instead of retyping
# the winning values by hand, so it cannot drift if the search outcome changes.
best_lda = LinearDiscriminantAnalysis(**grid_search.best_params_)
best_lda.fit(X_train, y_train)

In [11]:
# Per-class precision/recall/F1 of the tuned model on the held-out split.
y_pred = best_lda.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      5301
           1       1.00      0.85      0.92        41
           2       1.00      0.97      0.98      4741
           3       0.94      0.96      0.95       618
           4       0.97      0.96      0.96       588
           5       0.91      0.84      0.87        95

    accuracy                           0.98     11384
   macro avg       0.96      0.93      0.95     11384
weighted avg       0.98      0.98      0.98     11384



Best Parameters: {'shrinkage': None, 'solver': 'svd'}


In [12]:
from sklearn.metrics import accuracy_score

# Reuse the estimator already fitted on the training split as `best_lda`
# rather than refitting an identical model with hard-coded hyperparameters.
lda_classifier_best = best_lda
y_pred = lda_classifier_best.predict(X_test)

# Keep the text report in `cr` and print it.
cr = classification_report(y_test, y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      5301
           1       1.00      0.85      0.92        41
           2       1.00      0.97      0.98      4741
           3       0.94      0.96      0.95       618
           4       0.97      0.96      0.96       588
           5       0.91      0.84      0.87        95

    accuracy                           0.98     11384
   macro avg       0.96      0.93      0.95     11384
weighted avg       0.98      0.98      0.98     11384



In [13]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but the conventional order avoids a silent bug if this call is
# ever swapped for an asymmetric metric such as precision or recall.
acc = accuracy_score(y_test, y_pred)
acc

0.9798840477863668

In [14]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import (
    make_scorer,
    f1_score,
    precision_score,
    recall_score,
    accuracy_score,
)

# Scorers for cross-validation. F1, precision and recall use the "weighted"
# average across classes; zero_division=1 is set for precision only.
scorer_f1 = make_scorer(f1_score, average="weighted")
scorer_precision = make_scorer(precision_score, average="weighted", zero_division=1)
scorer_recall = make_scorer(recall_score, average="weighted")
scorer_accuracy = make_scorer(accuracy_score)

# Keyed by display name; insertion order fixes the order of the printed report.
scorers = {
    "f1": scorer_f1,
    "precision": scorer_precision,
    "accuracy": scorer_accuracy,
    "recall": scorer_recall,
}

# Evaluate all metrics in one cv=7 pass so each fold's model is fitted once,
# instead of repeating the whole cross-validation for every metric.
cv_results = cross_validate(best_lda, X_train, y_train, cv=7, scoring=scorers)

# Map each metric name to its mean fold score, rounded to 4 decimals.
scores_results: dict = {}
for scorer_name in scorers:
    # cross_validate stores per-fold test scores under "test_<scorer name>".
    scores = cv_results[f"test_{scorer_name}"]
    mean_score = round(scores.mean(), 4)
    print(f"{scorer_name.capitalize()} scores:", scores)
    scores_results[scorer_name] = mean_score
    print(f"{scorer_name.capitalize()} mean:", mean_score, "\n")

F1 scores: [0.96689419 0.98253877 0.98524581 0.99013558 0.97282954 0.99014018
 0.96703168]
F1 mean: 0.9793 

Precision scores: [0.97049454 0.98314636 0.98545455 0.99034483 0.97429489 0.99036683
 0.96939879]
Precision mean: 0.9805 

Accuracy scores: [0.96805897 0.98280098 0.98525799 0.99014778 0.9729064  0.99014778
 0.9679803 ]
Accuracy mean: 0.9796 

Recall scores: [0.96805897 0.98280098 0.98525799 0.99014778 0.9729064  0.99014778
 0.9679803 ]
Recall mean: 0.9796 



In [15]:
import pandas as pd

# One-column summary table: one row per metric name, holding its mean score.
scores_df = pd.DataFrame(
    list(scores_results.values()),
    index=list(scores_results.keys()),
    columns=["Scores"],
)
scores_df

Unnamed: 0,Scores
f1,0.9793
precision,0.9805
accuracy,0.9796
recall,0.9796
