In [25]:
import numpy as np
import pandas as pd
import sys
import math
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

In [26]:
data_filename = 'AirQualityUCI.csv'
if sys.modules.get("google.colab") is None:
    data_path_prefix = "."
else:
    from google.colab import drive
    drive.mount("/content/drive")
    data_path_prefix = "/content/drive/MyDrive/MachineLearningAssignments/Assignment1"

data_path = f"{data_path_prefix}/{data_filename}"
print(f"Loading data from data path: {data_path}")
df = pd.read_csv(data_path, sep=';', na_values=-200)
df

Loading data from data path: ./AirQualityUCI.csv


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [27]:
# Show rows with missing values
df[df.loc[:,~df.columns.str.contains("Unnamed:")].isnull().any(axis=1)]

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
9,11/03/2004,03.00.00,06,1010.0,19.0,17,561.0,,1705.0,,1235.0,501.0,103,602,07517,,
10,11/03/2004,04.00.00,,1011.0,14.0,13,527.0,21.0,1818.0,34.0,1197.0,445.0,101,605,07465,,
33,12/03/2004,03.00.00,08,889.0,21.0,19,574.0,,1680.0,,1187.0,512.0,70,623,06261,,
34,12/03/2004,04.00.00,,831.0,10.0,11,506.0,21.0,1893.0,32.0,1134.0,384.0,61,659,06248,,
39,12/03/2004,09.00.00,,1545.0,,221,1353.0,,767.0,,2058.0,1588.0,92,562,06561,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [28]:
processed_df = df.copy()
processed_df.dropna(how='all', axis=1, inplace=True)
processed_df.dropna(how='all', axis=0, inplace=True)
for column in processed_df.columns[2:]:
    if processed_df[column].dtype == object:
        processed_df[column] = processed_df[column].str.replace(",", ".").astype(float)
processed_df["DateTime"] = pd.to_datetime(processed_df["Date"].str.cat(processed_df["Time"], sep=" "), format="%d/%m/%Y %H.%M.%S")
processed_df.drop(["Date", "Time"], axis=1, inplace=True)
# Show rows with missing values
processed_df[processed_df.isnull().any(axis=1)]

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DateTime
9,0.6,1010.0,19.0,1.7,561.0,,1705.0,,1235.0,501.0,10.3,60.2,0.7517,2004-03-11 03:00:00
10,,1011.0,14.0,1.3,527.0,21.0,1818.0,34.0,1197.0,445.0,10.1,60.5,0.7465,2004-03-11 04:00:00
33,0.8,889.0,21.0,1.9,574.0,,1680.0,,1187.0,512.0,7.0,62.3,0.6261,2004-03-12 03:00:00
34,,831.0,10.0,1.1,506.0,21.0,1893.0,32.0,1134.0,384.0,6.1,65.9,0.6248,2004-03-12 04:00:00
39,,1545.0,,22.1,1353.0,,767.0,,2058.0,1588.0,9.2,56.2,0.6561,2004-03-12 09:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,2005-04-04 10:00:00
9353,2.4,1163.0,,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,2005-04-04 11:00:00
9354,2.4,1142.0,,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,2005-04-04 12:00:00
9355,2.1,1003.0,,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,2005-04-04 13:00:00


In [29]:
processed_df.isnull().sum()

CO(GT)           1592
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)            0
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
DateTime            0
dtype: int64

In [30]:
processed_df.dropna(axis=0, thresh=9, inplace=True)
processed_df.drop("DateTime", axis=1, inplace=True)
processed_df.drop("NMHC(GT)", axis=1, inplace=True)
processed_df.fillna(processed_df.mean(), inplace=True)
tag_column = "C6H6(GT)"
temp = processed_df.pop(tag_column)
processed_df[tag_column] = temp
processed_df

Unnamed: 0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
0,2.6,1360.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,11.9
1,2.0,1292.0,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,9.4
2,2.2,1402.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,9.0
3,2.2,1376.0,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,9.2
4,1.6,1272.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,13.5
9353,2.4,1163.0,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,11.4
9354,2.4,1142.0,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,12.4
9355,2.1,1003.0,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,9.5


In [31]:
def data_split(df):
    train_df, valid_and_test_df = train_test_split(df, train_size=0.7, random_state=0)
    valid_df, test_df = train_test_split(valid_and_test_df, train_size=0.5, random_state=0)
    return train_df, valid_df, test_df
train_df, valid_df, test_df = data_split(processed_df)
train_df

Unnamed: 0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
1020,-0.31738,977.0,800.0,242.189292,990.0,112.145137,1474.0,926.0,12.2,64.1,0.9062,5.7
3133,2.30000,1279.0,1197.0,250.000000,599.0,112.000000,1906.0,1518.0,22.9,43.0,1.1826,16.6
1163,0.50000,856.0,619.0,29.000000,1191.0,39.000000,1345.0,677.0,16.3,52.8,0.9722,2.5
3858,-0.31738,1029.0,791.0,242.189292,855.0,112.145137,1567.0,651.0,33.3,33.1,1.6642,5.6
6532,2.40000,1195.0,1013.0,355.000000,646.0,104.000000,1345.0,1276.0,14.5,62.3,1.0216,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4520,-0.31738,908.0,657.0,242.189292,1013.0,112.145137,1460.0,796.0,20.5,78.2,1.8678,3.1
8256,2.40000,1172.0,977.0,280.000000,657.0,177.000000,1160.0,1047.0,8.4,45.2,0.4977,10.0
5007,-0.31738,1337.0,1339.0,242.189292,525.0,112.145137,1870.0,1622.0,20.6,58.5,1.4054,21.6
3343,-0.31738,860.0,847.0,58.000000,873.0,77.000000,1426.0,748.0,23.6,31.9,0.9195,6.8


In [32]:
scaler = StandardScaler()
scaler.fit(train_df)
def scale_dataset(scaler, classification_func=None):
    def func(df, oversample=False):
        data = scaler.transform(df)
        X = data[:,:-1]
        y = data[:,-1]
        if classification_func is not None:
            y = np.array([classification_func(item) for item in y])
            if oversample:
                ros = RandomOverSampler(random_state=0)
                X, y = ros.fit_resample(X, y)
        return data, X, y
    return func

In [33]:
scale_classification = scale_dataset(scaler, classification_func=lambda x: 1 if x > 0 else 0)
train, train_X, train_y = scale_classification(train_df, oversample=True)
valid, valid_X, valid_y = scale_classification(valid_df)
test, test_X, test_y = scale_classification(test_df)

In [34]:
best_knn = None
print(f"Length of train data: {len(train_X)}")
for k in range(1, int(math.sqrt(len(train_X)))):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_X, train_y)
    predicted_y = knn.predict(valid_X)
    auc = roc_auc_score(valid_y, predicted_y)
    print(f"Try k={k} on valid data, AUC: {auc}")
    if best_knn is None or auc > best_knn[0]:
        best_knn = (auc, k, knn)
print(f"Best k: {best_knn[1]}, AUC on valid data: {best_knn[0]}")

Length of train data: 7530
Try k=1 on valid data, AUC: 0.9601114471326164
Try k=2 on valid data, AUC: 0.9504601627837514
Try k=3 on valid data, AUC: 0.9590205439814815
Try k=4 on valid data, AUC: 0.9607659890232976
Try k=5 on valid data, AUC: 0.9609736689814816
Try k=6 on valid data, AUC: 0.965809811827957
Try k=7 on valid data, AUC: 0.9682459677419355
Try k=8 on valid data, AUC: 0.961467200567503
Try k=9 on valid data, AUC: 0.9633631552419355
Try k=10 on valid data, AUC: 0.9589167040023895
Try k=11 on valid data, AUC: 0.9596179155465949
Try k=12 on valid data, AUC: 0.957124589307049
Try k=13 on valid data, AUC: 0.9631449746117086
Try k=14 on valid data, AUC: 0.9634203255675031
Try k=15 on valid data, AUC: 0.9674875858721624
Try k=16 on valid data, AUC: 0.9646150686977301
Try k=17 on valid data, AUC: 0.966131832437276
Try k=18 on valid data, AUC: 0.9620645721326165
Try k=19 on valid data, AUC: 0.961789221176822
Try k=20 on valid data, AUC: 0.958537513067503
Try k=21 on valid data, AUC:

In [35]:
k = best_knn[1]
knn = best_knn[2]
def evaluate_model(model, X, y):
    predicted_y = model.predict(X)
    auc = roc_auc_score(y, predicted_y)
    print(f"Model AUC: {auc}")
    print(classification_report(y, predicted_y))
    confusion_mat = confusion_matrix(y, predicted_y)
    print(f"Confusion matrix: \n{confusion_mat}")
print("Evaluating knn on train data:")
evaluate_model(knn, train_X, train_y)
print("Evaluating knn on test data:")
evaluate_model(knn, test_X, test_y)

Evaluating knn on train data:
Model AUC: 0.9784860557768924
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3765
           1       0.98      0.98      0.98      3765

    accuracy                           0.98      7530
   macro avg       0.98      0.98      0.98      7530
weighted avg       0.98      0.98      0.98      7530

Confusion matrix: 
[[3673   92]
 [  70 3695]]
Evaluating knn on test data:
Model AUC: 0.9622409359317073
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       822
           1       0.94      0.97      0.95       527

    accuracy                           0.96      1349
   macro avg       0.96      0.96      0.96      1349
weighted avg       0.96      0.96      0.96      1349

Confusion matrix: 
[[788  34]
 [ 18 509]]


In [36]:
lr = LogisticRegression()
lr.fit(train_X, train_y)
predicted_y = lr.predict(valid_X)
auc = roc_auc_score(valid_y, predicted_y)
print(f"AUC on valid data: {auc}")

print("Evaluating logistic regression on train data:")
evaluate_model(lr, train_X, train_y)
print("Evaluating logistic regression on test data:")
evaluate_model(lr, test_X, test_y)

AUC on valid data: 0.9914757877837516
Evaluating logistic regression on train data:
Model AUC: 0.9911022576361221
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3765
           1       0.99      0.99      0.99      3765

    accuracy                           0.99      7530
   macro avg       0.99      0.99      0.99      7530
weighted avg       0.99      0.99      0.99      7530

Confusion matrix: 
[[3719   46]
 [  21 3744]]
Evaluating logistic regression on test data:
Model AUC: 0.9920197417323415
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       822
           1       0.98      1.00      0.99       527

    accuracy                           0.99      1349
   macro avg       0.99      0.99      0.99      1349
weighted avg       0.99      0.99      0.99      1349

Confusion matrix: 
[[812  10]
 [  2 525]]
