In [93]:
import numpy as np
import pandas as pd
import sys
import math
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import RandomOverSampler

In [94]:
# Resolve where the dataset lives: the working directory for a local run, or a
# Google Drive folder when the notebook runs inside Google Colab.
data_filename = 'AirQualityUCI.csv'
if sys.modules.get("google.colab") is None:
    # The google.colab module is not loaded, so treat this as a local run.
    data_path_prefix = "."
else:
    from google.colab import drive
    drive.mount("/content/drive")
    # The Drive root is exposed as "MyDrive" (capital D) and the Colab
    # filesystem is case-sensitive, so the spelling must match exactly.
    data_path_prefix = "/content/drive/MyDrive/MachineLearningAssignments/Assignment1"

data_path = f"{data_path_prefix}/{data_filename}"
print(f"Loading data from data path: {data_path}")
# The file is semicolon-separated; values equal to -200 are read as missing.
df = pd.read_csv(data_path, sep=';', na_values=-200)
df

Loading data from data path: ./AirQualityUCI.csv


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [95]:
# Show the rows that have at least one missing value, ignoring every column
# whose name contains "Unnamed:" when deciding whether a row is incomplete.
is_named_column = ~df.columns.str.contains("Unnamed:")
rows_with_missing = df.loc[:, is_named_column].isna().any(axis=1)
df[rows_with_missing]

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
9,11/03/2004,03.00.00,06,1010.0,19.0,17,561.0,,1705.0,,1235.0,501.0,103,602,07517,,
10,11/03/2004,04.00.00,,1011.0,14.0,13,527.0,21.0,1818.0,34.0,1197.0,445.0,101,605,07465,,
33,12/03/2004,03.00.00,08,889.0,21.0,19,574.0,,1680.0,,1187.0,512.0,70,623,06261,,
34,12/03/2004,04.00.00,,831.0,10.0,11,506.0,21.0,1893.0,32.0,1134.0,384.0,61,659,06248,,
39,12/03/2004,09.00.00,,1545.0,,221,1353.0,,767.0,,2058.0,1588.0,92,562,06561,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [96]:
# Work on a separate frame so the raw `df` stays untouched: first remove the
# columns that are entirely empty, then the rows that are entirely empty.
processed_df = (
    df
    .dropna(how='all', axis=1)
    .dropna(how='all', axis=0)
    .copy()
)

# Past the first two columns (Date and Time), any column still holding text
# uses a decimal comma; swap it for a dot and convert the column to float.
text_columns = [
    name for name in processed_df.columns[2:]
    if processed_df[name].dtype == object
]
for name in text_columns:
    processed_df[name] = processed_df[name].str.replace(",", ".").astype(float)

# Merge the separate Date and Time strings into a single timestamp column,
# after which the two source columns are no longer needed.
timestamp_text = processed_df["Date"].str.cat(processed_df["Time"], sep=" ")
processed_df["DateTime"] = pd.to_datetime(timestamp_text, format="%d/%m/%Y %H.%M.%S")
processed_df = processed_df.drop(columns=["Date", "Time"])

# Show rows with missing values
processed_df[processed_df.isna().any(axis=1)]

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DateTime
9,0.6,1010.0,19.0,1.7,561.0,,1705.0,,1235.0,501.0,10.3,60.2,0.7517,2004-03-11 03:00:00
10,,1011.0,14.0,1.3,527.0,21.0,1818.0,34.0,1197.0,445.0,10.1,60.5,0.7465,2004-03-11 04:00:00
33,0.8,889.0,21.0,1.9,574.0,,1680.0,,1187.0,512.0,7.0,62.3,0.6261,2004-03-12 03:00:00
34,,831.0,10.0,1.1,506.0,21.0,1893.0,32.0,1134.0,384.0,6.1,65.9,0.6248,2004-03-12 04:00:00
39,,1545.0,,22.1,1353.0,,767.0,,2058.0,1588.0,9.2,56.2,0.6561,2004-03-12 09:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,2005-04-04 10:00:00
9353,2.4,1163.0,,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,2005-04-04 11:00:00
9354,2.4,1142.0,,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,2005-04-04 12:00:00
9355,2.1,1003.0,,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,2005-04-04 13:00:00


In [97]:
# Number of missing values in each column.
processed_df.isna().sum()

CO(GT)           1592
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)            0
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
DateTime            0
dtype: int64

In [98]:
# Prepare the modelling table: remove the timestamp, fill missing values with
# each column's mean, and move the target column to the last position.
# Every step is written so that re-running this cell gives the same result.
tag_column = "C6H6(GT)"
# errors="ignore" keeps a second run from raising KeyError once DateTime is gone.
processed_df = processed_df.drop(columns="DateTime", errors="ignore")
# NOTE(review): the means are computed over all rows, before the
# train/validation/test split, so held-out rows influence the imputed values.
# Confirm this is acceptable for the evaluation.
processed_df = processed_df.fillna(processed_df.mean())
# Reorder columns so the target is last, without leaving a scratch variable behind.
feature_columns = [name for name in processed_df.columns if name != tag_column]
processed_df = processed_df[feature_columns + [tag_column]]
processed_df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
0,2.6,1360.0,150.000000,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,11.9
1,2.0,1292.0,112.000000,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,9.4
2,2.2,1402.0,88.000000,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,9.0
3,2.2,1376.0,80.000000,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,9.2
4,1.6,1272.0,51.000000,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,218.811816,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,13.5
9353,2.4,1163.0,218.811816,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,11.4
9354,2.4,1142.0,218.811816,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,12.4
9355,2.1,1003.0,218.811816,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,9.5


In [99]:
def data_split(df):
    """Split ``df`` into training, validation and test parts.

    70% of the rows go to the training part; the remaining rows are divided
    evenly between validation and test. Both splits use ``random_state=0``.

    Returns:
        The tuple ``(train_df, valid_df, test_df)``.
    """
    train_part, holdout = train_test_split(df, train_size=0.7, random_state=0)
    # The second split yields the validation and test parts, in that order.
    return (train_part, *train_test_split(holdout, train_size=0.5, random_state=0))
# Split the preprocessed frame into train/validation/test parts and display
# the training part.
train_df, valid_df, test_df = data_split(processed_df)
train_df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
8772,0.6,928.0,218.811816,631.0,148.0,985.0,89.0,908.0,884.0,4.5,56.3,0.4800,2.7
3594,1.2,929.0,218.811816,811.0,57.0,862.0,67.0,1541.0,642.0,35.5,29.8,1.7004,6.0
3197,3.5,1239.0,218.811816,1129.0,121.0,637.0,106.0,1839.0,1308.0,28.9,39.1,1.5370,14.4
2819,0.7,913.0,218.811816,743.0,65.0,931.0,66.0,1540.0,884.0,21.8,52.3,1.3539,4.6
9031,1.5,1069.0,218.811816,799.0,181.0,682.0,121.0,1211.0,1044.0,12.6,67.9,0.9886,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,2.6,1037.0,218.811816,981.0,388.0,721.0,139.0,1000.0,1086.0,13.7,20.7,0.3221,10.1
9225,0.5,826.0,218.811816,512.0,84.0,1108.0,61.0,1043.0,636.0,12.5,67.4,0.9722,1.1
4859,0.4,769.0,218.811816,586.0,60.0,1221.0,45.0,1134.0,650.0,17.1,52.5,1.0130,2.0
3264,1.3,1034.0,218.811816,812.0,70.0,866.0,82.0,1563.0,660.0,32.3,33.4,1.5900,6.0


In [100]:
# Fit the scaler on the training split only. Every column of train_df takes
# part in the fit, including the last one, which is later split off as y.
scaler = StandardScaler()
scaler.fit(train_df)
def scale_dataset(scaler, classification_func=None):
    """Build a function that scales a frame and splits it into features and target.

    Args:
        scaler: Object whose ``transform`` method is applied to every frame
            passed to the returned function.
        classification_func: Optional callable applied to each scaled target
            value to produce a class label. When omitted, the scaled target
            values are returned unchanged.

    Returns:
        A function ``(df, oversample=False) -> (data, X, y)``. ``data`` is the
        full scaled array, ``X`` is every column except the last, and ``y`` is
        the last column (or the labels derived from it). ``oversample`` only
        has an effect when ``classification_func`` is given: ``X`` and ``y``
        are then resampled with ``RandomOverSampler(random_state=0)`` while
        ``data`` is left as it was.
    """
    def split_scaled(df, oversample=False):
        data = scaler.transform(df)
        X, y = data[:, :-1], data[:, -1]
        if classification_func is None:
            # No labelling requested: hand back the scaled target as-is.
            return data, X, y
        y = np.array(list(map(classification_func, y)))
        if oversample:
            X, y = RandomOverSampler(random_state=0).fit_resample(X, y)
        return data, X, y
    return split_scaled

In [104]:
# Binary target: 1 when the scaled target value is positive, otherwise 0.
scale_air_quality_classification = scale_dataset(
    scaler,
    classification_func=lambda x: int(x > 0),
)
# Only the training split is oversampled; validation and test are not resampled.
train, train_X, train_y = scale_air_quality_classification(train_df, oversample=True)
valid, valid_X, valid_y = scale_air_quality_classification(valid_df, oversample=False)
test, test_X, test_y = scale_air_quality_classification(test_df, oversample=False)

In [None]:
# Choose the number of neighbours by validation ROC AUC, trying every k from 1
# up to floor(sqrt(number of training rows)), inclusive.
# math.sqrt returns a plain Python float, which has no .astype(); int() truncates it.
max_k = int(math.sqrt(len(train_X)))
best_knn = None  # (validation AUC, k, fitted model) of the best candidate so far
for k in range(1, max_k + 1):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train_X, train_y)
    # Score with the predicted probability of class 1 rather than hard 0/1
    # labels, which would collapse the ROC curve to a single operating point.
    # classes_ is sorted, so with labels {0, 1} column 1 belongs to class 1.
    valid_scores = knn.predict_proba(valid_X)[:, 1]
    auc = roc_auc_score(valid_y, valid_scores)
    # Strict ">" keeps the smallest k when several candidates tie.
    if best_knn is None or auc > best_knn[0]:
        best_knn = (auc, k, knn)
print(f"Best k: {best_knn[1]} with AUC: {best_knn[0]}")