In [1]:
# import the required machine learning libraries and models
import numpy as np
import pandas as pd
import sys
import math
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Read the file and delete blank values according to the prompts
data_filename = 'AirQualityUCI.csv'

if sys.modules.get("google.colab") is None:
    data_path_prefix = "."
else:
    from google.colab import drive
    drive.mount("/content/drive")
    data_path_prefix = "/content/drive/MyDrive/MachineLearningAssignment/Assignment1"

data_path = f"{data_path_prefix}/{data_filename}"

print(f"Loading data from data path: {data_path}")

df = pd.read_csv(data_path, sep=';', na_values=-200)

df


Mounted at /content/drive
Loading data from data path: /content/drive/MyDrive/MachineLearningAssignment/Assignment1/AirQualityUCI.csv


Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [3]:
# Show rows with missing values
df[df.loc[:,~df.columns.str.contains("Unnamed:")].isnull().any(axis=1)]

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
9,11/03/2004,03.00.00,06,1010.0,19.0,17,561.0,,1705.0,,1235.0,501.0,103,602,07517,,
10,11/03/2004,04.00.00,,1011.0,14.0,13,527.0,21.0,1818.0,34.0,1197.0,445.0,101,605,07465,,
33,12/03/2004,03.00.00,08,889.0,21.0,19,574.0,,1680.0,,1187.0,512.0,70,623,06261,,
34,12/03/2004,04.00.00,,831.0,10.0,11,506.0,21.0,1893.0,32.0,1134.0,384.0,61,659,06248,,
39,12/03/2004,09.00.00,,1545.0,,221,1353.0,,767.0,,2058.0,1588.0,92,562,06561,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [4]:
# The first step of preprocessing the data
processed_df = df.copy()

processed_df.dropna(how='all', axis=1, inplace=True)
processed_df.dropna(how='all', axis=0, inplace=True)

for column in processed_df.columns[2:]:
    if processed_df[column].dtype == object:
        processed_df[column] = processed_df[column].str.replace(",", ".").astype(float)

processed_df["DateTime"] = pd.to_datetime(processed_df["Date"].str.cat(processed_df["Time"], sep=" "), format="%d/%m/%Y %H.%M.%S")
processed_df.drop(["Date", "Time"], axis=1, inplace=True)

processed_df[processed_df.isnull().any(axis=1)]


Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,DateTime
9,0.6,1010.0,19.0,1.7,561.0,,1705.0,,1235.0,501.0,10.3,60.2,0.7517,2004-03-11 03:00:00
10,,1011.0,14.0,1.3,527.0,21.0,1818.0,34.0,1197.0,445.0,10.1,60.5,0.7465,2004-03-11 04:00:00
33,0.8,889.0,21.0,1.9,574.0,,1680.0,,1187.0,512.0,7.0,62.3,0.6261,2004-03-12 03:00:00
34,,831.0,10.0,1.1,506.0,21.0,1893.0,32.0,1134.0,384.0,6.1,65.9,0.6248,2004-03-12 04:00:00
39,,1545.0,,22.1,1353.0,,767.0,,2058.0,1588.0,9.2,56.2,0.6561,2004-03-12 09:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,2005-04-04 10:00:00
9353,2.4,1163.0,,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,2005-04-04 11:00:00
9354,2.4,1142.0,,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,2005-04-04 12:00:00
9355,2.1,1003.0,,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,2005-04-04 13:00:00


In [5]:
processed_df.isnull().sum() # Calculate the number of missing (NaN) values in each column

CO(GT)           1592
PT08.S1(CO)       366
NMHC(GT)         8443
C6H6(GT)            0
PT08.S2(NMHC)     366
NOx(GT)          1639
PT08.S3(NOx)      366
NO2(GT)          1642
PT08.S4(NO2)      366
PT08.S5(O3)       366
T                 366
RH                366
AH                366
DateTime            0
dtype: int64

In [6]:
# Delete column “Datetime”
processed_df.drop("DateTime", axis=1, inplace=True)

# Pop the target column"C6H6" to the last
tag_column = "C6H6(GT)"
temp = processed_df.pop(tag_column)
processed_df[tag_column] = temp

processed_df


Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
0,2.6,1360.0,150.0,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,11.9
1,2.0,1292.0,112.0,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,9.4
2,2.2,1402.0,88.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,9.0
3,2.2,1376.0,80.0,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,9.2
4,1.6,1272.0,51.0,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,6.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568,13.5
9353,2.4,1163.0,,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119,11.4
9354,2.4,1142.0,,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406,12.4
9355,2.1,1003.0,,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139,9.5


In [7]:
# Set the proportion of valid, test and train
def data_split(df):

    train_df, valid_and_test_df = train_test_split(df, train_size=0.7, random_state=0)

    valid_df, test_df = train_test_split(valid_and_test_df, train_size=0.5, random_state=0)

    return train_df, valid_df, test_df

In [8]:
# Fill the missing values with mean and divide the data into validation, train and test
processed_df_a = processed_df.copy()
processed_df_a.fillna(processed_df_a.mean(), inplace=True)

train_df_a, valid_df_a, test_df_a = data_split(processed_df_a)

train_df_a

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
8772,0.6,928.0,218.811816,631.0,148.0,985.0,89.0,908.0,884.0,4.5,56.3,0.4800,2.7
3594,1.2,929.0,218.811816,811.0,57.0,862.0,67.0,1541.0,642.0,35.5,29.8,1.7004,6.0
3197,3.5,1239.0,218.811816,1129.0,121.0,637.0,106.0,1839.0,1308.0,28.9,39.1,1.5370,14.4
2819,0.7,913.0,218.811816,743.0,65.0,931.0,66.0,1540.0,884.0,21.8,52.3,1.3539,4.6
9031,1.5,1069.0,218.811816,799.0,181.0,682.0,121.0,1211.0,1044.0,12.6,67.9,0.9886,5.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7891,2.6,1037.0,218.811816,981.0,388.0,721.0,139.0,1000.0,1086.0,13.7,20.7,0.3221,10.1
9225,0.5,826.0,218.811816,512.0,84.0,1108.0,61.0,1043.0,636.0,12.5,67.4,0.9722,1.1
4859,0.4,769.0,218.811816,586.0,60.0,1221.0,45.0,1134.0,650.0,17.1,52.5,1.0130,2.0
3264,1.3,1034.0,218.811816,812.0,70.0,866.0,82.0,1563.0,660.0,32.3,33.4,1.5900,6.0


In [9]:
# Define a function for data standardization and oversampling
def scale_dataset(scaler, classification_func=None):
    def func(df, oversample=False):

        data = scaler.transform(df)
        X = data[:, :-1]
        y = data[:, -1]

        if classification_func is not None:
            y = np.array([classification_func(item) for item in y])
            if oversample:
                ros = RandomOverSampler(random_state=0)
                X, y = ros.fit_resample(X, y)
        return data, X, y
    return func
# Define classification function
classification_func = lambda x: 1 if x > 0 else 0

In [10]:
# Standardized scaling, classification and oversampling of datasets
scaler_a = StandardScaler()
scaler_a.fit(train_df_a)
scale_classification_a = scale_dataset(scaler_a, classification_func)

train_a, train_X_a, train_y_a = scale_classification_a(train_df_a, oversample=True)
valid_a, valid_X_a, valid_y_a = scale_classification_a(valid_df_a)
test_a, test_X_a, test_y_a = scale_classification_a(test_df_a)


In [11]:
# Define a function to get the highest AUC value and the corresponding k value
def get_best_knn(train_X, train_y, valid_X, valid_y):
    best_knn = None
    print(f"Length of train data: {len(train_X)}")

    for k in range(1, int(math.sqrt(len(train_X))), 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(train_X, train_y)
        predicted_y = knn.predict(valid_X)
        auc = roc_auc_score(valid_y, predicted_y)
        print(f"Try k={k} on valid data, AUC: {auc}")

        if best_knn is None or auc > best_knn[0]:
            best_knn = (auc, k, knn)
    print(f"Best k: {best_knn[1]}, AUC on valid data: {best_knn[0]}")
    knn = best_knn[2]
    return knn


In [13]:
# Define a function to evaluate a model's performance
def evaluate_model(model, X, y):
    predicted_y = model.predict(X)
    auc = roc_auc_score(y, predicted_y)
    print(f"Model AUC: {auc}")

    print(classification_report(y, predicted_y))

    confusion_mat = confusion_matrix(y, predicted_y)
    print(f"Confusion matrix: \n{confusion_mat}")

In [12]:
print("Group A:")
knn_a = get_best_knn(train_X_a, train_y_a, valid_X_a, valid_y_a) # get best KNN for group A

Group A:
Length of train data: 11898
Try k=1 on valid data, AUC: 0.9580177265342775
Try k=3 on valid data, AUC: 0.9766559503163493
Try k=5 on valid data, AUC: 0.972782983623863
Try k=7 on valid data, AUC: 0.9744966857001844
Try k=9 on valid data, AUC: 0.9754357944380085
Try k=11 on valid data, AUC: 0.9742739044302626
Try k=13 on valid data, AUC: 0.9711755310762734
Try k=15 on valid data, AUC: 0.9652015656382169
Try k=17 on valid data, AUC: 0.9648142689689683
Try k=19 on valid data, AUC: 0.9676898610530356
Try k=21 on valid data, AUC: 0.9665279710452898
Try k=23 on valid data, AUC: 0.9657533777067925
Try k=25 on valid data, AUC: 0.9649787843682952
Try k=27 on valid data, AUC: 0.9642041910297978
Try k=29 on valid data, AUC: 0.9682416731216111
Try k=31 on valid data, AUC: 0.9630423010220519
Try k=33 on valid data, AUC: 0.961880411014306
Try k=35 on valid data, AUC: 0.9603312243373113
Try k=37 on valid data, AUC: 0.9591693343295654
Try k=39 on valid data, AUC: 0.9595566309988139
Try k=41 o

In [14]:
print("Evaluating KNN on train data:")
evaluate_model(knn_a, train_X_a, train_y_a)

print("Evaluating KNN on test data:")
evaluate_model(knn_a, test_X_a, test_y_a)

Evaluating KNN on train data:
Model AUC: 0.9948730879139351
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      5949
           1       1.00      0.99      0.99      5949

    accuracy                           0.99     11898
   macro avg       0.99      0.99      0.99     11898
weighted avg       0.99      0.99      0.99     11898

Confusion matrix: 
[[5949    0]
 [  61 5888]]
Evaluating KNN on test data:
Model AUC: 0.9656594460499731
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       122
           1       1.00      0.98      0.99      1282

    accuracy                           0.98      1404
   macro avg       0.91      0.97      0.93      1404
weighted avg       0.98      0.98      0.98      1404

Confusion matrix: 
[[ 116    6]
 [  25 1257]]


In [17]:
# Create a Logistic Regression classifier and evaluate it
lr_a = LogisticRegression()
lr_a.fit(train_X_a, train_y_a)
predicted_y = lr_a.predict(valid_X_a)
auc = roc_auc_score(valid_y_a, predicted_y)
print(f"AUC on valid data: {auc}")

print("Evaluating logistic regression on train data:")
evaluate_model(lr_a, train_X_a, train_y_a)

print("Evaluating logistic regression on test data:")
evaluate_model(lr_a, test_X_a, test_y_a)


AUC on valid data: 0.7454329839666033
Evaluating logistic regression on train data:
Model AUC: 0.7465120188266937
              precision    recall  f1-score   support

           0       0.78      0.68      0.73      5949
           1       0.72      0.81      0.76      5949

    accuracy                           0.75     11898
   macro avg       0.75      0.75      0.75     11898
weighted avg       0.75      0.75      0.75     11898

Confusion matrix: 
[[4051 1898]
 [1118 4831]]
Evaluating logistic regression on test data:
Model AUC: 0.6923096595994988
              precision    recall  f1-score   support

           0       0.22      0.58      0.32       122
           1       0.95      0.80      0.87      1282

    accuracy                           0.78      1404
   macro avg       0.59      0.69      0.59      1404
weighted avg       0.89      0.78      0.82      1404

Confusion matrix: 
[[  71   51]
 [ 253 1029]]


In [19]:
processed_df_b = processed_df.copy()  # Create dataframe copy for group B: delete rows and columns with majority of null values
# Drop rows with at least 9 non-null values and the column of "NMHC(GT)"
processed_df_b.dropna(axis=0, thresh=9, inplace=True)
processed_df_b.drop("NMHC(GT)", axis=1, inplace=True)
# Fill the missing values with mean
processed_df_b.fillna(processed_df_b.mean(), inplace=True)
# Divide the data into validation, train and test
train_df_b, valid_df_b, test_df_b = data_split(processed_df_b)

train_df_b

Unnamed: 0,CO(GT),PT08.S1(CO),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,C6H6(GT)
1020,-0.31738,977.0,800.0,242.189292,990.0,112.145137,1474.0,926.0,12.2,64.1,0.9062,5.7
3133,2.30000,1279.0,1197.0,250.000000,599.0,112.000000,1906.0,1518.0,22.9,43.0,1.1826,16.6
1163,0.50000,856.0,619.0,29.000000,1191.0,39.000000,1345.0,677.0,16.3,52.8,0.9722,2.5
3858,-0.31738,1029.0,791.0,242.189292,855.0,112.145137,1567.0,651.0,33.3,33.1,1.6642,5.6
6532,2.40000,1195.0,1013.0,355.000000,646.0,104.000000,1345.0,1276.0,14.5,62.3,1.0216,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4520,-0.31738,908.0,657.0,242.189292,1013.0,112.145137,1460.0,796.0,20.5,78.2,1.8678,3.1
8256,2.40000,1172.0,977.0,280.000000,657.0,177.000000,1160.0,1047.0,8.4,45.2,0.4977,10.0
5007,-0.31738,1337.0,1339.0,242.189292,525.0,112.145137,1870.0,1622.0,20.6,58.5,1.4054,21.6
3343,-0.31738,860.0,847.0,58.000000,873.0,77.000000,1426.0,748.0,23.6,31.9,0.9195,6.8


In [22]:
# Standardized scaling, classification and oversampling of datasets
scaler_b = StandardScaler()

scaler_b.fit(train_df_b)


scale_classification_b = scale_dataset(scaler_b, classification_func)

train_b, train_X_b, train_y_b = scale_classification_b(train_df_b, oversample=True)

valid_b, valid_X_b, valid_y_b = scale_classification_b(valid_df_b)

test_b, test_X_b, test_y_b = scale_classification_b(test_df_b)


In [23]:
print("Group B:")
knn_b = get_best_knn(train_X_b, train_y_b, valid_X_b, valid_y_b) # get best KNN for group B

Group B:
Length of train data: 7530
Try k=1 on valid data, AUC: 0.9601114471326164
Try k=3 on valid data, AUC: 0.9590205439814815
Try k=5 on valid data, AUC: 0.9609736689814816
Try k=7 on valid data, AUC: 0.9682459677419355
Try k=9 on valid data, AUC: 0.9633631552419355
Try k=11 on valid data, AUC: 0.9596179155465949
Try k=13 on valid data, AUC: 0.9631449746117086
Try k=15 on valid data, AUC: 0.9674875858721624
Try k=17 on valid data, AUC: 0.966131832437276
Try k=19 on valid data, AUC: 0.961789221176822
Try k=21 on valid data, AUC: 0.9588595336768219
Try k=23 on valid data, AUC: 0.9610308393070489
Try k=25 on valid data, AUC: 0.958641353046595
Try k=27 on valid data, AUC: 0.9570674189814816
Try k=29 on valid data, AUC: 0.9551142939814815
Try k=31 on valid data, AUC: 0.9547351030465949
Try k=33 on valid data, AUC: 0.953540359916368
Try k=35 on valid data, AUC: 0.954298741786141
Try k=37 on valid data, AUC: 0.9556544952210274
Try k=39 on valid data, AUC: 0.955275304286141
Try k=41 on val

In [24]:
print("Evaluating KNN on train data:")
evaluate_model(knn_b, train_X_b, train_y_b)

print("Evaluating KNN on test data:")
evaluate_model(knn_b, test_X_b, test_y_b)

Evaluating KNN on train data:
Model AUC: 0.9784860557768924
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3765
           1       0.98      0.98      0.98      3765

    accuracy                           0.98      7530
   macro avg       0.98      0.98      0.98      7530
weighted avg       0.98      0.98      0.98      7530

Confusion matrix: 
[[3673   92]
 [  70 3695]]
Evaluating KNN on test data:
Model AUC: 0.9622409359317073
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       822
           1       0.94      0.97      0.95       527

    accuracy                           0.96      1349
   macro avg       0.96      0.96      0.96      1349
weighted avg       0.96      0.96      0.96      1349

Confusion matrix: 
[[788  34]
 [ 18 509]]


In [25]:
# Create a Logistic Regression classifier for group B
lr_b = LogisticRegression()
lr_b.fit(train_X_b, train_y_b)
predicted_y = lr_b.predict(valid_X_b)
auc = roc_auc_score(valid_y_b, predicted_y)
print(f"AUC on valid data: {auc}")

print("Evaluating logistic regression on train data:")
evaluate_model(lr_b, train_X_b, train_y_b)

print("Evaluating logistic regression on test data:")
evaluate_model(lr_b, test_X_b, test_y_b)


AUC on valid data: 0.9914757877837516
Evaluating logistic regression on train data:
Model AUC: 0.9911022576361221
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3765
           1       0.99      0.99      0.99      3765

    accuracy                           0.99      7530
   macro avg       0.99      0.99      0.99      7530
weighted avg       0.99      0.99      0.99      7530

Confusion matrix: 
[[3719   46]
 [  21 3744]]
Evaluating logistic regression on test data:
Model AUC: 0.9920197417323415
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       822
           1       0.98      1.00      0.99       527

    accuracy                           0.99      1349
   macro avg       0.99      0.99      0.99      1349
weighted avg       0.99      0.99      0.99      1349

Confusion matrix: 
[[812  10]
 [  2 525]]
