In [1]:
# import the required machine learning libraries and models
import numpy as np
import pandas as pd
import sys
import math
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from imblearn.over_sampling import RandomOverSampler

In [3]:
# Load the semicolon-separated air-quality CSV; cells equal to -200 are read as NaN.
data_filename = "AirQualityUCI.csv"

# Colab is detected by "google.colab" already being present in sys.modules.
running_in_colab = sys.modules.get("google.colab") is not None

if running_in_colab:
    # Imported here because the package is only needed on the Colab branch.
    from google.colab import drive

    drive.mount("/content/drive")
    data_path_prefix = "/content/drive/MyDrive/MachineLearningAssignments/Assignment1"
else:
    # Otherwise the CSV is looked up in the current working directory.
    data_path_prefix = "."

data_path = f"{data_path_prefix}/{data_filename}"
print(f"Loading data from data path: {data_path}")

df = pd.read_csv(data_path, sep=";", na_values=-200)
df


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading data from data path: /content/drive/MyDrive/MachineLearningAssignment/Assignment1/AirQualityUCI.csv


FileNotFoundError: ignored

In [5]:
!ls /content/drive/MyDrive/MachineLearningAssignment/

ls: cannot access '/content/drive/MyDrive/MachineLearningAssignment/': No such file or directory


In [None]:
# Display every row that has at least one NaN in a column whose name
# does not contain "Unnamed:".
named_columns_mask = ~df.columns.str.contains("Unnamed:")
row_has_missing = df.loc[:, named_columns_mask].isna().any(axis=1)
df[row_has_missing]

In [None]:
# The first step of preprocessing the data
# Missing-value marker; the same value is passed as na_values when the CSV is read.
MISSING_SENTINEL = -200

# Work on a copy of the raw frame and discard columns / rows that are entirely NaN.
processed_df = df.copy()
processed_df = processed_df.dropna(how='all', axis=1).dropna(how='all', axis=0)

# Skip the first two columns (presumably Date and Time, which are combined below) and
# convert every remaining text column from decimal-comma notation to float.
for column in processed_df.columns[2:]:
    values = processed_df[column]
    # Accept both object columns and pandas' dedicated string dtypes.
    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
        as_float = values.str.replace(",", ".", regex=False).astype(float)
        # A decimal-comma spelling of the sentinel (e.g. "-200,0") is not caught by
        # na_values at read time and would become a real -200.0 here, so map the
        # sentinel to NaN again after conversion.
        processed_df[column] = as_float.replace(MISSING_SENTINEL, np.nan)

# Merge the separate date and time strings into one timestamp column.
date_and_time = processed_df["Date"].str.cat(processed_df["Time"], sep=" ")
processed_df["DateTime"] = pd.to_datetime(date_and_time, format="%d/%m/%Y %H.%M.%S")
processed_df = processed_df.drop(columns=["Date", "Time"])

# Show the rows that still contain at least one missing value.
processed_df[processed_df.isnull().any(axis=1)]


In [None]:
# Number of missing (NaN) values in each column
processed_df.isna().sum()

In [None]:
# Remove the "DateTime" column. errors="ignore" keeps this cell re-runnable:
# a second execution finds the column already gone instead of raising KeyError.
processed_df = processed_df.drop(columns="DateTime", errors="ignore")

# Move the target column "C6H6(GT)" to the last position: pop removes it and the
# assignment appends it again as the final column.
tag_column = "C6H6(GT)"
processed_df[tag_column] = processed_df.pop(tag_column)

processed_df


In [None]:
# Set the proportion of valid, test and train
def data_split(df, train_size=0.7, valid_share_of_rest=0.5, random_state=0):
    """Split ``df`` into train, validation and test parts.

    The data is split twice with ``train_test_split``: first into train vs. the
    rest, then the rest into validation vs. test. With the defaults this gives
    70% train and an even split of the remaining rows.

    Args:
        df: dataset to split (anything ``train_test_split`` accepts).
        train_size: ``train_size`` of the first split (default 0.7).
        valid_share_of_rest: ``train_size`` of the second split, i.e. the share of
            the non-training rows that goes to validation (default 0.5).
        random_state: seed passed to both splits so the result is reproducible.

    Returns:
        Tuple ``(train_df, valid_df, test_df)``.
    """
    train_df, valid_and_test_df = train_test_split(
        df, train_size=train_size, random_state=random_state
    )

    valid_df, test_df = train_test_split(
        valid_and_test_df, train_size=valid_share_of_rest, random_state=random_state
    )

    return train_df, valid_df, test_df

In [None]:
# Group A: keep every row and column, replace each NaN with its column mean,
# then split into train / validation / test.
# NOTE(review): the fill means are computed before the split, so validation and
# test rows contribute to the values used to impute the training data.
processed_df_a = processed_df.fillna(processed_df.mean())

train_df_a, valid_df_a, test_df_a = data_split(processed_df_a)

train_df_a

In [None]:
# Factory that binds a scaler (and an optional labelling rule) into a reusable
# "scale, split into X / y, label, oversample" function.
def scale_dataset(scaler, classification_func=None):
    """Build a function that scales a dataset and separates features from target.

    Args:
        scaler: object exposing ``transform``; it is applied to the whole input,
            target column included.
        classification_func: optional callable mapping one scaled target value to
            a class label. When omitted, the scaled target is returned unchanged.

    Returns:
        A function ``func(df, oversample=False)`` returning ``(data, X, y)``:
        ``data`` is the full scaled array, ``X`` is every column except the last
        and ``y`` is the last column (converted to labels when
        ``classification_func`` is given).
    """
    def func(df, oversample=False):
        scaled = scaler.transform(df)
        features, target = scaled[:, :-1], scaled[:, -1]

        if classification_func is None:
            # Without a labelling rule the oversample flag has no effect.
            return scaled, features, target

        labels = np.array(list(map(classification_func, target)))
        if oversample:
            # Only X and y are resampled; the returned full array is left as is.
            sampler = RandomOverSampler(random_state=0)
            features, labels = sampler.fit_resample(features, labels)
        return scaled, features, labels
    return func
# Labelling rule applied to the standardized target column:
# a value strictly greater than 0 becomes class 1, anything else class 0.
def classification_func(x):
    """Return 1 when ``x`` is greater than 0, otherwise 0."""
    if x > 0:
        return 1
    return 0

In [None]:
# Group A: fit the standardizer on the training split, then apply the same fitted
# scaler and labelling rule to all three splits.
scaler_a = StandardScaler().fit(train_df_a)
scale_classification_a = scale_dataset(scaler=scaler_a, classification_func=classification_func)

# Oversampling is requested for the training split only.
train_a, train_X_a, train_y_a = scale_classification_a(train_df_a, oversample=True)
valid_a, valid_X_a, valid_y_a = scale_classification_a(valid_df_a, oversample=False)
test_a, test_X_a, test_y_a = scale_classification_a(test_df_a, oversample=False)


In [None]:
# Define a function to get the highest AUC value and the corresponding k value
def get_best_knn(train_X, train_y, valid_X, valid_y):
    """Fit KNN classifiers for odd k and return the one with the best validation AUC.

    Odd values k = 1, 3, 5, ... below ``int(sqrt(len(train_X)))`` are tried. Each
    model is fitted on the training data and scored on the validation data; the
    first model reaching the highest AUC is kept.

    Args:
        train_X: training features.
        train_y: training labels (binary).
        valid_X: validation features.
        valid_y: validation labels (binary).

    Returns:
        The fitted ``KNeighborsClassifier`` with the highest validation AUC.
    """
    n_train = len(train_X)
    print(f"Length of train data: {n_train}")

    # Same exclusive upper bound as before, but never below 2 so that k=1 is always
    # tried; with fewer than 4 training rows the range used to be empty and the
    # summary below failed on a None result.
    k_upper = max(int(math.sqrt(n_train)), 2)

    best_auc, best_k, best_model = None, None, None
    for k in range(1, k_upper, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(train_X, train_y)
        # AUC measures ranking quality, so score it with the predicted probability
        # of the greater class label (column 1) instead of thresholded predictions.
        valid_scores = knn.predict_proba(valid_X)[:, 1]
        auc = roc_auc_score(valid_y, valid_scores)
        print(f"Try k={k} on valid data, AUC: {auc}")

        # Strict ">" keeps the smallest k among ties.
        if best_auc is None or auc > best_auc:
            best_auc, best_k, best_model = auc, k, knn

    print(f"Best k: {best_k}, AUC on valid data: {best_auc}")
    return best_model


In [None]:
# Define a function to evaluate a model's performance
def evaluate_model(model, X, y):
    """Print AUC, classification report and confusion matrix of ``model`` on ``(X, y)``.

    Args:
        model: fitted binary classifier exposing ``predict``; ``predict_proba`` or
            ``decision_function`` is used for the AUC when available.
        X: feature matrix to evaluate on.
        y: true binary labels for ``X``.

    Returns:
        None. All results are printed.
    """
    predicted_y = model.predict(X)

    # AUC needs continuous scores; computing it from hard 0/1 predictions reduces
    # the ROC curve to a single threshold. Fall back to the labels only when the
    # model offers no score output at all.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X)
    else:
        scores = predicted_y
    auc = roc_auc_score(y, scores)
    print(f"Model AUC: {auc}")

    # The report and the confusion matrix are defined on the predicted labels.
    print(classification_report(y, predicted_y))

    confusion_mat = confusion_matrix(y, predicted_y)
    print(f"Confusion matrix: \n{confusion_mat}")

In [None]:
print("Group A:")
# Choose the KNN model for group A by its AUC on the validation split.
knn_a = get_best_knn(
    train_X=train_X_a,
    train_y=train_y_a,
    valid_X=valid_X_a,
    valid_y=valid_y_a,
)

In [None]:
# Report the selected group A KNN on the training split, then on the test split.
for heading, split_X, split_y in (
    ("Evaluating KNN on train data:", train_X_a, train_y_a),
    ("Evaluating KNN on test data:", test_X_a, test_y_a),
):
    print(heading)
    evaluate_model(knn_a, split_X, split_y)

In [None]:
# Create a Logistic Regression classifier and evaluate it
lr_a = LogisticRegression()
lr_a.fit(train_X_a, train_y_a)

# AUC is a ranking metric: score it with the predicted probability of class 1
# (column 1 of predict_proba) rather than with thresholded 0/1 predictions.
valid_scores_a = lr_a.predict_proba(valid_X_a)[:, 1]
auc = roc_auc_score(valid_y_a, valid_scores_a)
print(f"AUC on valid data: {auc}")

print("Evaluating logistic regression on train data:")
evaluate_model(lr_a, train_X_a, train_y_a)

print("Evaluating logistic regression on test data:")
evaluate_model(lr_a, test_X_a, test_y_a)


In [None]:
# Group B: keep only rows with at least 9 non-null values, then delete the column
# "NMHC(GT)" (described by the author as majority null), fill the remaining NaNs
# with column means and split into train / validation / test.
# NOTE(review): the fill means are computed before the split, so validation and
# test rows contribute to the values used to impute the training data.
processed_df_b = processed_df.dropna(axis=0, thresh=9).drop(columns="NMHC(GT)")
processed_df_b = processed_df_b.fillna(processed_df_b.mean())

train_df_b, valid_df_b, test_df_b = data_split(processed_df_b)

train_df_b

In [None]:
# Group B: fit the standardizer on the training split, then apply the same fitted
# scaler and labelling rule to all three splits.
scaler_b = StandardScaler().fit(train_df_b)
scale_classification_b = scale_dataset(scaler=scaler_b, classification_func=classification_func)

# Oversampling is requested for the training split only.
train_b, train_X_b, train_y_b = scale_classification_b(train_df_b, oversample=True)
valid_b, valid_X_b, valid_y_b = scale_classification_b(valid_df_b, oversample=False)
test_b, test_X_b, test_y_b = scale_classification_b(test_df_b, oversample=False)


In [None]:
print("Group B:")
# Choose the KNN model for group B by its AUC on the validation split.
knn_b = get_best_knn(
    train_X=train_X_b,
    train_y=train_y_b,
    valid_X=valid_X_b,
    valid_y=valid_y_b,
)

In [None]:
# Report the selected group B KNN on the training split, then on the test split.
for heading, split_X, split_y in (
    ("Evaluating KNN on train data:", train_X_b, train_y_b),
    ("Evaluating KNN on test data:", test_X_b, test_y_b),
):
    print(heading)
    evaluate_model(knn_b, split_X, split_y)

In [None]:
# Create a Logistic Regression classifier for group B
lr_b = LogisticRegression()
lr_b.fit(train_X_b, train_y_b)

# AUC is a ranking metric: score it with the predicted probability of class 1
# (column 1 of predict_proba) rather than with thresholded 0/1 predictions.
valid_scores_b = lr_b.predict_proba(valid_X_b)[:, 1]
auc = roc_auc_score(valid_y_b, valid_scores_b)
print(f"AUC on valid data: {auc}")

print("Evaluating logistic regression on train data:")
evaluate_model(lr_b, train_X_b, train_y_b)

print("Evaluating logistic regression on test data:")
evaluate_model(lr_b, test_X_b, test_y_b)
