In [None]:
import pandas as pd

# Read the train/test feature and label tables from the CompleteDataset folder.
# The targets on the left are filled in the same order as the paths listed below.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'CompleteDataset/x_train_all.csv',
        'CompleteDataset/y_train_all.csv',
        'CompleteDataset/x_test_all.csv',
        'CompleteDataset/y_test_all.csv',
    )
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    f1_score,
    confusion_matrix,
    precision_score,
    recall_score,
    roc_curve,
    roc_auc_score,
    auc
)

def get_metrics(clf, x_train, y_train, x_test, y_test):
    """Fit ``clf`` on the training data and report its metrics on the evaluation data.

    Prints accuracy, weighted F1 / precision / recall, the false-positive-rate
    points of the micro-averaged ROC curve, the weighted one-vs-rest ROC AUC and
    the confusion matrix. Also shows two figures: the micro-averaged ROC curve
    and the confusion matrix.

    Parameters
    ----------
    clf : classifier
        Must provide ``fit``, ``predict``, ``predict_proba`` and ``classes_``.
        It is fitted in place.
    x_train : features used to fit ``clf``.
    y_train : labels for ``x_train``; must provide ``to_numpy()``
        (e.g. a single-column DataFrame). Flattened to 1-D before use.
    x_test : features the metrics are computed on.
    y_test : labels for ``x_test``; same requirements as ``y_train``.

    Returns
    -------
    None
        Results are printed and plotted only.

    Notes
    -----
    The ROC part binarizes the labels against ``clf.classes_`` and expects one
    probability column per class, i.e. a problem with more than two classes.
    """
    # Flatten the label frames into 1-D numpy arrays
    y_train = y_train.to_numpy().ravel()
    y_test = y_test.to_numpy().ravel()

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    y_pred_probs = clf.predict_proba(x_test)

    # Class order used by predict_proba; reused so every per-class result lines up
    classes = clf.classes_

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    # F1 (true labels go first: the weighted average uses the support of y_true)
    f1 = f1_score(y_test, y_pred, average="weighted")
    # Confusion matrix, with rows/columns in the same order as the plot tick labels
    conf_matrix = confusion_matrix(y_test, y_pred, labels=classes)
    # Precision
    precision = precision_score(y_test, y_pred, average="weighted", zero_division=1)
    # Recall
    recall = recall_score(y_test, y_pred, average="weighted")
    # ROC AUC: one indicator column per class, matching the predict_proba columns
    Y_test_bin = label_binarize(y_test, classes=classes)
    roc = roc_auc_score(Y_test_bin, y_pred_probs, multi_class="ovr", average="weighted")

    # Micro-averaged ROC curve: every (sample, class) pair is one binary decision
    fpr, tpr, _ = roc_curve(Y_test_bin.ravel(), y_pred_probs.ravel())

    # Plotting the ROC curve with its AUC
    auc_val = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='AUC (area = %0.2f)' % auc_val)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()

    # Plotting the confusion matrix
    out = ConfusionMatrixDisplay(conf_matrix, display_labels=classes)
    out.plot()
    plt.show()

    print("Accuracy:", accuracy)
    print("F1 Score:", f1)
    print("Precision:", precision)
    print("Recall/Sensitivity/True Positive Rate:", recall)
    # NOTE: fpr is the array of false positive rates along the micro-averaged ROC curve
    print("False Positive Rate:", fpr)
    print("Area under ROC curve:", roc)
    print("Confusion Matrix:\n", conf_matrix)

In [None]:
import numpy as np
import seaborn as sns
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import(
    StratifiedKFold,
    cross_val_score,
    cross_val_predict
)

def get_cross_val_metrics(clf, x_train, y_train):
    """Evaluate ``clf`` with stratified 10-fold cross-validation on the training data.

    Prints the per-fold accuracies with their mean and standard deviation, the
    macro precision / recall / F1 of the out-of-fold predictions and the macro
    ROC AUC of the out-of-fold probabilities. Shows a bar chart of the fold
    accuracies and a heatmap of the confusion matrix averaged over the folds.

    Parameters
    ----------
    clf : classifier
        Must provide ``fit``, ``predict`` and ``predict_proba``. It is refitted
        in place on every fold, so it is left fitted on the last fold's
        training split.
    x_train : DataFrame of features (rows are selected with ``.iloc``).
    y_train : labels for ``x_train``; must provide ``to_numpy()``
        (e.g. a single-column DataFrame). Flattened to 1-D before use.

    Returns
    -------
    None
        Results are printed and plotted only.
    """
    # Flatten the label frame into a 1-D numpy array
    y_train = y_train.to_numpy().ravel()
    classes = np.unique(y_train)

    # Perform 10-fold cross-validation; the fixed seed makes every cv.split() below
    # yield the same folds
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    cv_scores = cross_val_score(clf, x_train, y_train, cv=cv, scoring='accuracy')

    # Print the cross-validation accuracy scores
    print("Cross-validation accuracy scores:", cv_scores)
    print("Mean accuracy:", cv_scores.mean())
    print("Accuracy standard deviation:", cv_scores.std())

    # Plotting the cross-validation scores
    plt.figure(figsize=(8, 6))
    plt.bar(range(len(cv_scores)), cv_scores, color='lightgreen')
    plt.xlabel('Fold')
    plt.ylabel('Accuracy')
    plt.title('Cross-Validation Scores')
    plt.ylim(0, 1)  # Set the y-axis limits if needed
    plt.show()

    # Get the out-of-fold predicted labels
    y_pred = cross_val_predict(clf, x_train, y_train, cv=cv)

    # Compute overall precision, recall, F1 score, and support (unused)
    precision, recall, f1, _ = precision_recall_fscore_support(y_train, y_pred, average='macro')

    # Print overall metrics
    print("Overall Precision:", precision)
    print("Overall Recall:", recall)
    print("Overall F1 Score:", f1)

    # Accumulators: summed confusion matrix and out-of-fold class probabilities
    overall_conf_matrix = np.zeros((len(classes), len(classes)))
    y_scores = np.zeros((len(y_train), len(classes)))

    # One fit per fold provides both the fold confusion matrix and the probabilities
    for train_idx, test_idx in cv.split(x_train, y_train):
        X_fold_train, X_fold_test = x_train.iloc[train_idx], x_train.iloc[test_idx]
        # y_train is a numpy array here, so index it directly and never rebind it
        y_fold_train, y_fold_test = y_train[train_idx], y_train[test_idx]

        # Fit the classifier on this fold's training split
        clf.fit(X_fold_train, y_fold_train)

        # Predict on this fold's held-out split
        y_fold_pred = clf.predict(X_fold_test)

        # labels=classes keeps every fold matrix the same shape and order
        overall_conf_matrix += confusion_matrix(y_fold_test, y_fold_pred, labels=classes)
        y_scores[test_idx] = clf.predict_proba(X_fold_test)

    # Calculate and plot the average confusion matrix
    class_ticks = list(classes)
    avg_conf_matrix = overall_conf_matrix / cv.get_n_splits()
    plt.figure(figsize=(8, 6))
    sns.heatmap(avg_conf_matrix, annot=True, fmt=".2f", xticklabels=class_ticks, yticklabels=class_ticks)
    plt.title("Average Confusion Matrix")
    plt.xlabel("Predicted label")
    plt.ylabel("Actual label")
    plt.show()

    # Compute overall ROC-AUC score from the out-of-fold probabilities
    roc_auc = roc_auc_score(label_binarize(y_train, classes=classes), y_scores, average='macro')

    # Print overall ROC-AUC score
    print("Overall ROC-AUC:", roc_auc)

In [None]:
from sklearn.linear_model import LogisticRegression

# Despite its name, LogisticRegression is a linear model for classification,
# which is what the coursework specs ask for.
# The "newton-cholesky" solver was chosen because it converges faster.
linear_clf = LogisticRegression(
    solver="newton-cholesky",
    random_state=0,
)

In [None]:
# Linear classifier: 10-fold cross-validation on the training set
get_cross_val_metrics(clf=linear_clf, x_train=x_train, y_train=y_train)

In [None]:
# Linear classifier: fit on the training set and evaluate on that same set
# (no cross-validation)
get_metrics(
    clf=linear_clf,
    x_train=x_train,
    y_train=y_train,
    x_test=x_train,
    y_test=y_train,
)

In [None]:
# Linear classifier: fit on the training set and evaluate on the testing set
get_metrics(
    clf=linear_clf,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

In [None]:
from sklearn.neural_network import MLPClassifier

# Baseline multi-layer perceptron: only the random seed is set here.
# Settings to experiment with:
#   - activation function
#   - number of layers
#   - size of layers
#   - learning rate
#   - epochs
#   - momentum
#   - validation threshold
mlp_clf = MLPClassifier(
    random_state=42,
)

In [None]:
# Fit the baseline MLP on the training set and print its accuracy on the testing set.
# The label frames are flattened to 1-D arrays, as the metric helpers do, so that
# scikit-learn does not warn about a column-vector y being passed to fit().
mlp_clf.fit(x_train, y_train.to_numpy().ravel())
y_pred = mlp_clf.predict(x_test)
print(accuracy_score(y_test.to_numpy().ravel(), y_pred))

In [None]:
from sklearn.model_selection import GridSearchCV

def grid_search(clf, x_train, y_train):
    # transform pandas dataset to a 1d numpy array
    y_train = y_train.to_numpy().ravel()
    # y_test = y_test.to_numpy().ravel()

    # dictionary of parameters to be varied (keys), and arrays of possible values (values)
    param_grid = {
        # number of layers and their sizes
        "hidden_layer_sizes": [(10,30,10),(20,)],
        # activation functions
        "activation": ["tanh", "relu"],
        # solver
        "solver": ["sgd", "adam"],
        "alpha": [0.0001, 0.05],
        "learning_rate": ["constant","adaptive"],
        "n_iter_no_change": [10],
        "max_iter": [200],
        "momentum": [0.9]
    }