In [53]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor

# Load Datasets

In [54]:
def load_data(file_path):
    A = np.loadtxt(file_path)
    X = A[:, :9]    # Input features
    y = A[:, 9:]    # Output labels
    return X, y

In [55]:
# Load all 3 datasets
X_final, y_final = load_data('data/tictac_final.txt')
y_final = y_final[:, 0]

X_single, y_single = load_data('data/tictac_single.txt')
y_single = y_single[:, 0]

X_multi, y_multi = load_data('data/tictac_multi.txt')

## KNN Classifier

In [56]:
def find_optimal_clf_k(X_train, y_train):
    model = KNeighborsClassifier()

    # Define the parameter grid
    param_grid = {'n_neighbors': np.arange(1, 10)}

    # GridSearchCV to find the best value of k
    grid_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=9, cv=10, random_state=42)
    grid_search.fit(X_train, y_train)

    # Get the best value of k
    optimal_k = grid_search.best_params_['n_neighbors']
    print("Optimal k:", optimal_k)
        
    return optimal_k

In [65]:
def knn_clf_train(X, y, is_one_tenth = False):
    # Split into training and testing data
    if is_one_tenth == False:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, shuffle=True, random_state=42)

    # Find the optimal k value
    optimal_k = find_optimal_clf_k(X_train, y_train)

    # Define and train model
    model = KNeighborsClassifier(n_neighbors=optimal_k)
    model.fit(X_train, y_train)

    # Get cross validation accuracy
    val_accuracy = cross_val_score(model, X_train, y_train, cv=10, scoring="accuracy")
    val_accuracy = np.mean(val_accuracy)

    # Get test accuracy
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    # Get confusion matrix
    confusion_mtrx = confusion_matrix(y_test, y_pred, normalize="true")

    return val_accuracy, test_accuracy, confusion_mtrx

In [58]:
def print_clf_results(val_accuracy, test_accuracy, confusion_mtrx, dataset_name):
    print(f"Performance of KNN Classification on {dataset_name}:")
    print("Cross Validation Accuracy = ", val_accuracy)
    print("Test Accuracy = ", test_accuracy)
    print("Confusion Matrix:")
    print(confusion_mtrx.round(decimals=3)) # Round to 3 decimal places

## Final Dataset

In [66]:
val_acc_final, test_acc_final, confusion_mtrx_final = knn_clf_train(X_final, y_final)
print_clf_results(val_acc_final, test_acc_final, confusion_mtrx_final, "Final Dataset")

print("Extra Credit #2 - Train the models on 1/10th of the data")
val_acc_final, test_acc_final, confusion_mtrx_final = knn_clf_train(X_final, y_final, True)
print_clf_results(val_acc_final, test_acc_final, confusion_mtrx_final, "Final Dataset")

Optimal k: 1
Performance of KNN Classification on Final Dataset:
Cross Validation Accuracy =  1.0
Test Accuracy =  1.0
Confusion Matrix:
[[1. 0.]
 [0. 1.]]
Extra Credit #2 - Train the models on 1/10th of the data
Optimal k: 4
Performance of KNN Classification on Final Dataset:
Cross Validation Accuracy =  0.8622222222222223
Test Accuracy =  0.8586326767091541
Confusion Matrix:
[[0.753 0.247]
 [0.084 0.916]]


## Single Dataset

In [67]:
val_acc_single, test_acc_single, confusion_mtrx_single = knn_clf_train(X_single, y_single)
print_clf_results(val_acc_single, test_acc_single, confusion_mtrx_single, "Single Dataset")

print("Extra Credit #2 - Train the models on 1/10th of the data")
val_acc_single, test_acc_single, confusion_mtrx_single = knn_clf_train(X_single, y_single, True)
print_clf_results(val_acc_single, test_acc_single, confusion_mtrx_single, "Single Dataset")

Optimal k: 1
Performance of KNN Classification on Single Dataset:
Cross Validation Accuracy =  0.8269083969465649
Test Accuracy =  0.8573607932875668
Confusion Matrix:
[[0.898 0.003 0.028 0.    0.04  0.009 0.012 0.006 0.003]
 [0.024 0.857 0.018 0.018 0.036 0.    0.012 0.    0.036]
 [0.08  0.016 0.856 0.011 0.011 0.005 0.005 0.005 0.011]
 [0.043 0.068 0.034 0.769 0.026 0.    0.009 0.017 0.034]
 [0.054 0.04  0.02  0.    0.861 0.    0.01  0.    0.015]
 [0.053 0.039 0.039 0.    0.    0.855 0.    0.    0.013]
 [0.02  0.03  0.    0.    0.02  0.    0.909 0.01  0.01 ]
 [0.02  0.1   0.    0.06  0.06  0.    0.02  0.68  0.06 ]
 [0.022 0.056 0.011 0.011 0.011 0.    0.    0.022 0.865]]
Extra Credit #2 - Train the models on 1/10th of the data
Optimal k: 5
Performance of KNN Classification on Single Dataset:
Cross Validation Accuracy =  0.5556643356643356
Test Accuracy =  0.5532564450474898
Confusion Matrix:
[[0.797 0.056 0.03  0.01  0.043 0.005 0.026 0.023 0.011]
 [0.105 0.65  0.068 0.021 0.068 0.01

# KNN Regressor

In [61]:
def find_optimal_reg_k(X_train, y_train):
    knn_regressor = KNeighborsRegressor()
    multioutput_regressor = MultiOutputRegressor(knn_regressor)

    # Define the parameter grid
    param_grid = {'estimator__n_neighbors': np.arange(1, 10)}

    # GridSearchCV to find the best value of k
    grid_search = RandomizedSearchCV(multioutput_regressor, param_grid, n_iter=9, cv=10, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    # Get the best value of k
    optimal_k = grid_search.best_params_['estimator__n_neighbors']
    print("Optimal k:", optimal_k)
        
    return optimal_k

In [68]:
def knn_reg_train(X, y, is_one_tenth = False):
    # Split into training and testing data
    if is_one_tenth == False:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, shuffle=True, random_state=42)

    # Find the optimal k value
    optimal_k = find_optimal_reg_k(X_train, y_train)

    # Define and train model
    base_regressor = KNeighborsRegressor(n_neighbors=optimal_k)
    model = MultiOutputRegressor(base_regressor)
    model.fit(X_train, y_train)

    # Get cross validation accuracy
    val_rmse = cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")
    val_rmse = np.mean(val_rmse * -1)  # Convert to positive

    # Get test accuracy
    test_accuracy = model.score(X_test, y_test)

    # Get RMSE
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return val_rmse, test_accuracy, rmse

In [63]:
def print_reg_results(val_rmse, test_accuracy, rmse, dataset_name):
    print(f"Performance of KNN Regression on {dataset_name}:")
    print("Cross Validation RMSE = ", val_rmse)
    print("Test Accuracy = ", test_accuracy)
    print("RMSE = ", rmse)

## Multi Dataset

In [69]:
val_rmse_multi, test_acc_multi, rmse_multi = knn_reg_train(X_multi, y_multi)
print_reg_results(val_rmse_multi, test_acc_multi, rmse_multi, "Multi Dataset")

print("Extra Credit #2 - Train the models on 1/10th of the data")
val_rmse_multi, test_acc_multi, rmse_multi = knn_reg_train(X_multi, y_multi, True)
print_reg_results(val_rmse_multi, test_acc_multi, rmse_multi, "Multi Dataset")

Optimal k: 1
Performance of KNN Regression on Multi Dataset:
Cross Validation RMSE =  0.0693596268023749
Test Accuracy =  0.6328073940260437
RMSE =  0.2458228018360667
Extra Credit #2 - Train the models on 1/10th of the data
Optimal k: 7
Performance of KNN Regression on Multi Dataset:
Cross Validation RMSE =  0.11011459440030869
Test Accuracy =  0.3529073027612973
RMSE =  0.3281922228849391
