In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# All skl imports go here
from sklearn import tree   # Decision Trees
from sklearn import svm    # svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn import metrics
import sklearn as skl

# Data Loading

In [2]:
# Human-readable emotion names.
# NOTE(review): the original list had five entries (one misspelled) while
# NUM_CLASSES was 6; "surprise" is presumably the missing sixth class and the
# order is presumably the label-id order -- confirm against the encoding used
# to build the CSV files.
CLASSES = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# Derived from the list so the two can never disagree again.
NUM_CLASSES = len(CLASSES)

In [3]:
# Load the three pre-computed embedding splits
train_data = pd.read_csv("data/training_labse.csv")
test_data = pd.read_csv("data/test_labse.csv")
validation_data = pd.read_csv("data/validation_labse.csv")

# Separate X's and y's from each other.
# Feature columns are the ones whose name starts with "_e".
FEATURE_COLUMNS = [x for x in train_data if x.startswith("_e")]
LABEL_COLUMN = "label"

X_train = train_data[FEATURE_COLUMNS]
Y_train = train_data[LABEL_COLUMN]

X_test = test_data[FEATURE_COLUMNS]
Y_test = test_data[LABEL_COLUMN]

X_val = validation_data[FEATURE_COLUMNS]
Y_val = validation_data[LABEL_COLUMN]

# These are used to run cross validation.
# The labels must come from the same splits as the features (train + val);
# pairing them with the test labels silently mislabels every validation row.
X_train_val = pd.concat([X_train, X_val])
Y_train_val = pd.concat([Y_train, Y_val])
assert len(X_train_val) == len(Y_train_val), "train+val features/labels out of sync"

# These are used to run val and test for Neural Nets
X_val_test = pd.concat([X_val, X_test])
Y_val_test = pd.concat([Y_val, Y_test])
assert len(X_val_test) == len(Y_val_test), "val+test features/labels out of sync"

In [4]:
# Perform pre-processing PCA on the training set
def perform_pca(dataset, target_variance):
    """Standardize ``dataset`` column-wise and fit a PCA on the result.

    Args:
        dataset: Feature table exposing ``mean(axis=0)`` and ``std(axis=0)``
            (e.g. a pandas DataFrame).
        target_variance: Value forwarded to ``PCA(n_components=...)``.

    Returns:
        A tuple ``(pca, dataset_reduced)``: the fitted PCA object and the
        projection of the standardized input.

    Note:
        The PCA is fitted on *standardized* values, so anything later passed
        to ``pca.transform`` needs the same standardization (using this
        dataset's mean and standard deviation).
    """
    pca = PCA(n_components=target_variance)

    # Need to standardize the data first
    standardized = (dataset - dataset.mean(axis=0)) / dataset.std(axis=0)

    # fit_transform fits and projects in one pass; calling fit() beforehand
    # only repeated the decomposition.
    dataset_reduced = pca.fit_transform(X=standardized)

    return pca, dataset_reduced

In [5]:
TARGET_EXPLAINED_VARIANCE = 0.999999

pca_train, X_train_reduced = perform_pca(X_train, TARGET_EXPLAINED_VARIANCE)

# perform_pca fits the PCA on standardized training data, so every other split
# has to be put on the same scale -- using the TRAINING mean/std -- before it is
# projected. Feeding raw features to transform() mixes two different scalings.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)


def _standardize_like_train(features):
    """Scale ``features`` with the training-set mean and standard deviation."""
    return (features - train_mean) / train_std


X_val_reduced = pca_train.transform(_standardize_like_train(X_val))
X_test_reduced = pca_train.transform(_standardize_like_train(X_test))
X_test_redced = X_test_reduced  # original (misspelled) name kept as an alias
X_train_val_reduced = pca_train.transform(_standardize_like_train(X_train_val))
X_val_test_reduced = pca_train.transform(_standardize_like_train(X_val_test))

print(f"{pca_train.n_components_} components for training")

768 components for training


In [6]:
# Show the singular values of the components kept by the fitted PCA
print(pca_train.singular_values_)

[814.69738354 546.67104552 528.87107772 502.73663688 482.85581159
 466.81679316 442.68152216 430.42286856 422.01865155 411.74269209
 409.00392128 403.11215021 394.65041765 391.32999999 389.2364008
 385.37578798 367.76396378 362.41597822 361.82567218 358.29425668
 355.38206026 353.86319187 348.04244124 345.94840278 340.53904307
 338.4970436  332.97550988 330.00885012 325.1969818  322.61594639
 319.9033563  317.19004342 310.80980127 310.52524287 308.87651055
 304.5386695  302.95481421 298.49463695 296.36065653 294.7988591
 292.43711462 291.62820072 288.36843427 287.10120672 285.6328987
 281.21760348 277.99769435 276.41506484 275.11401628 273.96804488
 270.88290065 268.23564673 265.77070274 265.22002507 263.61160194
 261.06462165 259.77284801 258.86329056 255.94540799 254.62433271
 253.8495155  251.35305198 248.41280944 247.29636352 247.25089942
 244.54703063 242.8967489  241.01063318 240.20377196 239.08196251
 237.54106365 236.08777325 234.61402917 232.57704573 231.81800438
 230.92240035

In [7]:
# NumPy versions of the raw (non-PCA) feature tables, one per split
(
    X_train_unreduced,
    X_val_unreduced,
    X_test_unreduced,
    X_train_val_unreduced,
    X_val_test_unreduced,
) = (
    split.to_numpy()
    for split in (X_train, X_val, X_test, X_train_val, X_val_test)
)


# K-Folds Cross Validation

In [8]:

def k_folds_x_val(model, X, y, k=5):
    """Cross-validate ``model`` and report its mean accuracy.

    Args:
        model: Estimator handed to ``cross_validate``.
        X: Features.
        y: Labels.
        k: Passed to ``cross_validate`` as ``cv`` (default 5).

    Returns:
        A one-element tuple ``(mean_accuracy,)``. Note that callers get a
        tuple, not a bare float.
    """
    cv_results = cross_validate(model, X, y, cv=k, scoring=["accuracy"])
    fold_accuracies = cv_results["test_accuracy"]
    mean_accuracy = fold_accuracies.mean()
    print(f"Mean Accuracy: {mean_accuracy}")
    return (mean_accuracy,)

def get_cmat(model, X, y):
    """Return the confusion matrix of ``model``'s predictions on ``X``.

    The matrix is computed with ``normalize="pred"``, i.e. normalized over
    the predicted labels.
    """
    predictions = model.predict(X)
    matrix = confusion_matrix(y, predictions, normalize="pred")
    return matrix

def get_metrics(model, X, y):
    """Score ``model`` on ``(X, y)``.

    Returns:
        A tuple ``(accuracy, weighted_f1)`` where the F1 score uses
        ``average="weighted"``.
    """
    predictions = model.predict(X)
    accuracy = accuracy_score(y, predictions)
    weighted_f1 = f1_score(y, predictions, average="weighted")
    return accuracy, weighted_f1

# Grid Search Hyperparameter Optimization

In [9]:
def gridSearchHPO(model, search_space, cv=5, scoring='accuracy', n_jobs=-1, verbose=3):
    """Build (but do not fit) a ``GridSearchCV`` over ``search_space``.

    Args:
        model: Estimator to tune.
        search_space: Parameter grid passed as ``param_grid``.
        cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 5).
        scoring: Scoring metric (default ``'accuracy'``).
        n_jobs: Parallel jobs; -1 means max amount (default).
        verbose: Verbosity level forwarded to ``GridSearchCV`` (default 3).

    Returns:
        The unfitted ``GridSearchCV`` instance; call ``.fit(X, y)`` on it.
        ``error_score='raise'`` is always set, so a failing candidate raises
        instead of being scored silently.
    """
    grid_search = GridSearchCV(estimator=model,
                               param_grid=search_space,
                               scoring=scoring,
                               cv=cv,
                               verbose=verbose,
                               error_score='raise',
                               n_jobs=n_jobs,
                               )
    return grid_search

# Decision Trees

In [None]:
# Baseline decision tree; the values below are the starting point for the
# grid search that follows.
decision_tree_model = tree.DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=256,
    max_features='sqrt',
    max_leaf_nodes=80,
    # max_features='sqrt' samples candidate features at random for every split;
    # pin the seed so fits and grid-search scores are repeatable across runs.
    random_state=42,
)

In [None]:
# Tunable DecisionTreeClassifier parameters, for reference:
#   ccp_alpha, class_weight, criterion, max_depth, max_features, max_leaf_nodes,
#   min_impurity_decrease, min_samples_leaf, min_samples_split,
#   min_weight_fraction_leaf, random_state, splitter
#
# Only max_leaf_nodes and max_depth are searched right now. Candidate grids that
# are currently switched off:
#   ccp_alpha / min_weight_fraction_leaf / min_samples_leaf / min_samples_split:
#       [0.1, 0.2, 0.4, 0.5]
#   min_impurity_decrease: [1.0, 0.5, 1.5, 2.0]
#   max_features: ["log2", "sqrt"]
#
# NOTE(review): a binary tree capped at 80 leaves cannot be deeper than 79
# levels, so max_depth values of 255+ presumably never bind -- confirm whether
# this axis is worth searching.
dt_search_space = {
    'max_leaf_nodes': list(range(71, 81)),
    'max_depth': list(range(255, 261)),
}

# fit() returns the estimator itself, so model_dt and decision_tree_model are
# the same (now fitted) object.
decision_tree_model.fit(X_train, Y_train)
model_dt = decision_tree_model

# Wrap the tree in a grid search over dt_search_space (not fitted yet)
gridsearch_dt = gridSearchHPO(model_dt, dt_search_space)

In [None]:
# Run the decision-tree grid search on the training split
gridsearch_dt.fit(X=X_train, y=Y_train)

In [None]:
# Report the best cross-validated score and the parameters that produced it
print(
    "Best Score: {}".format(gridsearch_dt.best_score_),
    "Best params: {}".format(gridsearch_dt.best_params_),
    sep="\n",
)

# K Nearest Neighbors

In [42]:
# Baseline k-NN classifier: 60 uniformly-weighted neighbours, city-block
# distance, ball-tree index with a leaf size of 2, using all available cores.
k_nearest_model = KNeighborsClassifier(
    n_neighbors=60,
    weights="uniform",
    metric="cityblock",
    algorithm="ball_tree",
    leaf_size=2,
    n_jobs=-1,
)

In [49]:
# Only n_neighbors is searched at the moment. Axes that are switched off, with
# the value noted next to each in the original grid:
#   weights:   'uniform', 'distance'                               -> uniform
#   algorithm: 'auto', 'ball_tree', 'kd_tree', 'brute'             -> ball tree
#   metric:    'cityblock', 'euclidean', 'l1', 'l2', 'manhattan',
#              'minkowski'                                         -> city block
#   leaf_size: 2                                                   -> 2
knn_search_space = dict(n_neighbors=[60])

In [50]:
# fit() returns the estimator itself, so model_knn is k_nearest_model (fitted)
k_nearest_model.fit(X_train, Y_train)
model_knn = k_nearest_model

# Wrap the classifier in a grid search over knn_search_space (not fitted yet)
gridsearch_knn = gridSearchHPO(model_knn, knn_search_space)

In [51]:
# Run the k-NN grid search on the training split, then report the winner
gridsearch_knn.fit(X=X_train, y=Y_train)

print(
    "Best Score: {}".format(gridsearch_knn.best_score_),
    "Best params: {}".format(gridsearch_knn.best_params_),
    sep="\n",
)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Score: 0.5619375
Best params: {'n_neighbors': 60}


# SVMs

In [None]:
# Baseline support-vector classifier; max_iter=-1 is passed explicitly.
svm_classifier = svm.SVC(max_iter=-1)

# Cross-validating this baseline is currently switched off:
# k_folds_x_val(svm_classifier)

# Train the model using the training set
svm_classifier.fit(X=X_train, y=Y_train)

In [None]:
# Every axis is pinned to a single value, so this grid has exactly one candidate.
# Alternatives noted in the original grid:
#   kernel: 'rbf', 'linear', 'sigmoid'
#   gamma:  'auto', 1, 0.1, 0.01, 0.001, 0.0001
#   decision_function_shape: 'ovo'
svm_search_space = dict(
    C=[1],
    kernel=['poly'],
    degree=[3],
    gamma=['scale'],
    coef0=[0.1],
    shrinking=[True],
    probability=[False],
    tol=[0.1],
    class_weight=[None],
    decision_function_shape=['ovr'],
)

# Wrap the SVM in a grid search over svm_search_space (not fitted yet)
gridsearch_svm = gridSearchHPO(model=svm_classifier, search_space=svm_search_space)

In [None]:
# Run the SVM grid search on the training split
gridsearch_svm.fit(X=X_train, y=Y_train)

In [None]:
# Report the best cross-validated score and the parameters that produced it
print(
    "Best Score: {}".format(gridsearch_svm.best_score_),
    "Best params: {}".format(gridsearch_svm.best_params_),
    sep="\n",
)

# Pytorch Setup

In [None]:
# Pytorch specific constants
from torch.utils.data import TensorDataset, DataLoader
from utils.neuralnet import NeuralNetwork
from utils.trainer import training_loop, evaluate
import torch

# Mini-batch size used by the training and validation DataLoaders
BATCH_SIZE: int = 16
# Learning rate handed to the PyTorch training loop
LEARNING_RATE: float = 0.001

# MLP

In [None]:
# Select which feature representation the MLP experiments run on.
# Currently the raw (non-PCA) arrays; point these at the *_reduced variants to
# switch representations.
X_train_used, X_test_used, X_val_used = (
    X_train_unreduced,
    X_test_unreduced,
    X_val_unreduced,
)
X_train_val_used, X_val_test_used = X_train_val_unreduced, X_val_test_unreduced

In [None]:
# scikit-learn MLP with an empty hidden-layer list, ReLU activation and the
# Adam solver.
mlp = MLPClassifier(hidden_layer_sizes=[], activation='relu', solver="adam")
mlp.fit(X=X_train_used, y=Y_train)

In [None]:
# 5-fold cross-validation (the helper's default k) on the train+val split
k_folds_x_val(model=mlp, X=X_train_val_used, y=Y_train_val)

In [None]:
# Score the fitted MLP on the combined val+test split and plot its confusion matrix
val_test_scores = get_metrics(mlp, X_val_test_used, Y_val_test)  # (accuracy, weighted F1)
cmat = get_cmat(mlp, X_val_test_used, Y_val_test)

print(val_test_scores)
plt.matshow(cmat)
plt.show()

In [None]:
from utils.neuralnet import NeuralNetwork
from utils.trainer import training_loop, evaluate

In [None]:
# PyTorch consumes data through its Dataset / DataLoader classes, which take
# care of batching for us.


def _as_tensor_dataset(features, labels):
    """Pair a feature array with a label Series as a TensorDataset.

    Features go through ``torch.Tensor(...)``; labels are converted the same
    way and then cast to ``long``.
    """
    feature_tensor = torch.Tensor(features)
    label_tensor = torch.Tensor(labels.to_numpy()).long()
    return TensorDataset(feature_tensor, label_tensor)


pt_train_set = _as_tensor_dataset(X_train_used, Y_train)
pt_val_set = _as_tensor_dataset(X_val_used, Y_val)
pt_test_set = _as_tensor_dataset(X_test_used, Y_test)
pt_val_test_set = _as_tensor_dataset(X_val_test_used, Y_val_test)

# Training / validation loaders: shuffled mini-batches of BATCH_SIZE
pt_train_loader = DataLoader(pt_train_set, batch_size=BATCH_SIZE, shuffle=True)
pt_val_loader = DataLoader(pt_val_set, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation loaders: shuffled, one sample per batch
pt_test_loader = DataLoader(pt_test_set, batch_size=1, shuffle=True)
pt_val_test_loader = DataLoader(pt_val_test_set, batch_size=1, shuffle=True)

In [None]:
# Initialize neural network.
# The first argument is taken from the feature matrix that is actually fed
# through the loaders (X_train_used). Sizing it from X_train_reduced only worked
# while PCA happened to keep every component.
# NOTE(review): `mlp` rebinds the name used for the scikit-learn MLPClassifier
# above; from here on it refers to the PyTorch model.
mlp = NeuralNetwork(X_train_used.shape[1], [], NUM_CLASSES)

# NOTE(review): 200 is presumably the number of epochs -- confirm against
# utils.trainer.training_loop.
training_loop(mlp, pt_train_loader, pt_val_loader, 200, LEARNING_RATE)

In [None]:
# evaluate() yields four values here; only the last one (used as the confusion
# matrix) is kept.
evaluation = evaluate(model=mlp, val_dl=pt_val_test_loader)
_, _, _, cmat = evaluation

print(cmat)
plt.matshow(cmat)
plt.show()

# K Means

In [None]:
# Candidate cluster counts to compare
k_list = [2, 3, 4, 5, 6, 8, 10, 12, 16, 32, 48, 64, 128, 256, 512]

# Collected as (k, Davies-Bouldin index) pairs
dbi_score_list = []
for k in k_list:
    # K-Means with k clusters and a fixed seed
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')

    # Fit on the training features and read back each point's cluster id
    labels = kmeans.fit(X_train).labels_

    # Davies-Bouldin index of this clustering
    dbi_score = davies_bouldin_score(X_train, labels)
    dbi_score_list.append((k, dbi_score))

In [None]:
# A lower Davies-Bouldin index means better-separated clusters, so the best k
# is the one with the MINIMUM score (the message used to say "Highest").
best_dbi = min(dbi_score_list, key=lambda i: i[1])
print(f"Lowest (best) DBI Score: K={best_dbi[0]}, DBI={best_dbi[1]}")

kmeans_best = KMeans(n_clusters=best_dbi[0], random_state=42, n_init='auto')

# NOTE(review): k was selected on X_train, but the final model is fitted on
# X_test -- confirm this is intended rather than fit(X_train) + predict(X_test).
kmeans_best.fit(X_test)
preds = kmeans_best.predict(X_test)

# NOTE(review): cluster ids are arbitrary and are not matched to class labels,
# so rows and columns of this matrix are not aligned class-for-class.
cmat = confusion_matrix(Y_test, preds)

# DBI per candidate k, then the label-vs-cluster matrix (matshow draws on a new figure)
plt.bar([x for (x, y) in dbi_score_list], [y for (x, y) in dbi_score_list])
plt.matshow(cmat)
plt.show()