In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# All skl imports go here
from sklearn import tree   # Decision Trees
from sklearn import svm    # svm
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import metrics
import sklearn as skl

from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam, Optimizer
from torch.nn import CrossEntropyLoss
import torch

from neuralnet import NeuralNetwork

# Data Loading

In [None]:
# Number of target categories.
NUM_CLASSES = 6
# Human-readable class names; must contain exactly NUM_CLASSES entries.
# NOTE(review): the sixth name ("surprise") and the assumption that position i
# names integer label i are inferred from the standard six-emotion label set —
# confirm against the dataset's label encoding.
CLASSES = ["sadness", "joy", "love", "anger", "fear", "surprise"]

In [None]:
# Load the three dataset splits from CSV
train_data = pd.read_csv("training_labse.csv")
test_data = pd.read_csv("test_labse.csv")
validation_data = pd.read_csv("validation_labse.csv")

# Separate X's and y's from each other:
# features are every column whose name starts with "_e", the target is "label"
FEATURE_COLUMNS = [x for x in train_data if x.startswith("_e")]
LABEL_COLUMN = "label"

X_train = train_data[FEATURE_COLUMNS]
Y_train = train_data[LABEL_COLUMN]

X_test = test_data[FEATURE_COLUMNS]
Y_test = test_data[LABEL_COLUMN]

X_val = validation_data[FEATURE_COLUMNS]
Y_val = validation_data[LABEL_COLUMN]

# These are used to run cross validation.
# Features and labels must be pooled from the same splits in the same order
# (train followed by validation) so that row i of X matches row i of Y;
# the test split stays held out.
X_train_val = pd.concat([X_train, X_val])
Y_train_val = pd.concat([Y_train, Y_val])

In [None]:
# Perform pre-processing PCA on the training set
# Fraction of the total variance that the retained components must explain.
TARGET_EXPLAINED_VARIANCE = 0.95

def perform_pca(dataset):
    """Standardize ``dataset`` column-wise and reduce its dimensionality with PCA.

    Each column is centered on its own mean and divided by its own standard
    deviation before the PCA is fit, so the returned PCA model expects inputs
    that have been standardized the same way.

    Args:
        dataset: Two-dimensional feature table (rows are samples) supporting
            ``mean(axis=0)`` and ``std(axis=0)``.

    Returns:
        A ``(pca, dataset_reduced)`` tuple: the fitted PCA object and the
        projection of the standardized ``dataset`` onto its components.
    """
    pca = PCA(n_components=TARGET_EXPLAINED_VARIANCE)

    # Need to standardize the data first
    standardized = (dataset - dataset.mean(axis=0)) / dataset.std(axis=0)

    # fit_transform fits the model and projects the data in one pass; calling
    # fit() beforehand would only repeat the same decomposition.
    dataset_reduced = pca.fit_transform(X=standardized)

    return pca, dataset_reduced



In [None]:
# One PCA per design matrix: the training split on its own, and the pooled
# train+validation data that cross validation runs on.
pca_train, X_train_reduced = perform_pca(dataset=X_train)
pca_tran_val, X_train_val_reduced = perform_pca(dataset=X_train_val)

# K-Fold Cross-Validation

In [None]:

def k_folds_x_val(model, X=None, y=None, n_splits=10):
    """Run k-fold cross validation for ``model`` and print average metrics.

    The estimator is refit in place on the training part of every fold, so
    after this call it is left fitted on the last fold's training data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix (rows are samples). Defaults to the notebook-level
            ``X_train_val_reduced``.
        y: Labels aligned row-by-row with ``X``. Defaults to the
            notebook-level ``Y_train_val``.
        n_splits: Number of folds (default 10).

    Returns:
        None. Average accuracy and Cohen's kappa over the folds are printed.
    """
    # Fall back to the notebook's pooled train+validation data.
    features = X_train_val_reduced if X is None else X
    labels = Y_train_val if y is None else y

    # Plain arrays make positional fold indexing work for both pandas and numpy inputs.
    features = np.asarray(features)
    labels = np.asarray(labels)

    cumulative_accuracy = 0
    cumulative_kappa = 0

    k_folds = KFold(n_splits=n_splits)

    for train_index, test_index in k_folds.split(features, labels):
        model.fit(features[train_index], labels[train_index])

        Y_pred = model.predict(features[test_index])
        Y_true = labels[test_index]
        # Add all metrics here

        cumulative_accuracy += metrics.accuracy_score(Y_true, Y_pred)
        cumulative_kappa += metrics.cohen_kappa_score(Y_true, Y_pred)

    folds = k_folds.get_n_splits()
    print(f"Performed {folds}-fold cross validation")
    print(f"Average accuracy {cumulative_accuracy / folds}")
    print(f"Average Kappa {cumulative_kappa / folds}")


# Grid Search Hyperparameter Optimization

In [None]:
def objective(model):
    """Fit ``model`` on the training split and return its test-split accuracy.

    Reads the notebook-level ``X_train``, ``Y_train``, ``X_test`` and
    ``Y_test``; the estimator passed in is fitted in place.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    accuracy = metrics.accuracy_score(Y_test, predictions)
    return accuracy

def gridSearchHPO(model, search_space):
    """Grid-search ``search_space`` for ``model`` using 5-fold CV on the training split.

    Candidates are ranked by accuracy. A custom scoring callable must accept
    ``(estimator, X, y)``, which the one-argument ``objective`` helper does
    not, so the built-in "accuracy" scorer is used instead.

    Args:
        model: Unfitted scikit-learn estimator.
        search_space: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.

    Returns:
        The fitted ``GridSearchCV`` object (inspect ``best_params_``,
        ``best_score_``, ``best_estimator_``).
    """
    grid_search = GridSearchCV(estimator=model,
                                param_grid=search_space,
                                scoring="accuracy",
                                cv=5,
                                n_jobs=-1) # -1 means max amount
    grid_search.fit(X_train, Y_train)
    return grid_search

# Decision Trees

In [None]:
# Depth-limited (max 5 levels) decision tree using the entropy split criterion.
decision_tree_model = tree.DecisionTreeClassifier(criterion="entropy", splitter="best", max_depth=5)

In [None]:
# Cross-validate the decision tree; average metrics are printed.
k_folds_x_val(model=decision_tree_model)

In [None]:
# Fit the tree on the PCA-reduced training split, then draw its structure.
decision_tree_model.fit(X=X_train_reduced, y=Y_train)
tree.plot_tree(decision_tree=decision_tree_model)

# SVMs

In [None]:
svm_classifier = svm.SVC(kernel='sigmoid') # 'precomputed', 'linear', 'poly', 'sigmoid', 'rbf'

# Estimate performance with cross validation first: k_folds_x_val refits the
# classifier in place on every fold, so it has to run before the final fit.
k_folds_x_val(svm_classifier)

# Train the final model on the PCA-reduced training set so the classifier is
# left fitted on X_train_reduced rather than on the last cross-validation fold.
svm_classifier.fit(X_train_reduced, Y_train);

# PyTorch Setup

In [None]:
# PyTorch training hyperparameters
BATCH_SIZE = 32          # samples per mini-batch
LEARNING_RATE = 0.001    # optimizer step size

In [None]:
# For pytorch specifically we should load data to the provided dataloader and dataset classes.
# This handles the batching for us.

# pca_train was fit on features standardized with the training split's
# per-column mean and standard deviation, so the validation and test features
# must be standardized with those same training statistics before projection.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
X_val_reduced = pca_train.transform((X_val - train_mean) / train_std)
X_test_reduced = pca_train.transform((X_test - train_mean) / train_std)

pt_train_set = TensorDataset(torch.Tensor(X_train_reduced), torch.Tensor(Y_train.to_numpy()).long())
pt_val_set = TensorDataset(torch.Tensor(X_val_reduced), torch.Tensor(Y_val.to_numpy()).long())
pt_test_set = TensorDataset(torch.Tensor(X_test_reduced), torch.Tensor(Y_test.to_numpy()).long())

pt_train_loader = DataLoader(
                    dataset=pt_train_set,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                )
# Validation uses single-sample batches, so averaging the per-batch losses
# over len(pt_val_loader) gives the per-sample mean.
pt_val_loader = DataLoader(
                    dataset=pt_val_set,
                    batch_size=1,
                    shuffle=True,
                )
pt_test_loader = DataLoader(
                    dataset=pt_test_set,
                    batch_size=BATCH_SIZE,
                    shuffle=True,
                )

In [None]:
def train_one_epoch(model: torch.nn.Module, dl: DataLoader, optimizer, loss_fn):
    """Run one optimization pass over ``dl`` and return the mean batch loss.

    Args:
        model: Network to train; called as ``model(inputs)``.
        dl: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.

    Returns:
        The average of the per-batch losses seen during the epoch as a float,
        or ``0.`` when ``dl`` yields no batches.
    """
    running_loss = 0.
    num_batches = 0

    for inputs, labels in dl:
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Accumulate as a Python float so no autograd graph is kept alive
        running_loss += loss.item()
        num_batches += 1

    # Averaging over the epoch is less noisy than reporting only the final batch
    return running_loss / num_batches if num_batches else 0.

def training_loop(model : torch.nn.Module, dl : DataLoader, epochs = 1, val_dl = None, lr = None):
    """Train ``model`` with Adam and cross-entropy loss, validating after every epoch.

    Args:
        model: Network to optimize.
        dl: Loader yielding ``(inputs, labels)`` training batches.
        epochs: Number of passes over ``dl`` (default 1).
        val_dl: Loader yielding validation batches. Defaults to the
            notebook-level ``pt_val_loader``.
        lr: Adam learning rate. Defaults to the notebook-level ``LEARNING_RATE``.

    Returns:
        None. The training loss reported by ``train_one_epoch`` and the average
        validation loss per batch are printed for each epoch.
    """
    val_loader = pt_val_loader if val_dl is None else val_dl
    learning_rate = LEARNING_RATE if lr is None else lr

    optimizer = Adam(params=model.parameters(True), lr=learning_rate, betas=(0.9, 0.999))
    loss_fn = CrossEntropyLoss()
    for epoch in range(1, epochs + 1):
        print(f"Epoch {epoch}")

        model.train()
        train_loss = train_one_epoch(model, dl, optimizer, loss_fn)

        model.eval()
        running_vloss = 0.
        # Disable gradient computation and reduce memory consumption.
        with torch.no_grad():
            for vinputs, vlabels in val_loader:
                voutputs = model(vinputs)
                # .item() accumulates a plain float instead of a tensor
                running_vloss += loss_fn(voutputs, vlabels).item()

        # Report NaN rather than dividing by zero when there is no validation data
        num_val_batches = len(val_loader)
        avg_vloss = running_vloss / num_val_batches if num_val_batches else float("nan")
        print(f"train_loss = {train_loss :.4f}, val_loss = {avg_vloss :.4f}")


# MLP

In [None]:
# Build the network: its input width is the number of PCA components kept for
# the training split and it has NUM_CLASSES outputs.
# NOTE(review): the empty list presumably means "no hidden layers" — confirm
# against neuralnet.NeuralNetwork.
n_input_features = X_train_reduced.shape[1]
mlp = NeuralNetwork(n_input_features, [], NUM_CLASSES)

# Train for 10 epochs on the training loader.
training_loop(mlp, pt_train_loader, epochs=10)

# RNN

# Encoder-Only Transformer

# Test

In [None]:
# Estimator selected for the final test-set run (the evaluation code itself is still a TODO).
model_to_test = decision_tree_model

In [None]:
#TODO: Add stuff for running the test set on the model