In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# All skl imports go here
from sklearn import tree   # Decision Trees
from sklearn import svm    # svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn import metrics
import sklearn as skl

In [2]:
# Human-readable emotion names, indexed by the integer value stored in the label column.
# NOTE(review): the sixth entry ("surprise") was missing although NUM_CLASSES was 6; it is
# assumed here to follow the standard 6-class emotion label order — confirm against the data.
CLASSES = ["sadness", "joy", "love", "anger", "fear", "surprise"]

# Derived from CLASSES so the count and the name list can never drift apart (still 6).
NUM_CLASSES = len(CLASSES)

In [3]:
# Load the three pre-computed splits (one CSV per split).
train_data = pd.read_csv("data/training_bert.csv")
test_data = pd.read_csv("data/test_bert.csv")
validation_data = pd.read_csv("data/validation_bert.csv")

# Separate X's and y's from each other.
# Feature columns are those whose name starts with "_e"; the target lives in "label".
FEATURE_COLUMNS = [x for x in train_data if x.startswith("_e")]
LABEL_COLUMN = "label"

X_train = train_data[FEATURE_COLUMNS]
Y_train = train_data[LABEL_COLUMN]

X_test = test_data[FEATURE_COLUMNS]
Y_test = test_data[LABEL_COLUMN]

X_val = validation_data[FEATURE_COLUMNS]
Y_val = validation_data[LABEL_COLUMN]

# These are used to run cross validation.
# The labels must come from the same splits as the features (train + validation);
# pairing the features with the *test* labels would misalign X and y and leak test data.
X_train_val = pd.concat([X_train, X_val])
Y_train_val = pd.concat([Y_train, Y_val])
assert len(X_train_val) == len(Y_train_val), "train+val features and labels are misaligned"

# These are used to run val and test for Neural Nets
X_val_test = pd.concat([X_val, X_test])
Y_val_test = pd.concat([Y_val, Y_test])

In [4]:
# Perform pre-processing PCA on the training set
def perform_pca(dataset, target_variance):
    """Standardize ``dataset`` column-wise and fit a PCA on the standardized data.

    Parameters
    ----------
    dataset : pandas.DataFrame or 2-D array
        One row per sample, one column per feature.
    target_variance : int or float
        Forwarded unchanged to ``PCA(n_components=...)``. Per the scikit-learn
        documentation an integer selects that many components, whereas a float
        strictly between 0 and 1 keeps as many components as are needed to
        explain at least that fraction of the variance.

    Returns
    -------
    pca : PCA
        The estimator fitted on the standardized data.
    dataset_reduced : numpy.ndarray
        Projection of the standardized ``dataset`` onto the kept components.

    Notes
    -----
    The mean/std used for standardization are not stored on the returned
    estimator. Any other data projected with ``pca.transform`` has to be
    standardized with the same statistics by the caller first.
    """
    # Need to standardize the data first
    center = dataset.mean(axis=0)
    scale = dataset.std(axis=0)
    # A constant column has zero spread; dividing by it would fill the column with
    # NaN and make the PCA fit fail. Dividing by 1 leaves it as an all-zero column.
    scale = np.where(scale == 0, 1.0, scale)
    standardized = (dataset - center) / scale

    # fit_transform fits and projects in a single pass; a separate fit() call
    # beforehand only repeats the decomposition.
    pca = PCA(n_components=target_variance)
    dataset_reduced = pca.fit_transform(X=standardized)

    return pca, dataset_reduced

In [5]:
# NOTE: despite its name, this integer is forwarded to PCA(n_components=...), where an
# integer selects the NUMBER of components to keep (here 100), not a variance percentage.
TARGET_EXPLAINED_VARIANCE = 100

pca_train, X_train_reduced = perform_pca(X_train, TARGET_EXPLAINED_VARIANCE)

# perform_pca fits the PCA on *standardized* training data. PCA.transform only re-centers
# its input, it does not rescale it, so the validation and test sets have to be
# standardized with the training statistics before being projected; otherwise they would
# land in a differently scaled space than X_train_reduced.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
train_std = train_std.mask(train_std == 0, 1.0)  # avoid division by zero on constant columns

X_val_reduced = pca_train.transform((X_val - train_mean) / train_std)
X_test_reduced = pca_train.transform((X_test - train_mean) / train_std)

print(f"{pca_train.n_components_} components for training")

100 components for training


In [6]:
# Names of the PCA-projected feature columns: e_0, e_1, ...
columns_reduced = [f"e_{i}" for i in range(pca_train.n_components_)]

# One frame per split: the 'label' column first, followed by one column per component.
df_train, df_val, df_test = (
    pd.DataFrame(
        data={
            'label': split_labels,
            **{name: split_reduced[:, idx] for idx, name in enumerate(columns_reduced)},
        }
    )
    for split_labels, split_reduced in (
        (Y_train, X_train_reduced),
        (Y_val, X_val_reduced),
        (Y_test, X_test_reduced),
    )
)

# Report how many components were kept
print(f"{pca_train.n_components_} components for training")

# Persist each reduced split to its own CSV file (row index is not written)
for _frame, _csv_path in (
    (df_train, 'data/training_bert_reduced.csv'),
    (df_val, 'data/validation_bert_reduced.csv'),
    (df_test, 'data/test_bert_reduced.csv'),
):
    _frame.to_csv(_csv_path, index=False)

100 components for training
