In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from util.pca import perform_pca
from util.k_folds_cross_val import k_folds_x_val, get_cmat, get_metrics
from util.hpo import gridSearchHPO
from util.forward_select import forward_select, forward_select_and_fit

from sklearn import tree   # Decision Trees
from sklearn import metrics
import sklearn as skl

# Data Loading

In [3]:
NUM_CLASSES = 6
CLASSES = ["sadnesss", "joy", "love", "anger", "fear"]

# Load all data
train_data = pd.read_csv("data/training_char.csv")
test_data = pd.read_csv("data/test_char.csv")
validation_data = pd.read_csv("data/validation_char.csv")

# Separate X's and y's from each other
FEATURE_COLUMNS_TRAIN = [x for x in train_data if x.startswith("_e")]
FEATURE_COLUMNS_TEST = [x for x in test_data if x.startswith("_e")]
LABEL_COLUMN = "label"

In [4]:
X_train = train_data[FEATURE_COLUMNS_TRAIN]
Y_train = train_data[LABEL_COLUMN]

X_test = test_data[FEATURE_COLUMNS_TEST]
Y_test = test_data[LABEL_COLUMN]

X_val = validation_data[FEATURE_COLUMNS_TEST]
Y_val = validation_data[LABEL_COLUMN]

# These are used to run cross validation
X_train_val = pd.concat([X_train, X_val]) 
Y_train_val = pd.concat([Y_train, Y_test])

# These are used to run val and test for Neural Nets
X_val_test = pd.concat([X_val, X_test])
Y_val_test = pd.concat([Y_val, Y_test])

In [8]:
print(X_train)

       _e0   _e1   _e2   _e3   _e4   _e5   _e6  _e7   _e8  _e9  ...  _e15990  \
0      9.0   9.0   9.0   9.0   9.0   9.0   9.0  9.0   9.0  9.0  ...      9.0   
1      0.0   0.0  13.0   0.0   0.0  22.0  22.0  0.0   0.0  0.0  ...      0.0   
2      4.0   3.0   0.0   1.0   1.0   5.0   5.0  6.0   8.0  6.0  ...      6.0   
3      9.0   1.0   7.0  13.0  13.0   0.0   0.0  5.0   1.0  5.0  ...      5.0   
4      4.0  14.0  18.0   0.0   0.0   2.0   2.0  5.0  22.0  5.0  ...      5.0   
...    ...   ...   ...   ...   ...   ...   ...  ...   ...  ...  ...      ...   
15995  NaN   NaN   NaN   NaN   NaN   NaN   NaN  NaN   NaN  NaN  ...      NaN   
15996  NaN   NaN   NaN   NaN   NaN   NaN   NaN  NaN   NaN  NaN  ...      NaN   
15997  NaN   NaN   NaN   NaN   NaN   NaN   NaN  NaN   NaN  NaN  ...      NaN   
15998  NaN   NaN   NaN   NaN   NaN   NaN   NaN  NaN   NaN  NaN  ...      NaN   
15999  NaN   NaN   NaN   NaN   NaN   NaN   NaN  NaN   NaN  NaN  ...      NaN   

       _e15991  _e15992  _e15993  _e159

# Principal Component Analysis (PCA)

In [7]:
TARGET_EXPLAINED_VARIANCE = 0.95

pca_train, X_train_reduced = perform_pca(X_train, TARGET_EXPLAINED_VARIANCE)
X_val_reduced = pca_train.transform(X_val)
X_test_redced = pca_train.transform(X_test)
X_train_val_reduced = pca_train.transform(X_train_val)
X_val_test_reduced = pca_train.transform(X_val_test)

print(f"{pca_train.n_components_} components for training")

ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

X_train_unreduced = X_train.to_numpy()
X_val_unreduced = X_val.to_numpy()
X_test_unreduced = X_test.to_numpy()
X_train_val_unreduced = X_train_val.to_numpy()
X_val_test_unreduced = X_val_test.to_numpy()

# Decision Tree Classifier

In [None]:
decision_tree_model = tree.DecisionTreeClassifier(
    criterion ='entropy', 
    splitter = 'best',
    max_depth = 256, 
    max_features = 'sqrt',
    max_leaf_nodes = 80,
)

# Grid Search Hyperparameter Optimization

In [None]:
dt_search_space = {
    # 'ccp_alpha':[0.1, 0.2, 0.4, 0.5],
    # 'min_impurity_decrease':[1.0, 0.5, 1.5, 2.0], # float
    # 'min_weight_fraction_leaf':[0.1, 0.2, 0.4, 0.5],
    # 'min_samples_leaf':[0.1, 0.2, 0.4, 0.5], 
    # 'min_samples_split':[0.1, 0.2, 0.4, 0.5],
    # 'class_weight': [],
    # 'random_state': []
    }

model_dt = decision_tree_model.fit(X_train, Y_train)

gridsearch_dt = gridSearchHPO(model=model_dt, search_space=dt_search_space)

In [None]:
gridsearch_dt.fit(X_train, Y_train)

In [None]:
print("Best Score: {}".format(gridsearch_dt.best_score_))
print("Best params: {}".format(gridsearch_dt.best_params_))

# Feature Select

In [None]:
decision_tree_model_reduced : tree.DecisionTreeClassifier = skl.base.clone(decision_tree_model)
Xt = forward_select(decision_tree_model_reduced, X_train_val_used, Y_train_val, 30)