In [118]:
import joblib
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.model_selection import GridSearchCV
import plotly.express as px
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score, precision_score, recall_score

import matplotlib.pyplot as plt
from joblib import dump, load
import os
import plotly.express as px

In [3]:
# Load data
df = pd.read_pickle("../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl")

# Get training data
X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]

# Standardize dataset and add column names
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

In [4]:
print(df['quadrant'].to_numpy()[-8])

4.0


In [6]:
# get only the low and mid level features + segment_id
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)
X

Unnamed: 0,segment_id,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,...,mirtoolbox_hcdf_pct_10,mirtoolbox_hcdf_pct_50,mirtoolbox_hcdf_pct_90,mirtoolbox_irregularity,mirtoolbox_keyclarity,mirtoolbox_mode,mirtoolbox_novelty_mean,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.112351,-1.046724,-1.088041,0.233700,0.987719,-0.446048,-1.464034,-1.557358,-1.487096,-0.263505
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.227565,0.286762,0.239319,1.067306,0.708461,-0.865839,0.124614,0.492786,0.512566,1.554357
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.099883,0.068506,0.286243,0.435576,-0.056105,0.162461,0.781961,1.078765,0.652920,0.121509
3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.893512,-0.773050,0.164728,-0.943409,-0.952889,-0.384661,-1.204703,-1.281274,-1.366754,0.349013
4,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.180523,0.635769,0.243494,1.403570,-1.952294,-0.995785,-0.969958,-1.213569,-0.937755,-0.544921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.274821,2.114835,2.204076,-1.374669,0.967921,-0.105525,0.831505,0.736401,0.667299,0.706086
2278,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.756064,-0.571859,0.692644,1.280540,1.344291,-1.797737,-0.351736,-0.259996,-0.248294,1.529043
2279,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.748569,0.115731,2.404602,-1.355196,1.128581,-1.551523,-0.931308,-0.983595,-0.983935,1.196165
2280,26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.391157,1.379277,1.444383,-1.358872,-1.224290,-0.357799,1.776104,1.771819,2.294032,-0.616783


In [17]:
classes = df['quadrant'][:X.shape[0]].to_numpy()

unique, counts = np.unique(classes, return_counts=True)

probabilities = counts / X.shape[0]
probabilities

array([0.21530886, 0.18979409, 0.28424351, 0.31065354])

In [108]:
# grid search parameters for the different classifiers
# parameters = {'priors': [probabilities]}
# parameters = {'var_smoothing': [0.1, 0.02, 0.01, 1e-3, 1e-4]}
parameters = {}

# get only the low and mid level features + segment_id
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)

# preprocess dataset
# X = pd.DataFrame(X_std, columns=X.columns)
X = X.drop(["segment_id"], axis=1)

# target value
y = df['quadrant'][:X.shape[0]]

X = StandardScaler().fit_transform(X)

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled).reset_index()
X_resampled.insert(0, 'segment_id', df["segment_id"])

# split the data according to segment_id
# store the splits as tuple (train indices, test_indices)
# for example the training indices are the first 26 segments
# and the test_indices is the last segment 27
cv = []
for i in range(25):   # previously good result with 26
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

X_resampled = X_resampled.drop(["segment_id"], axis=1)
X_resampled = X_resampled.drop(["index"], axis=1)

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(GaussianNB(), parameters, cv=cv, n_jobs=10, return_train_score=True)
gs_cv.fit(X_resampled, y_resampled)

score = gs_cv.best_score_
score

0.27883022386006917

In [85]:
gs_cv.best_params_

{'var_smoothing': 0.1}

In [4]:
c_vals = np.linspace(1, 50) / 10
c_vals

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. ])

In [110]:
degrees = np.linspace(1, 10, 10)
degrees

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [154]:
# grid search parameters for the different classifiers
parameters = {
        'C': [0.25, 0.5],
        'kernel': ['rbf'], # ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': degrees.tolist(), # [1, 2, 3, 4, 5, 6, 8, 10],
        'gamma': ['scale', 'auto']}

# get only the low and mid level features + segment_id
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)

# preprocess dataset
# X = pd.DataFrame(X_std, columns=X.columns)
X = X.drop(["segment_id"], axis=1)

# target value
y = df['quadrant'][:X.shape[0]]

X = StandardScaler().fit_transform(X)

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled).reset_index()
X_resampled.insert(0, 'segment_id', df["segment_id"])

# split the data according to segment_id
# store the splits as tuple (train indices, test_indices)
# for example the training indices are the first 26 segments
# and the test_indices is the last segment 27
cv = []
for i in range(25):   # previously good result with 26
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

X_resampled = X_resampled.drop(["segment_id"], axis=1)
X_resampled = X_resampled.drop(["index"], axis=1)

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10, return_train_score=True)
gs_cv.fit(X_resampled, y_resampled)

score = gs_cv.best_score_

In [155]:
score

0.5666440600733982

In [156]:
gs_cv.best_params_

{'C': 0.5, 'degree': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}

In [157]:
with open('evaluation_reduced.pkl', 'rb') as f:
    eval = pickle.load(f)

eval = eval.drop(['segment_id'], axis=1)

In [158]:
# get only the low and mid level features + segment_id
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)

In [159]:
y_eval = df['quadrant'][X.shape[0]:]

In [160]:
y_eval.shape, y_eval

((48,),
 4132    1.0
 4133    1.0
 4134    1.0
 4135    1.0
 4136    1.0
 4137    1.0
 4138    4.0
 4139    4.0
 4140    4.0
 4141    4.0
 4142    4.0
 4143    4.0
 4144    4.0
 4145    4.0
 4146    4.0
 4147    4.0
 4148    4.0
 4149    4.0
 4150    4.0
 4151    4.0
 4152    4.0
 4153    4.0
 4154    4.0
 4155    4.0
 4156    4.0
 4157    4.0
 4158    4.0
 4159    4.0
 4160    4.0
 4161    4.0
 4162    4.0
 4163    4.0
 4164    4.0
 4165    4.0
 4166    4.0
 4167    4.0
 4168    4.0
 4169    4.0
 4170    4.0
 4171    4.0
 4172    4.0
 4173    1.0
 4174    1.0
 4175    1.0
 4176    1.0
 4177    1.0
 4178    1.0
 4179    1.0
 Name: quadrant, dtype: float64)

In [161]:
eval.shape

(48, 158)

In [162]:
y_calc = gs_cv.predict(eval)

accuracy_score(y_eval, y_calc)


X has feature names, but SVC was fitted without feature names



0.0625

In [114]:
# results of the Grid Search CV
cv_results = pd.DataFrame.from_dict(gs_cv.cv_results_)
# cv_results['param_var_smoothing'] = smoothing
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,param_gamma,param_kernel,params,split0_test_score,...,split17_train_score,split18_train_score,split19_train_score,split20_train_score,split21_train_score,split22_train_score,split23_train_score,split24_train_score,mean_train_score,std_train_score
0,0.033189,0.003632,0.002445,0.001313,0.25,1.0,auto,linear,"{'C': 0.25, 'degree': 1.0, 'gamma': 'auto', 'k...",0.809524,...,0.949343,0.938889,0.934823,0.935115,0.936170,0.939623,0.942699,0.942164,0.940448,0.005030
1,0.028199,0.008749,0.003482,0.002812,0.25,1.0,auto,poly,"{'C': 0.25, 'degree': 1.0, 'gamma': 'auto', 'k...",0.738095,...,0.647280,0.637037,0.638734,0.648855,0.644101,0.633962,0.628466,0.632463,0.634696,0.009977
2,0.037086,0.004563,0.004451,0.002422,0.25,1.0,auto,rbf,"{'C': 0.25, 'degree': 1.0, 'gamma': 'auto', 'k...",0.690476,...,0.690432,0.685185,0.689013,0.696565,0.700193,0.679245,0.676525,0.667910,0.685644,0.009685
3,0.032587,0.005152,0.003373,0.002140,0.25,1.0,auto,sigmoid,"{'C': 0.25, 'degree': 1.0, 'gamma': 'auto', 'k...",0.714286,...,0.594747,0.592593,0.605214,0.601145,0.597679,0.583019,0.591497,0.595149,0.593202,0.010027
4,0.034521,0.004744,0.002637,0.001265,0.25,2.0,auto,linear,"{'C': 0.25, 'degree': 2.0, 'gamma': 'auto', 'k...",0.809524,...,0.949343,0.938889,0.934823,0.935115,0.936170,0.939623,0.942699,0.942164,0.940448,0.005030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,0.028680,0.004058,0.002521,0.000882,2,9.0,auto,sigmoid,"{'C': 2, 'degree': 9.0, 'gamma': 'auto', 'kern...",0.690476,...,0.619137,0.609259,0.610801,0.625954,0.628627,0.598113,0.611830,0.608209,0.611557,0.011806
316,0.072914,0.015654,0.002134,0.000816,2,10.0,auto,linear,"{'C': 2, 'degree': 10.0, 'gamma': 'auto', 'ker...",0.833333,...,0.979362,0.977778,0.981378,0.980916,0.978723,0.979245,0.985213,0.986940,0.983359,0.003928
317,0.037249,0.006133,0.002777,0.001277,2,10.0,auto,poly,"{'C': 2, 'degree': 10.0, 'gamma': 'auto', 'ker...",0.666667,...,0.675422,0.670370,0.674115,0.669847,0.669246,0.664151,0.658041,0.652985,0.664309,0.007942
318,0.034272,0.005093,0.003631,0.001626,2,10.0,auto,rbf,"{'C': 2, 'degree': 10.0, 'gamma': 'auto', 'ker...",0.880952,...,0.988743,0.985185,0.985102,0.980916,0.984526,0.984906,0.985213,0.981343,0.983929,0.003028


In [104]:
# plot the train and test error against number of neighbors
fig = px.line(
    cv_results,
    x="param_var_smoothing",
    y=["mean_test_score", "mean_train_score"],
    title='Scores for different smoothing values',
    labels={
        "param_var_smoothing": "Smoothing",
        "mean_test_score": "Mean Test Score",
        "mean_train_score": "Mean Train Score",
        "value": "Accuracy",
        "variable": "data"
    },
    width=800,
    height=400,
)
fig.update_traces(mode="lines")
fig.show()

In [16]:
# plot the train and test error against number of neighbors
fig = px.line(
    cv_results,
    x="param_C",
    y="mean_test_score",
    title="Train and Test Error for an increasing number of neighbors using uniform weights for neighbors",
    labels={
        "param_C": "C",
        "mean_test_score": "Mean Test Score",
        "mean_train_score": "Mean Train Score",
        "value": "Accuracy",
        "variable": "data"
    },
    width=800,
    height=400,
)
fig.update_traces(mode="lines")
fig.show()

In [28]:
# grid search parameters for the different classifiers
parameters = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

# get only the low and mid level features + segment_id
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)

# preprocess dataset
# X = pd.DataFrame(X_std, columns=X.columns)
X = X.drop(["segment_id"], axis=1)

# target value
y = df['quadrant'][:X.shape[0]]

X = StandardScaler().fit_transform(X)

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled).reset_index()
X_resampled.insert(0, 'segment_id', df["segment_id"])

# split the data according to segment_id
# store the splits as tuple (train indices, test_indices)
# for example the training indices are the first 26 segments
# and the test_indices is the last segment 27
cv = []
for i in range(25):   # previously good result with 26
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

X_resampled = X_resampled.drop(["segment_id"], axis=1)
X_resampled = X_resampled.drop(["index"], axis=1)

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10, return_train_score=True)
gs_cv.fit(X_resampled, y_resampled)

score = gs_cv.best_score_

In [None]:
def evaluate_model(model, train_set_x, train_set_y, test_set_x, test_set_y):
    """

    :param model:
    :return:
    """
    train_yhat = model.predict(train_set_x)
    test_yhat = model.predict(test_set_x)

    # errors = abs(test_yhat - y_test)

    print('Model Performance Check:')
    print("***"*3)
    print(' Average Error: {:0.4f}'.format(np.mean(errors)))
    print(' Precission: {:0.4f}'.format(precision_score(test_set_y, test_yhat, average='micro')))
    print(' Recall: {:0.4f}'.format(recall_score(test_set_y, test_yhat, average='micro')))


    print("\nModel Accuracy Check:")
    print("***"*3)
    print(" Trainset Accuracy: {:0.4f}".format(accuracy_score(train_set_y, train_yhat)))
    print(" Testset Accuracy: {:0.4f}".format(accuracy_score(test_set_y, test_yhat)))