# ...

### Imports

In [38]:
import joblib
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.model_selection import GridSearchCV
import plotly.express as px
from imblearn.combine import SMOTEENN

import matplotlib.pyplot as plt
from joblib import dump, load
import os

### Data

In [41]:
# Load data
df = pd.read_pickle("../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl")

df

Unnamed: 0,pianist_id,segment_id,snippet_id,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,...,gems_peacefulness_binary,gems_power_binary,gems_joyful_activation_binary,gems_tension_binary,gems_sadness_binary,gemmes_movement_binary,gemmes_force_binary,gemmes_interior_binary,gemmes_wandering_binary,gemmes_flow_binary
0,1,0,0,0.192237,0.059404,2.040252,16079768.0,0.800000,0.677256,0.095846,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1,0,1,0.143425,0.064204,3.138845,35489248.0,2.600000,0.514913,0.092372,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,1,0,2,0.146967,0.056205,2.019706,42130144.0,2.600000,0.560116,0.115359,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,1,0,3,0.158810,0.059129,3.567908,40922732.0,1.400000,0.596779,0.116061,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1,0,4,0.168547,0.049648,2.329854,51921612.0,1.600000,0.556160,0.143420,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4175,11,26,2,0.202177,0.050760,2.840705,51977460.0,1.800000,0.688834,0.090235,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4176,11,26,3,0.174454,0.063426,5.202462,9471357.0,3.200000,0.661253,0.107627,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4177,11,26,4,0.161152,0.074724,4.605277,7378860.5,2.600000,0.645111,0.093705,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4178,11,26,5,0.192846,0.056795,2.563405,22641664.0,3.200000,0.680604,0.094989,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [42]:
# Get training data
X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]

# Standardize dataset and add column names
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

X

Unnamed: 0,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,essentia_spectral_centroid_mean,essentia_spectral_centroid_stdev,essentia_spectral_complexity_mean,...,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90
0,0.074674,-0.208634,-0.502540,-0.624442,-0.967537,0.973840,-0.271925,0.821937,3.705908,1.435001,...,-1.557358,-0.132523,-0.953893,-1.487096,-0.263505,0.037973,-0.066389,0.144988,0.050310,0.011099
1,-1.003098,0.364099,0.130841,-0.318021,0.019897,-2.043147,-0.449124,0.322515,0.242491,-1.242223,...,0.492786,-0.132523,-0.489550,0.512566,1.554357,-1.472847,-1.163010,-1.209892,-1.459551,-1.544004
2,-0.924909,-0.590335,-0.514385,-0.213180,0.019897,-1.203086,0.723664,0.060573,-0.316268,-0.968771,...,1.078765,-0.132523,0.217391,0.652920,0.121509,-1.109245,-0.264529,-1.293552,-1.025679,-1.039652
3,-0.663400,-0.241471,0.378212,-0.232242,-0.638392,-0.521754,0.759491,0.142514,0.058750,0.156288,...,-1.281274,-0.112848,-0.826452,-1.366754,0.349013,-1.147165,-0.177751,-1.201103,-1.137358,-0.971610
4,-0.448402,-1.372774,-0.335574,-0.058600,-0.528677,-1.276604,2.155321,-0.075302,-0.236369,0.179727,...,-1.213569,-0.132523,-0.730973,-0.937755,-0.544921,-1.144299,-0.342477,-1.183691,-1.078803,-1.029728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,0.294153,-1.240041,-0.041048,-0.057718,-0.418962,1.188996,-0.558156,-0.524739,-0.573189,0.705796,...,0.736401,-0.132523,0.317939,0.667299,0.706086,0.785422,0.874634,0.562600,0.745939,0.859051
2278,-0.317970,0.271249,1.320595,-0.728771,0.349042,0.676441,0.329168,-0.644184,-0.031043,1.216241,...,-0.259996,-0.132523,-0.433302,-0.248294,1.529043,-0.054989,0.351260,-0.225325,0.058192,-0.052252
2279,-0.611698,1.619394,0.976295,-0.761806,0.019897,0.376446,-0.381145,-0.604478,-0.075596,0.614646,...,-0.983595,-0.132523,-0.578879,-0.983935,1.196165,-0.608549,0.667009,-0.678009,-0.780047,-0.175740
2280,0.088125,-0.520028,-0.200922,-0.520848,0.349042,1.036064,-0.315627,-1.000535,-0.315062,0.750070,...,1.771819,-0.132523,1.398023,2.294032,-0.616783,0.639713,0.749981,0.552866,0.557939,0.731791


### Experiments

In [43]:
hl_feat = ["gems_wonder_binary", "gems_transcendence_binary", "gems_tenderness_binary", "gems_nostalgia_binary", "gems_peacefulness_binary",
           "gems_power_binary", "gems_joyful_activation_binary", "gems_tension_binary", "gems_sadness_binary", "gemmes_movement_binary",
           "gemmes_force_binary", "gemmes_interior_binary", "gemmes_wandering_binary", 'gemmes_flow_binary']

In [49]:
# grid search parameters for the different classifiers
parameters = {
        'C': [0.5, 1, 1.25],
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'degree': [3, 5, 6, 8, 10],
        'gamma': ['scale', 'auto']}

# load the data and reset index of dataframe
df: pd.DataFrame = pd.read_pickle(
    "../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl").reset_index()

models = []

for feature in hl_feat:
    # get only the low and mid level features + segment_id
    X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]

    # preprocess dataset
    X_std = StandardScaler().fit_transform(X)
    X = pd.DataFrame(X_std, columns=X.columns)
    X["segment_id"] = df["segment_id"]

    # target value
    y = df[feature]

    # smote_enn = SMOTEENN(random_state=0)
    # X_resampled, y_resampled = smote_enn.fit_resample(X, y)

    # split the data according to segment_id
    # store the splits as tuple (train indices, test_indices)
    # for example the training indices are the first 26 segments
    # and the test_indices is the last segment 27
    cv = []
    for i in range(26):
        train_indices = X[~X["segment_id"].isin([i, i + 1])].index.to_list()
        test_indices = X[X["segment_id"].isin([i, i + 1])].index.to_list()
        cv.append((train_indices, test_indices))

    X = X.drop(["segment_id"], axis=1)

    # grid search the parameters for a given classifier
    gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10)
    gs_cv.fit(X, y)

    score = gs_cv.best_score_
    print(f"{feature} - SVM {score: >5}")
    models.append(gs_cv)

gems_wonder_binary - SVM 0.5517888252221723
gems_transcendence_binary - SVM 0.7196152255038567
gems_tenderness_binary - SVM 0.7501231230192364
gems_nostalgia_binary - SVM 0.7589490514924874
gems_peacefulness_binary - SVM 0.7983150124149142
gems_power_binary - SVM 0.816436213392014
gems_joyful_activation_binary - SVM 0.8530225900352119
gems_tension_binary - SVM 0.7898296567621872
gems_sadness_binary - SVM 0.8866999196828652
gemmes_movement_binary - SVM 0.7584048108301356
gemmes_force_binary - SVM 0.8339859264057593
gemmes_interior_binary - SVM 0.7558733097299519
gemmes_wandering_binary - SVM 0.7384527636142469
gemmes_flow_binary - SVM 0.7625774148062763


In [50]:
scores = [model.best_score_ for model in models]
np.mean(scores[:])

0.769576703065094

In [51]:
for model in models:
    print(model.best_params_)

{'C': 0.5, 'degree': 8, 'gamma': 'auto', 'kernel': 'poly'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 0.5, 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 1.25, 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 1.25, 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid'}


In [48]:
for i, model in enumerate(models):
    dump(model, os.path.join('models', hl_feat[i]))

In [52]:
from joblib import dump, load
import os

for i, model in enumerate(models):
    old_val = load(os.path.join('models', hl_feat[i])).best_score_
    new_val = model.best_score_

    if new_val > old_val:
        dump(model, os.path.join('models', hl_feat[i]))

In [9]:
vals = [load(os.path.join('models', feat)).best_score_ for feat in hl_feat]
np.mean(vals)

0.7930761707742645

# ölakdnffdklgklqngqlerknfü

In [10]:
load('models/gemmes_interior_binary').best_params_

{'C': 0.5, 'degree': 3, 'gamma': 'scale', 'kernel': 'sigmoid'}

In [53]:
X

Unnamed: 0,essentia_dissonance_mean,essentia_dissonance_stdev,essentia_dynamic_complexity,essentia_loudness,essentia_onset_rate,essentia_pitch_salience_mean,essentia_pitch_salience_stdev,essentia_spectral_centroid_mean,essentia_spectral_centroid_stdev,essentia_spectral_complexity_mean,...,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90
0,0.074674,-0.208634,-0.502540,-0.624442,-0.967537,0.973840,-0.271925,0.821937,3.705908,1.435001,...,-1.557358,-0.132523,-0.953893,-1.487096,-0.263505,0.037973,-0.066389,0.144988,0.050310,0.011099
1,-1.003098,0.364099,0.130841,-0.318021,0.019897,-2.043147,-0.449124,0.322515,0.242491,-1.242223,...,0.492786,-0.132523,-0.489550,0.512566,1.554357,-1.472847,-1.163010,-1.209892,-1.459551,-1.544004
2,-0.924909,-0.590335,-0.514385,-0.213180,0.019897,-1.203086,0.723664,0.060573,-0.316268,-0.968771,...,1.078765,-0.132523,0.217391,0.652920,0.121509,-1.109245,-0.264529,-1.293552,-1.025679,-1.039652
3,-0.663400,-0.241471,0.378212,-0.232242,-0.638392,-0.521754,0.759491,0.142514,0.058750,0.156288,...,-1.281274,-0.112848,-0.826452,-1.366754,0.349013,-1.147165,-0.177751,-1.201103,-1.137358,-0.971610
4,-0.448402,-1.372774,-0.335574,-0.058600,-0.528677,-1.276604,2.155321,-0.075302,-0.236369,0.179727,...,-1.213569,-0.132523,-0.730973,-0.937755,-0.544921,-1.144299,-0.342477,-1.183691,-1.078803,-1.029728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,0.294153,-1.240041,-0.041048,-0.057718,-0.418962,1.188996,-0.558156,-0.524739,-0.573189,0.705796,...,0.736401,-0.132523,0.317939,0.667299,0.706086,0.785422,0.874634,0.562600,0.745939,0.859051
2278,-0.317970,0.271249,1.320595,-0.728771,0.349042,0.676441,0.329168,-0.644184,-0.031043,1.216241,...,-0.259996,-0.132523,-0.433302,-0.248294,1.529043,-0.054989,0.351260,-0.225325,0.058192,-0.052252
2279,-0.611698,1.619394,0.976295,-0.761806,0.019897,0.376446,-0.381145,-0.604478,-0.075596,0.614646,...,-0.983595,-0.132523,-0.578879,-0.983935,1.196165,-0.608549,0.667009,-0.678009,-0.780047,-0.175740
2280,0.088125,-0.520028,-0.200922,-0.520848,0.349042,1.036064,-0.315627,-1.000535,-0.315062,0.750070,...,1.771819,-0.132523,1.398023,2.294032,-0.616783,0.639713,0.749981,0.552866,0.557939,0.731791


In [54]:
training_data = X.copy()
print(training_data)

for model, feature in zip(models, hl_feat):
     pred = model.predict(X)
     training_data.insert(0, feature, pred)

training_data

      essentia_dissonance_mean  essentia_dissonance_stdev  \
0                     0.074674                  -0.208634   
1                    -1.003098                   0.364099   
2                    -0.924909                  -0.590335   
3                    -0.663400                  -0.241471   
4                    -0.448402                  -1.372774   
...                        ...                        ...   
2277                  0.294153                  -1.240041   
2278                 -0.317970                   0.271249   
2279                 -0.611698                   1.619394   
2280                  0.088125                  -0.520028   
2281                  0.435541                   3.126546   

      essentia_dynamic_complexity  essentia_loudness  essentia_onset_rate  \
0                       -0.502540          -0.624442            -0.967537   
1                        0.130841          -0.318021             0.019897   
2                       -0.514385   

Unnamed: 0,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,gems_peacefulness_binary,...,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.557358,-0.132523,-0.953893,-1.487096,-0.263505,0.037973,-0.066389,0.144988,0.050310,0.011099
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.492786,-0.132523,-0.489550,0.512566,1.554357,-1.472847,-1.163010,-1.209892,-1.459551,-1.544004
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.078765,-0.132523,0.217391,0.652920,0.121509,-1.109245,-0.264529,-1.293552,-1.025679,-1.039652
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.281274,-0.112848,-0.826452,-1.366754,0.349013,-1.147165,-0.177751,-1.201103,-1.137358,-0.971610
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-1.213569,-0.132523,-0.730973,-0.937755,-0.544921,-1.144299,-0.342477,-1.183691,-1.078803,-1.029728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.736401,-0.132523,0.317939,0.667299,0.706086,0.785422,0.874634,0.562600,0.745939,0.859051
2278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.259996,-0.132523,-0.433302,-0.248294,1.529043,-0.054989,0.351260,-0.225325,0.058192,-0.052252
2279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.983595,-0.132523,-0.578879,-0.983935,1.196165,-0.608549,0.667009,-0.678009,-0.780047,-0.175740
2280,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.771819,-0.132523,1.398023,2.294032,-0.616783,0.639713,0.749981,0.552866,0.557939,0.731791


In [55]:
with open('train_data_pred_features.pkl', 'wb') as f:
    pickle.dump(training_data, f)

# Section

In [14]:
# target value
y = df['quadrant']

# Decision Tree
parameters =     {
        'n_neighbors': [3, 6, 10, 15, 25, 30],
        'weights': ['uniform', 'distance'],
        'leaf_size': [10, 15, 20, 30, 40, 50]
    },

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(KNeighborsClassifier(), parameters, cv=cv, n_jobs=10)

# Original data
gs_cv.fit(X, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)

# Additional Features
gs_cv.fit(training_data, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)


0.5446744868939188
{'leaf_size': 10, 'n_neighbors': 25, 'weights': 'distance'}
0.5493903208580561
{'leaf_size': 10, 'n_neighbors': 15, 'weights': 'distance'}


In [15]:
# target value
y = df['quadrant']

# Decision Tree
parameters = {'max_iter': [100, 250, 600, 1000],
              'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
              }

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(LogisticRegression(),
                     parameters,
                     cv=cv,
                     n_jobs=10)

# Original data
gs_cv.fit(X, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)

# Additional Features
gs_cv.fit(training_data, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)



0.4634787264514396
{'max_iter': 100, 'solver': 'saga'}
0.5139943461897007
{'max_iter': 100, 'solver': 'saga'}




In [18]:
pca = KernelPCA(n_components=4, kernel='linear').fit_transform(training_data)
fig = px.scatter(x=pca[:, 0], y=pca[:, 1], color=y)
fig.show()

In [20]:
# target value
y = df['quadrant']

# Decision Tree
parameters =     {},

# grid search the parameters for a given classifier
# gs_cv = GridSearchCV(DecisionTreeClassifier(), parameters, cv=cv, n_jobs=10)
gs_cv = GridSearchCV(KNeighborsClassifier(), parameters, cv=cv, n_jobs=10)

# PCA data
gs_cv.fit(pca, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)

0.48574865901514475
{}


In [22]:
ica = FastICA(n_components=4).fit_transform(training_data)
fig = px.scatter(x=ica[:, 0], y=ica[:, 1], color=y)
fig.show()

In [24]:
# target value
y = df['quadrant']

# Decision Tree
parameters =     {
        'n_neighbors': [3, 6, 10, 15, 25, 30],
        'weights': ['uniform', 'distance'],
        'leaf_size': [10, 15, 20, 30, 40, 50]
    },

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(KNeighborsClassifier(), parameters, cv=cv, n_jobs=10)

# PCA data
gs_cv.fit(ica, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)

0.49649267004439757
{'leaf_size': 10, 'n_neighbors': 25, 'weights': 'distance'}


In [26]:
# target value
y = df['quadrant']

# Decision Tree
parameters = {}

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10)

# PCA data
gs_cv.fit(training_data, y)

print(gs_cv.best_score_)
print(gs_cv.best_params_)

0.5501915871347425
{}


# SVM

In [34]:
# grid search parameters for the different classifiers
parameters = {
        'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [1, 2, 3, 4, 5, 6, 8, 10],
        'gamma': ['scale', 'auto']}

# load the data and reset index of dataframe
df: pd.DataFrame = pd.read_pickle(
    "../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl").reset_index()

# get only the low and mid level features + segment_id
X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]

# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)
X["segment_id"] = df["segment_id"]

# target value
y = df['quadrant']

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# split the data according to segment_id
# store the splits as tuple (train indices, test_indices)
# for example the training indices are the first 26 segments
# and the test_indices is the last segment 27
cv = []
for i in range(26):
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

X = X.drop(["segment_id"], axis=1)

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10)
gs_cv.fit(X_resampled, y_resampled)

score = gs_cv.best_score_

In [35]:
score, gs_cv.best_params_

(0.5347839775021135,
 {'C': 1.75, 'degree': 2, 'gamma': 'auto', 'kernel': 'poly'})

In [36]:
# grid search parameters for the different classifiers
parameters = {
        'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5],
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'degree': [1, 2, 3, 4, 5, 6, 8, 10],
        'gamma': ['scale', 'auto']}

# load the data and reset index of dataframe
df: pd.DataFrame = pd.read_pickle(
    "../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl").reset_index()

# get only the low and mid level features + segment_id
with open('train_data_pred_features.pkl', 'rb') as f:
    X = pickle.load(f)

# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)
X["segment_id"] = df["segment_id"]

# target value
y = df['quadrant']

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# split the data according to segment_id
# store the splits as tuple (train indices, test_indices)
# for example the training indices are the first 26 segments
# and the test_indices is the last segment 27
cv = []
for i in range(26):
    train_indices = X_resampled[~X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    test_indices = X_resampled[X_resampled["segment_id"].isin([i, i + 1])].index.to_list()
    cv.append((train_indices, test_indices))

X = X.drop(["segment_id"], axis=1)

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10)
gs_cv.fit(X_resampled, y_resampled)

score = gs_cv.best_score_

In [37]:
score, gs_cv.best_params_

(0.5507516757755186,
 {'C': 1.75, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf'})

In [39]:
dump(gs_cv, 'SVM_model')

['SVM_model']

In [40]:
gs_cv.best_params_

{'C': 1.75, 'degree': 1, 'gamma': 'auto', 'kernel': 'rbf'}

# Train-Test and Evaluation Split

In [57]:
df_pred = pd.read_pickle("train_data_pred_features.pkl").reset_index()
df_pred["segment_id"] = df["segment_id"]
df_pred

Unnamed: 0,index,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,...,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90,segment_id
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.953893,-1.487096,-0.263505,0.037973,-0.066389,0.144988,0.050310,0.011099,0
1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.489550,0.512566,1.554357,-1.472847,-1.163010,-1.209892,-1.459551,-1.544004,0
2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.217391,0.652920,0.121509,-1.109245,-0.264529,-1.293552,-1.025679,-1.039652,0
3,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.112848,-0.826452,-1.366754,0.349013,-1.147165,-0.177751,-1.201103,-1.137358,-0.971610,0
4,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.730973,-0.937755,-0.544921,-1.144299,-0.342477,-1.183691,-1.078803,-1.029728,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,2277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.317939,0.667299,0.706086,0.785422,0.874634,0.562600,0.745939,0.859051,26
2278,2278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.433302,-0.248294,1.529043,-0.054989,0.351260,-0.225325,0.058192,-0.052252,26
2279,2279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.578879,-0.983935,1.196165,-0.608549,0.667009,-0.678009,-0.780047,-0.175740,26
2280,2280,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,1.398023,2.294032,-0.616783,0.639713,0.749981,0.552866,0.557939,0.731791,26


In [59]:
df_train_test = df_pred[df['segment_id'] != 26]
df_train_test

Unnamed: 0,index,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,...,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90,segment_id
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.953893,-1.487096,-0.263505,0.037973,-0.066389,0.144988,0.050310,0.011099,0
1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.489550,0.512566,1.554357,-1.472847,-1.163010,-1.209892,-1.459551,-1.544004,0
2,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.217391,0.652920,0.121509,-1.109245,-0.264529,-1.293552,-1.025679,-1.039652,0
3,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.112848,-0.826452,-1.366754,0.349013,-1.147165,-0.177751,-1.201103,-1.137358,-0.971610,0
4,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.730973,-0.937755,-0.544921,-1.144299,-0.342477,-1.183691,-1.078803,-1.029728,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2270,2270,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.941338,-1.611833,-0.591424,-0.914903,-0.757005,-0.814934,-0.913750,-1.014191,25
2271,2271,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.117626,-0.452063,-0.647481,-0.135331,-0.006219,-0.143368,-0.141395,-0.060960,25
2272,2272,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.899636,-1.233907,-0.879324,-0.743451,-0.452173,-0.651969,-0.734084,-0.779635,25
2273,2273,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.013679,1.141670,0.114933,0.003332,-0.158253,0.183755,-0.070832,0.011211,25


In [60]:
df_eval = df_pred[df['segment_id'] == 26]
df_eval

Unnamed: 0,index,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,...,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90,segment_id
191,191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.132523,0.043800,-0.165630,-0.922844,0.120093,-0.254625,0.218330,0.215747,-0.079625,26
192,192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.598988,1.913592,-0.369044,0.987977,0.929302,0.795709,0.880398,1.214556,26
193,193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.519669,-0.837843,0.529876,0.281344,0.211785,0.365695,0.283518,0.349306,26
194,194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.963262,-0.721168,0.602039,-0.711530,0.013148,-0.903063,-0.595759,-0.660937,26
195,195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-1.000637,-0.831929,-1.169637,-0.499023,0.134807,-0.501288,-0.583551,-0.223332,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,2277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.317939,0.667299,0.706086,0.785422,0.874634,0.562600,0.745939,0.859051,26
2278,2278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.433302,-0.248294,1.529043,-0.054989,0.351260,-0.225325,0.058192,-0.052252,26
2279,2279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.578879,-0.983935,1.196165,-0.608549,0.667009,-0.678009,-0.780047,-0.175740,26
2280,2280,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,1.398023,2.294032,-0.616783,0.639713,0.749981,0.552866,0.557939,0.731791,26


In [61]:
with open('pred_train_test.pkl', 'wb') as f:
    pickle.dump(df_train_test, f)
    
with open('evaluation.pkl', 'wb') as f:
    pickle.dump(df_eval, f)