In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.combine import SMOTEENN
import plotly.express as px
import pickle

In [3]:
# load the data and reset index of dataframe
df: pd.DataFrame = pd.read_pickle(
    "../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl").reset_index()

# get only the low and mid level features + segment_id
X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]
# target value
y = df["quadrant"]

# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

In [4]:
# get features that have a correlation above some threshold
from itertools import combinations

corr_df = X.corr()

feature_combis = combinations(corr_df.columns, 2)
n_combis = len(list(combinations(corr_df.columns, 2)))

corr_features = []

for f1, f2 in feature_combis:
    corr = corr_df[f1].loc[f2]

    # strong
    if 0.9 <= abs(corr):
        corr_features.append(dict([("corr", corr), ("feature 1", f1), ("feature 2", f2)]))

corr_features_df = pd.DataFrame(corr_features)

In [5]:
corr_features_df

Unnamed: 0,corr,feature 1,feature 2
0,0.943627,essentia_strong_peak_mean,essentia_strong_peak_stdev
1,0.924393,librosa_chroma_mean_0,librosa_chroma_pct_50_0
2,0.910532,librosa_chroma_std_0,librosa_chroma_pct_90_0
3,0.956513,librosa_chroma_mean_1,librosa_chroma_pct_50_1
4,0.946475,librosa_chroma_mean_2,librosa_chroma_pct_50_2
...,...,...,...
86,0.941138,mirtoolbox_roughness_mean,mirtoolbox_roughness_pct_10
87,0.995234,mirtoolbox_roughness_mean,mirtoolbox_roughness_pct_50
88,0.960188,mirtoolbox_roughness_mean,mirtoolbox_roughness_pct_90
89,0.938342,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50


In [8]:
highest_correlation = corr_features_df[np.abs(corr_features_df['corr']) > 0.96]
redundant_features = highest_correlation['feature 1'].unique()

In [24]:
redundant_features = list(redundant_features)
redundant_features

['librosa_mfcc_mean_0',
 'librosa_mfcc_pct_10_0',
 'librosa_mfcc_pct_50_0',
 'librosa_mfcc_mean_1',
 'librosa_mfcc_mean_2',
 'librosa_mfcc_mean_3',
 'librosa_mfcc_mean_4',
 'librosa_mfcc_mean_5',
 'librosa_mfcc_mean_6',
 'librosa_mfcc_mean_7',
 'librosa_mfcc_mean_8',
 'librosa_mfcc_mean_9',
 'librosa_mfcc_mean_10',
 'librosa_mfcc_mean_11',
 'mirtoolbox_dynamics_mean',
 'mirtoolbox_dynamics_pct_50',
 'mirtoolbox_roughness_mean']

In [15]:
# load the data and reset index of dataframe
df: pd.DataFrame = pd.read_pickle(
    "../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl").reset_index()

# get only the low and mid level features + segment_id
X_nn = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]
X_var = X_nn.var()

In [25]:
lowest_var = X_var[X_var < 1e-3].index
lowest_var = list(lowest_var)
lowest_var

['essentia_dissonance_stdev',
 'essentia_pitch_salience_stdev',
 'mirtoolbox_novelty_pct_10',
 'mirtoolbox_novelty_pct_50',
 'mirtoolbox_roughness_mean',
 'mirtoolbox_roughness_std',
 'mirtoolbox_roughness_pct_10',
 'mirtoolbox_roughness_pct_50',
 'mirtoolbox_roughness_pct_90']

In [52]:
for low_var in lowest_var:
    redundant_features.append(low_var)
redundant_features

['librosa_mfcc_mean_0',
 'librosa_mfcc_pct_10_0',
 'librosa_mfcc_pct_50_0',
 'librosa_mfcc_mean_1',
 'librosa_mfcc_mean_2',
 'librosa_mfcc_mean_3',
 'librosa_mfcc_mean_4',
 'librosa_mfcc_mean_5',
 'librosa_mfcc_mean_6',
 'librosa_mfcc_mean_7',
 'librosa_mfcc_mean_8',
 'librosa_mfcc_mean_9',
 'librosa_mfcc_mean_10',
 'librosa_mfcc_mean_11',
 'mirtoolbox_dynamics_mean',
 'mirtoolbox_dynamics_pct_50',
 'mirtoolbox_roughness_mean',
 'essentia_dissonance_stdev',
 'essentia_pitch_salience_stdev',
 'mirtoolbox_novelty_pct_10',
 'mirtoolbox_novelty_pct_50',
 'mirtoolbox_roughness_mean',
 'mirtoolbox_roughness_std',
 'mirtoolbox_roughness_pct_10',
 'mirtoolbox_roughness_pct_50',
 'mirtoolbox_roughness_pct_90',
 'essentia_dissonance_stdev',
 'essentia_pitch_salience_stdev',
 'mirtoolbox_novelty_pct_10',
 'mirtoolbox_novelty_pct_50',
 'mirtoolbox_roughness_mean',
 'mirtoolbox_roughness_std',
 'mirtoolbox_roughness_pct_10',
 'mirtoolbox_roughness_pct_50',
 'mirtoolbox_roughness_pct_90',
 'essentia

In [54]:
with open('pred_train_test.pkl', 'rb') as f:
    train_test = pickle.load(f)

In [55]:
with open('evaluation.pkl', 'rb') as f:
    eval = pickle.load(f)

In [58]:
train_test

Unnamed: 0,segment_id,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,...,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.557358,-0.132523,-0.953893,-1.487096,-0.263505,0.037973,-0.066389,0.144988,0.050310,0.011099
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.492786,-0.132523,-0.489550,0.512566,1.554357,-1.472847,-1.163010,-1.209892,-1.459551,-1.544004
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.078765,-0.132523,0.217391,0.652920,0.121509,-1.109245,-0.264529,-1.293552,-1.025679,-1.039652
3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.281274,-0.112848,-0.826452,-1.366754,0.349013,-1.147165,-0.177751,-1.201103,-1.137358,-0.971610
4,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.213569,-0.132523,-0.730973,-0.937755,-0.544921,-1.144299,-0.342477,-1.183691,-1.078803,-1.029728
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.736401,-0.132523,0.317939,0.667299,0.706086,0.785422,0.874634,0.562600,0.745939,0.859051
2278,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.259996,-0.132523,-0.433302,-0.248294,1.529043,-0.054989,0.351260,-0.225325,0.058192,-0.052252
2279,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.983595,-0.132523,-0.578879,-0.983935,1.196165,-0.608549,0.667009,-0.678009,-0.780047,-0.175740
2280,26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.771819,-0.132523,1.398023,2.294032,-0.616783,0.639713,0.749981,0.552866,0.557939,0.731791


In [56]:
eval

Unnamed: 0,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,gems_peacefulness_binary,...,mirtoolbox_novelty_pct_10,mirtoolbox_novelty_pct_50,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,mirtoolbox_roughness_mean,mirtoolbox_roughness_std,mirtoolbox_roughness_pct_10,mirtoolbox_roughness_pct_50,mirtoolbox_roughness_pct_90,segment_id
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.132523,0.0438,-0.16563,-0.922844,0.120093,-0.254625,0.21833,0.215747,-0.079625,26.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.598988,1.913592,-0.369044,0.987977,0.929302,0.795709,0.880398,1.214556,26.0
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.519669,-0.837843,0.529876,0.281344,0.211785,0.365695,0.283518,0.349306,26.0
194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-0.963262,-0.721168,0.602039,-0.71153,0.013148,-0.903063,-0.595759,-0.660937,26.0
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-1.000637,-0.831929,-1.169637,-0.499023,0.134807,-0.501288,-0.583551,-0.223332,26.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,0.949306,1.467228,-0.744736,0.596029,0.500324,0.570783,0.575698,0.585969,26.0
197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.132523,-1.014504,-1.249386,0.493764,0.182183,1.8199,-0.406362,0.224012,0.563949,26.0
582,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.132523,-0.083772,0.835315,-0.930093,0.373541,-0.141661,0.430871,0.38417,0.197404,26.0
583,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.828146,0.12637,-0.064016,-0.967107,1.08907,1.20295,0.844835,0.978659,1.250045,26.0
584,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.132523,-0.505547,-0.598243,0.209728,0.323132,2.36257,-0.412841,0.151978,1.061118,26.0


In [66]:
train_test_reduced = train_test.copy()
train_test_reduced.columns = train_test.columns
eval_reduced = eval.copy()
eval_reduced.columns = eval.columns

for feat in redundant_features:
    if feat in train_test_reduced.columns:
        train_test_reduced = train_test_reduced.drop([feat], axis=1)
    if feat in eval_reduced.columns:
        eval_reduced = eval_reduced.drop([feat], axis=1)

In [67]:
train_test_reduced

Unnamed: 0,segment_id,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,...,mirtoolbox_hcdf_pct_10,mirtoolbox_hcdf_pct_50,mirtoolbox_hcdf_pct_90,mirtoolbox_irregularity,mirtoolbox_keyclarity,mirtoolbox_mode,mirtoolbox_novelty_mean,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.112351,-1.046724,-1.088041,0.233700,0.987719,-0.446048,-1.464034,-1.557358,-1.487096,-0.263505
1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.227565,0.286762,0.239319,1.067306,0.708461,-0.865839,0.124614,0.492786,0.512566,1.554357
2,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.099883,0.068506,0.286243,0.435576,-0.056105,0.162461,0.781961,1.078765,0.652920,0.121509
3,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.893512,-0.773050,0.164728,-0.943409,-0.952889,-0.384661,-1.204703,-1.281274,-1.366754,0.349013
4,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.180523,0.635769,0.243494,1.403570,-1.952294,-0.995785,-0.969958,-1.213569,-0.937755,-0.544921
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.274821,2.114835,2.204076,-1.374669,0.967921,-0.105525,0.831505,0.736401,0.667299,0.706086
2278,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.756064,-0.571859,0.692644,1.280540,1.344291,-1.797737,-0.351736,-0.259996,-0.248294,1.529043
2279,26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.748569,0.115731,2.404602,-1.355196,1.128581,-1.551523,-0.931308,-0.983595,-0.983935,1.196165
2280,26,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.391157,1.379277,1.444383,-1.358872,-1.224290,-0.357799,1.776104,1.771819,2.294032,-0.616783


In [68]:
eval_reduced

Unnamed: 0,gemmes_flow_binary,gemmes_wandering_binary,gemmes_interior_binary,gemmes_force_binary,gemmes_movement_binary,gems_sadness_binary,gems_tension_binary,gems_joyful_activation_binary,gems_power_binary,gems_peacefulness_binary,...,mirtoolbox_hcdf_pct_50,mirtoolbox_hcdf_pct_90,mirtoolbox_irregularity,mirtoolbox_keyclarity,mirtoolbox_mode,mirtoolbox_novelty_mean,mirtoolbox_novelty_std,mirtoolbox_novelty_pct_90,mirtoolbox_pulseclarity,segment_id
191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.582807,-1.147823,-0.181391,0.628631,-0.986863,-0.042366,-0.027407,-0.16563,-0.922844,26.0
192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.843535,1.029178,-0.644131,1.583067,-1.214874,1.201309,1.267401,1.913592,-0.369044,26.0
193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.63256,0.541508,-1.680825,-0.548804,0.162219,-0.877408,-1.157036,-0.837843,0.529876,26.0
194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.077859,1.453029,0.050726,-0.114011,-0.144817,-0.941908,-0.878074,-0.721168,0.602039,26.0
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.472328,1.681968,-0.806261,0.00165,-0.949676,-1.048281,-0.875171,-0.831929,-1.169637,26.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.682,0.031015,0.342731,0.298725,-1.256446,1.257464,1.065435,1.467228,-0.744736,26.0
197,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.44196,-2.252249,0.58429,0.485705,0.603727,-1.175996,-0.859012,-1.249386,0.493764,26.0
582,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.413976,-0.382894,-1.031833,-1.12504,0.402815,0.479141,0.313971,0.835315,-0.930093,26.0
583,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.093907,0.303411,-1.165298,-1.712432,-0.754378,0.138989,0.552934,-0.064016,-0.967107,26.0
584,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,2.710004,1.644302,-0.029374,-1.796285,-0.934942,-0.521057,-0.470548,-0.598243,0.209728,26.0


In [69]:
with open('pred_train_test_reduced.pkl', 'wb') as f:
    pickle.dump(train_test_reduced, f)

In [70]:
with open('evaluation_reduced.pkl', 'wb') as f:
    pickle.dump(eval_reduced, f)