In [1]:
import joblib
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA, KernelPCA, FastICA
from sklearn.model_selection import GridSearchCV
import plotly.express as px
from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from joblib import dump, load
import os
import plotly.express as px

In [2]:
# Read the pickled training frame (labels and features live in the same table)
df = pd.read_pickle("../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl")

# Feature matrix: every column in the label-based range between the first
# essentia feature and the last mirtoolbox feature (both ends inclusive)
X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]

# Z-score every feature, then wrap the array in a DataFrame again so the
# original column names are kept
X_std = StandardScaler().fit(X).transform(X)
X = pd.DataFrame(data=X_std, columns=X.columns)

In [3]:
# Spot-check a single label: the 8th entry from the end of the quadrant column
print(df['quadrant'].iloc[-8])

4.0


In [4]:
# Candidate values 0.1, 0.2, ..., 5.0: fifty evenly spaced points on [1, 50]
# (num=50 is np.linspace's default, spelled out here), scaled down by ten
c_vals = np.linspace(1, 50, num=50) / 10
c_vals

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
       1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. ])

In [20]:
# Candidate polynomial degrees 1..10 as a float array (np.linspace yields floats)
degrees = np.linspace(start=1, stop=10, num=10)
degrees

array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.])

In [21]:
# Grid-search the polynomial degree of an SVC on the SMOTEENN-resampled
# training data, using hand-built folds keyed on segment_id.
parameters = {
        'C': [1],
        'kernel': ['poly'], # ['linear', 'poly', 'rbf', 'sigmoid'],
        # `degrees` is a float array (np.linspace), but SVC's `degree` is an
        # integer hyper-parameter, so convert each candidate to a Python int.
        'degree': [int(round(d)) for d in degrees], # [1, 2, 3, 4, 5, 6, 8, 10],
        'gamma': ['auto']} # ['scale', 'auto']}

# get only the low and mid level features + segment_id
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)

# segment_id is an identifier, not a feature
X = X.drop(["segment_id"], axis=1)

# target value: the labels of the first X.shape[0] rows of df (positional slice)
y = df['quadrant'].iloc[:X.shape[0]]

# standardize the features; from here on X is a plain ndarray
X = StandardScaler().fit_transform(X)

# rebalance the classes (SMOTE over-sampling followed by ENN cleaning)
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

# reset_index() adds an 'index' column holding the row positions
X_resampled = pd.DataFrame(X_resampled).reset_index()

# NOTE(review): segment ids are attached by row position, but fit_resample has
# already added and removed rows, so row k of X_resampled is not guaranteed to
# be row k of df. The folds below are therefore probably not true per-segment
# groups and the CV score may be optimistic. A sound fix is to build the folds
# on the original rows and resample inside each training fold (for example with
# an imblearn Pipeline); not done here because it renames the `param_*` columns
# of cv_results_ that other cells read. TODO confirm and rework.
X_resampled.insert(0, 'segment_id', df["segment_id"])

# Build explicit (train_indices, test_indices) splits: fold i holds out every
# row whose segment_id is i or i + 1 and trains on all remaining rows.
# Consecutive folds therefore share one held-out segment id.
cv = []
for i in range(25):   # previously good result with 26
    in_test = X_resampled["segment_id"].isin([i, i + 1]).to_numpy()
    train_indices = X_resampled.index[~in_test].to_list()
    test_indices = X_resampled.index[in_test].to_list()
    cv.append((train_indices, test_indices))

# remove the helper columns again so only features reach the classifier
X_resampled = X_resampled.drop(["segment_id", "index"], axis=1)

# grid search the parameters for a given classifier
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10, return_train_score=True)
gs_cv.fit(X_resampled, y_resampled)

# mean cross-validated score of the best parameter combination
score = gs_cv.best_score_

In [22]:
# Display the best mean cross-validated score stored from gs_cv.best_score_
score

0.618921340825626

In [23]:
# Display the parameter combination that gave the best grid-search score
gs_cv.best_params_

{'C': 1, 'degree': 3.0, 'gamma': 'auto', 'kernel': 'poly'}

In [7]:
# Read the pickled evaluation features and discard the segment identifier so
# only feature columns remain.
# NOTE(review): the name `eval` shadows the Python builtin of the same name; it
# is kept because other cells refer to it.
with open('evaluation_reduced.pkl', 'rb') as eval_file:
    eval = pickle.load(eval_file).drop(columns=['segment_id'])

In [8]:
# Reload the reduced training frame (low/mid level features plus segment_id);
# the next cells use its row count to split df into training and evaluation rows
with open('pred_train_test_reduced.pkl', 'rb') as train_file:
    X = pickle.load(train_file)

In [9]:
# Evaluation labels: every row of df positioned after the X.shape[0] training rows
y_eval = df['quadrant'].iloc[X.shape[0]:]

In [10]:
# Display the number of evaluation labels together with the labels themselves
y_eval.shape, y_eval

((48,),
 4132    1.0
 4133    1.0
 4134    1.0
 4135    1.0
 4136    1.0
 4137    1.0
 4138    4.0
 4139    4.0
 4140    4.0
 4141    4.0
 4142    4.0
 4143    4.0
 4144    4.0
 4145    4.0
 4146    4.0
 4147    4.0
 4148    4.0
 4149    4.0
 4150    4.0
 4151    4.0
 4152    4.0
 4153    4.0
 4154    4.0
 4155    4.0
 4156    4.0
 4157    4.0
 4158    4.0
 4159    4.0
 4160    4.0
 4161    4.0
 4162    4.0
 4163    4.0
 4164    4.0
 4165    4.0
 4166    4.0
 4167    4.0
 4168    4.0
 4169    4.0
 4170    4.0
 4171    4.0
 4172    4.0
 4173    1.0
 4174    1.0
 4175    1.0
 4176    1.0
 4177    1.0
 4178    1.0
 4179    1.0
 Name: quadrant, dtype: float64)

In [11]:
# Display (rows, columns) of the evaluation feature frame
eval.shape

(48, 158)

In [12]:
# The grid search was fitted on standardized features, so the evaluation
# features must be standardized the same way before predicting. Passing the raw
# values (as before) feeds the SVC inputs on a different scale from its
# training data.
# The training features are reloaded here rather than read from `X`, because
# other cells rebind `X` to different objects depending on execution order.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('pred_train_test_reduced.pkl', 'rb') as f:
    train_features = pickle.load(f).drop(["segment_id"], axis=1)

# Refit the scaler on the same training features the grid-search cell scaled,
# then apply those statistics to the evaluation rows.
# NOTE(review): assumes `eval` has the same feature columns, in the same order,
# as the training pickle - confirm.
eval_scaled = StandardScaler().fit(train_features).transform(eval)

y_calc = gs_cv.predict(eval_scaled)

accuracy_score(y_eval, y_calc)



0.0625

In [29]:
# Tabulate the grid-search statistics: one row per parameter candidate, with
# per-split and mean/std train and test scores as columns
cv_results = pd.DataFrame(gs_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split17_train_score,split18_train_score,split19_train_score,split20_train_score,split21_train_score,split22_train_score,split23_train_score,split24_train_score,mean_train_score,std_train_score
0,0.053134,0.008831,0.002208,0.000353,linear,{'kernel': 'linear'},0.833333,0.789474,0.758621,0.857143,...,0.977486,0.972222,0.972067,0.969466,0.972921,0.969811,0.968577,0.972015,0.974472,0.003664
1,0.030777,0.00347,0.002264,0.000265,poly,{'kernel': 'poly'},0.880952,0.894737,0.862069,0.928571,...,0.908068,0.9,0.905028,0.916031,0.905222,0.9,0.89464,0.893657,0.895701,0.009155
2,0.037062,0.00386,0.003643,0.001646,rbf,{'kernel': 'rbf'},0.857143,0.815789,0.827586,0.928571,...,0.923077,0.933333,0.938547,0.935115,0.926499,0.941509,0.937153,0.934701,0.933631,0.005957
3,0.030606,0.004039,0.002545,0.000625,sigmoid,{'kernel': 'sigmoid'},0.690476,0.657895,0.724138,0.714286,...,0.619137,0.616667,0.610801,0.629771,0.628627,0.607547,0.606285,0.617537,0.6062,0.011492


In [32]:
# Line chart of the mean cross-validated test and train scores for each SVC
# kernel in the grid-search results
fig = px.line(
    data_frame=cv_results,
    x="param_kernel",
    y=["mean_test_score", "mean_train_score"],
    labels=dict(
        param_kernel="Kernel type",
        mean_test_score="Mean Test Score",
        mean_train_score="Mean Train Score",
        value="Accuracy",
        variable="data",
    ),
    title='Scores for different Kernel types',
    width=800,
    height=400,
)
# draw plain lines without point markers
fig.update_traces(mode="lines")
fig.show()

In [16]:
# Line chart of the mean cross-validated test score against the SVC
# regularization parameter C from the grid-search results.
# The previous title was copied from a k-nearest-neighbours plot and described
# "number of neighbors", which this figure does not show.
fig = px.line(
    cv_results,
    x="param_C",
    y="mean_test_score",
    title="Mean test score for different values of the SVC regularization parameter C",
    labels={
        "param_C": "C",
        "mean_test_score": "Mean Test Score",
        "mean_train_score": "Mean Train Score",
        "value": "Accuracy",
        "variable": "data"
    },
    width=800,
    height=400,
)
# draw plain lines without point markers
fig.update_traces(mode="lines")
fig.show()

In [28]:
# Compare the four SVC kernel families: the grid varies `kernel` only, all
# other SVC settings stay at their defaults
parameters = dict(kernel=['linear', 'poly', 'rbf', 'sigmoid'])

# Reduced training frame: low and mid level features plus segment_id
with open('pred_train_test_reduced.pkl', 'rb') as f:
    X = pickle.load(f)

# segment_id is an identifier, not a feature
X = X.drop(columns=["segment_id"])

# Labels for the first X.shape[0] rows of df (positional slice)
y = df['quadrant'].iloc[:X.shape[0]]

# Standardize the features; X becomes a plain ndarray here
X = StandardScaler().fit_transform(X)

# Rebalance the classes, then put the result in a DataFrame whose 'index'
# column (added by reset_index) holds the row positions
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
X_resampled = pd.DataFrame(X_resampled).reset_index()

# NOTE(review): segment ids are matched to the resampled rows by position only;
# resampling adds and removes rows, so this pairing looks unreliable - confirm.
X_resampled.insert(0, 'segment_id', df["segment_id"])

# Explicit (train_indices, test_indices) splits: each fold holds out the rows
# whose segment_id is `first_segment` or `first_segment + 1` and trains on all
# the other rows
cv = []
for first_segment in range(25):   # previously good result with 26
    held_out = X_resampled["segment_id"].isin([first_segment, first_segment + 1]).to_numpy()
    cv.append((
        X_resampled.index[~held_out].to_list(),
        X_resampled.index[held_out].to_list(),
    ))

# Strip both helper columns so only features are passed to the classifier
X_resampled = X_resampled.drop(columns=["segment_id", "index"])

# Exhaustive search over the kernel candidates using the folds built above
gs_cv = GridSearchCV(SVC(), parameters, cv=cv, n_jobs=10, return_train_score=True)
gs_cv.fit(X_resampled, y_resampled)

# Mean cross-validated score of the best kernel
score = gs_cv.best_score_