In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.combine import SMOTEENN
import plotly.express as px

# Use k-nearest-neighbors (KNN) classifiers to predict high-level features

In [None]:
# load the data and reset index of dataframe
# NOTE: unpickling can execute arbitrary code - only load this file from a trusted source
df: pd.DataFrame = pd.read_pickle(
    "../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl").reset_index()

# get only the low and mid level features (segment_id is added to X in a later cell)
X = df.loc[:, "essentia_dissonance_mean":"mirtoolbox_roughness_pct_90"]
# target value
y = df["quadrant"]

# preprocess dataset: standardise every feature column
# the scaler is fitted ONLY on rows outside segment 26, which is held out as test data
# later on; fitting on all rows would leak test-set statistics into the training features
train_rows = df["segment_id"] != 26
feature_scaler = StandardScaler().fit(X[train_rows])
# the fitted scaler is then applied to all rows (train and held-out alike)
X_std = feature_scaler.transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

In [None]:
# attach the segment_id column to the scaled features so rows can be filtered by segment
X = X.assign(segment_id=df["segment_id"])

In [None]:
# remove segment_id 26 and keep as test/ eval data for later
seg_26_indices = (X["segment_id"] == 26)
# the held-out features do not need the segment_id column any more
X_test = X[seg_26_indices].drop(["segment_id"], axis=1)
y_test = y[seg_26_indices]

# every other segment is training data (segment_id is kept, the CV split needs it later)
# the index is reset on BOTH features and labels so that they stay aligned row by row
X_train = X[~seg_26_indices].reset_index(drop=True)
y_train = y[~seg_26_indices].reset_index(drop=True)

In [None]:
# Combination of over- and under-sampling
# https://imbalanced-learn.org/stable/combine.html
# random_state=0 fixes the seed so that the resampling is repeatable
# NOTE(review): X_train still contains the segment_id column here, so it is resampled
# together with the real features; synthetic rows presumably receive interpolated
# (non-integer) segment ids that the segment-based CV split below cannot match - TODO confirm
smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
# alternative without any resampling (kept for comparison):
# X_resampled, y_resampled = X_train, y_train

In [None]:
# split the data according to segment_id
# store the splits as tuple (train indices, test_indices)
# 2 segments for test, the rest for training (not including segment 26)
# note: fold i holds out segments i and i + 1, so consecutive folds share one segment
cv = []

for i in range(24):
    # boolean mask of the rows belonging to the two held-out segments (computed once per fold)
    in_test_fold = X_resampled["segment_id"].isin([i, i + 1]).to_numpy()
    # the splits are handed to GridSearchCV, which expects POSITIONAL row numbers;
    # np.flatnonzero yields positions regardless of what the dataframe index looks like
    train_indices = np.flatnonzero(~in_test_fold).tolist()
    test_indices = np.flatnonzero(in_test_fold).tolist()
    cv.append((train_indices, test_indices))

In [None]:
# remove the segment_id as we don't want it in the training data
# errors="ignore" keeps this cell re-runnable: once the column is gone a second run is a no-op
# instead of raising a KeyError
X_resampled = X_resampled.drop(columns=["segment_id"], errors="ignore")

In [None]:
# rank the features by their ANOVA F-value against the label (classification setting)
# and keep the 15 highest scoring ones
kbest_selector = SelectKBest(score_func=f_classif, k=15)
kbest_selector.fit(X_resampled, y_resampled)
best_features = kbest_selector.get_feature_names_out()

# training matrix restricted to the selected feature columns
X_select = X_resampled.loc[:, best_features]

In [None]:
# hyper-parameter grid explored by the grid search
params = dict(
    # 150 integer neighbor counts spread between 1 and 300
    n_neighbors=np.linspace(1, 300, 150, dtype=int).tolist(),
    weights=["uniform"],  # {‘uniform’, ‘distance’}
    algorithm=["auto"],  # {‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}
)

# KNN classifier tuned over the grid, evaluated on the segment-based splits stored in `cv`
gs_cv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params,
    cv=cv,
    return_train_score=True,  # train scores are needed for the train/test comparison plot
    n_jobs=-1,
)
gs_cv.fit(X_select, y_resampled)

# best cross-validated score and the parameter combination that achieved it
print(gs_cv.best_score_, gs_cv.best_params_)

Using a single neighbor (`n_neighbors = 1`) clearly overfits; a larger value should generalize better.

When the search was restricted to `n_neighbors > 50`, the best result was a score of 0.546534203234109 with the parameters `{'algorithm': 'auto', 'n_neighbors': 73, 'weights': 'uniform'}`.

In [None]:
# tabulate the results of the Grid Search CV as a dataframe
cv_results = pd.DataFrame(gs_cv.cv_results_)
# preview the first four rows
cv_results.head(n=4)

In [None]:
# plot the mean train and test score (accuracy) against the number of neighbors
# the plotted columns are scores, not errors, so the title says "Accuracy" to match the y-axis label
fig = px.line(
    cv_results,
    x="param_n_neighbors",
    y=["mean_test_score", "mean_train_score"],
    title="Train and Test Accuracy for an increasing number of neighbors",
    labels={
        "param_n_neighbors": "Number of Neighbors",
        "mean_test_score": "Mean Test Score",
        "mean_train_score": "Mean Train Score",
        "value": "Accuracy",
    },
    width=800,
    height=400,
)
# draw plain lines without point markers
fig.update_traces(mode="lines")
fig.show()

In [None]:
# evaluate the fitted grid search on the held out test set (segment 26),
# restricted to the same feature columns that were selected for training
gs_cv.score(X=X_test.loc[:, best_features], y=y_test)