In [75]:
import numpy as np
import pandas as pd
import skops.io as sio
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Seed NumPy's legacy global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

In [44]:
# Location of the sub-genre dataset CSV, relative to the working directory.
DATA_PATH = "data/subgenres_df.csv"

# The column literally named "Unnamed: 0" becomes the index — presumably a row
# index written out by an earlier DataFrame.to_csv call; TODO confirm.
df_small_genres = pd.read_csv(DATA_PATH, index_col="Unnamed: 0")

### Разделить датасет на train, test

In [69]:
# Load the predefined train / test id tables. Their "track_id" column is what
# the split below uses to pick rows out of the dataset.
train_ids = pd.read_csv(
    "data/train_test_ids/subgenres/train_ids.csv", index_col="Unnamed: 0"
)
test_ids = pd.read_csv(
    "data/train_test_ids/subgenres/test_ids.csv", index_col="Unnamed: 0"
)

In [71]:
# Row membership: a row belongs to a split when its track_id is listed in the
# corresponding id table.
in_train = df_small_genres["track_id"].isin(train_ids["track_id"])
in_test = df_small_genres["track_id"].isin(test_ids["track_id"])
train_rows = df_small_genres[in_train]
test_rows = df_small_genres[in_test]

# Targets: the genre label of every selected row, as NumPy arrays.
y_train = train_rows["genre"].to_numpy()
y_test = test_rows["genre"].to_numpy()

# Features: everything except the label and the track identifier.
non_feature_columns = ["genre", "track_id"]
X_train_df = train_rows.drop(columns=non_feature_columns)
X_test_df = test_rows.drop(columns=non_feature_columns)

### Подбор гиперпараметров

### Обучаем классификатор

#### Препроцессинг

In [66]:
class DataTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode the selected columns and pass all other columns through.

    Thin wrapper around a ``ColumnTransformer`` that applies
    ``OneHotEncoder(drop="first")`` to ``columns`` and keeps the remaining
    columns unchanged (``remainder="passthrough"``).

    Parameters
    ----------
    columns : list
        Column selection handed to the ``ColumnTransformer`` for the one-hot
        encoder.

    Attributes
    ----------
    preprocessor : ColumnTransformer
        The wrapped transformer. It is created in ``__init__`` and rebuilt
        from the current value of ``columns`` on every ``fit`` call.
    """

    def __init__(self, columns):
        self.columns = columns
        # Available right after construction so existing code that reads
        # ``.preprocessor`` before fitting keeps working.
        self.preprocessor = self._build_preprocessor()

    def _build_preprocessor(self):
        """Return a new, unfitted ColumnTransformer for the current ``columns``."""
        return ColumnTransformer(
            transformers=[
                (
                    "categorical features encoder",
                    OneHotEncoder(drop="first"),
                    self.columns,
                ),
            ],
            remainder="passthrough",
        )

    def fit(self, X, y=None):
        """Fit the wrapped ColumnTransformer on ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and is not used.
        """
        # Rebuild first: ``set_params(columns=...)`` only updates ``self.columns``,
        # so without this the transformer created in ``__init__`` would still be
        # fitted on the old column selection.
        self.preprocessor = self._build_preprocessor()
        self.preprocessor.fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` transformed by the fitted ColumnTransformer.

        ``y`` is accepted for API compatibility and is not used.
        """
        return self.preprocessor.transform(X)


# Columns that are one-hot encoded by default.
CATEGORICAL_FEATURES = ["track_explicit", "key", "mode", "time_signature"]


def get_preprocessor(cat_features=CATEGORICAL_FEATURES):
    """Build an unfitted ``DataTransformer`` for the given categorical columns.

    Parameters
    ----------
    cat_features : list, optional
        Columns to one-hot encode. Defaults to ``CATEGORICAL_FEATURES``.

    Returns
    -------
    DataTransformer
        A new, unfitted preprocessor.
    """
    # Use the argument itself; previously the module-level constant was
    # hard-coded here, so a caller-supplied column list was silently ignored.
    preproc = DataTransformer(columns=cat_features)
    return preproc

#### Подбор гиперпараметров

In [72]:
# Learn the encoding from the training frame only, then encode that same frame.
# ``fit`` returns the transformer itself, so the two steps can be chained.
preproc = get_preprocessor()
X_train = preproc.fit(X_train_df).transform(X_train_df)

In [76]:
# Two stratified folds, shuffled with a fixed seed.
kfolds = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
# Iterable of (train, test) index splits over the encoded training data;
# it is passed to GridSearchCV below.
cv = kfolds.split(X_train, y_train)

# Grid of DecisionTreeClassifier hyperparameters to search over.
params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [
        10,
        15,
        20,
        40,
    ],  # there are roughly 1300 classes here, i.e. log_2(1300) ~ 10 is the depth of a binary tree with ~1400 leaves
    "min_samples_leaf": [80, 60, 40, 20],
}

# Search the grid above with a seeded decision tree, using the splits in `cv`;
# verbose=2 prints one line per fit.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=42), param_grid=params, cv=cv, verbose=2
)
clf.fit(X_train, y_train)

Fitting 2 folds for each of 32 candidates, totalling 64 fits
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=80; total time=   3.7s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=80; total time=   3.7s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=60; total time=   3.9s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=60; total time=   3.8s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=40; total time=   4.0s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=40; total time=   3.8s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=20; total time=   3.9s
[CV] END ..criterion=gini, max_depth=10, min_samples_leaf=20; total time=   3.8s
[CV] END ..criterion=gini, max_depth=15, min_samples_leaf=80; total time=   4.7s
[CV] END ..criterion=gini, max_depth=15, min_samples_leaf=80; total time=   4.9s
[CV] END ..criterion=gini, max_depth=15, min_samples_leaf=60; total time=   5.2s
[CV] END ..criterion=gini, max_depth=15, min_sam

In [79]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 40}

In [99]:
# TODO: plot these importances
clf.best_estimator_.feature_importances_

array([0.01540227, 0.        , 0.        , 0.        , 0.        ,
       0.00071213, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00640259, 0.        , 0.00322458,
       0.00106585, 0.0024468 , 0.06833077, 0.09592321, 0.08902138,
       0.09163623, 0.06499305, 0.08232324, 0.08790109, 0.01995919,
       0.09963679, 0.08038668, 0.02414214, 0.073443  , 0.0306342 ,
       0.02378639, 0.02038576, 0.01824264])

In [80]:
# Save the best estimator found by the grid search to disk with skops.
with open("data/dt_clf_subgenres_only.skops", "wb") as model_f:
    sio.dump(clf.best_estimator_, model_f)

#### Проверка качества

In [81]:
# Encode the test frame with the preprocessor that was fitted on the training frame.
X_test = preproc.transform(X_test_df)

In [84]:
# TODO: rewrite this using a Hydra config
# optimal_hypers = clf.best_params_
# Read the model back from the file it was saved to; the predictions below
# come from this loaded object rather than from `clf`.
with open("data/dt_clf_subgenres_only.skops", "rb") as model_f:
    model = sio.load(model_f)

# Predict on both splits so train and test quality can be compared.
predictions_train = model.predict(X_train)
predictions_test = model.predict(X_test)

In [86]:
# Accuracy of the loaded model's predictions on each split.
accuracy_train = accuracy_score(y_true=y_train, y_pred=predictions_train)
accuracy_test = accuracy_score(y_true=y_test, y_pred=predictions_test)

In [88]:
# Report train and test accuracy (the printed labels are in Russian).
print(f"Accuracy на трейне: {accuracy_train}")
print(f"Accuracy на тесте: {accuracy_test}")

Accuracy на трейне: 0.05614958895276987
Accuracy на тесте: 0.03456078996091339
