In [37]:
import numpy as np
import pandas as pd
import skops.io as sio
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Seed NumPy's legacy global RNG so any np.random-based randomness in later cells is repeatable.
np.random.seed(42)

In [8]:
# Relative path to the pickled input object that is loaded below.
DATA_PATH = "../../data/small_genres_tracks_df.pickle"

# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df_small_genres = pd.read_pickle(DATA_PATH)

### Разделить датасет на train, test

In [19]:
# Keep only the genres that have exactly 100 tracks; with the stratified
# 80/20 split below, each kept genre contributes 80 train and 20 test rows.
genre_counts = df_small_genres["genre"].value_counts()
df_majority = df_small_genres[
    df_small_genres["genre"].isin(genre_counts[genre_counts == 100].index)
].reset_index(drop=True)
features = [
    "track_duration_ms",
    "track_explicit",
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
]
# `.assign` returns a new DataFrame, so the int cast is never written into a
# slice of `df_majority` (no SettingWithCopyWarning, and the resulting dtype
# does not depend on how `.loc` in-place assignment behaves in this pandas version).
X = df_majority[features].assign(
    track_explicit=lambda frame: frame["track_explicit"].astype(int)
)
y = df_majority["genre"]
# 80 rows of every genre go to train, 20 go to test
train_df, test_df, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Attach the labels (aligned on the index) without mutating the frames
# returned by train_test_split in place.
train_df = train_df.assign(genre=y_train)
test_df = test_df.assign(genre=y_test)

train_df.to_csv("data/train_test_subgenres_only/train.csv", index_label="index")
test_df.to_csv("data/train_test_subgenres_only/test.csv", index_label="index")

### Подбор гиперпараметров

### Обучаем классификатор

#### Препроцессинг

In [5]:
class DataTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode selected columns and pass all other columns through.

    Thin wrapper around a ``ColumnTransformer`` that applies
    ``OneHotEncoder(drop="first")`` to ``columns`` and leaves the remaining
    columns untouched (``remainder="passthrough"``).

    Parameters
    ----------
    columns
        Column selector handed to the ``ColumnTransformer`` for the one-hot
        encoder (in this notebook: a list of column names).

    Attributes
    ----------
    preprocessor : ColumnTransformer
        The wrapped transformer. It is created unfitted at construction time
        and re-created from the current ``columns`` on every ``fit`` call.
    """

    def __init__(self, columns):
        self.columns = columns
        # Kept so that `preprocessor` exists (unfitted) right after construction.
        self.preprocessor = self._build_preprocessor()

    def _build_preprocessor(self):
        """Return a new, unfitted ColumnTransformer for the current ``self.columns``."""
        return ColumnTransformer(
            transformers=[
                (
                    "categorical features encoder",
                    OneHotEncoder(drop="first"),
                    self.columns,
                ),
            ],
            remainder="passthrough",
        )

    def fit(self, X, y=None):
        """Fit the wrapped ColumnTransformer on ``X`` and return ``self``.

        ``y`` is accepted for API compatibility and is not used.
        """
        # Rebuild from the current `columns`: otherwise a later
        # `set_params(columns=...)` would be silently ignored, because the
        # transformer built in __init__ still holds the old selection.
        self.preprocessor = self._build_preprocessor()
        self.preprocessor.fit(X)
        return self

    def transform(self, X, y=None):
        """Return ``X`` transformed by the wrapped ColumnTransformer.

        ``y`` is accepted for API compatibility and is not used.
        """
        return self.preprocessor.transform(X)


# Columns that are one-hot encoded by default.
CATEGORICAL_FEATURES = ["track_explicit", "key", "mode"]


def get_preprocessor(cat_features=CATEGORICAL_FEATURES):
    """Create an unfitted ``DataTransformer`` for the given categorical columns.

    Parameters
    ----------
    cat_features
        Columns to one-hot encode; defaults to ``CATEGORICAL_FEATURES``.

    Returns
    -------
    DataTransformer
        A new, unfitted transformer configured with ``cat_features``.
    """
    # Use the argument: previously the module constant was always passed,
    # so a caller-supplied column list was silently ignored.
    return DataTransformer(columns=cat_features)

#### Обучение

In [28]:
# TODO: move these settings into a Hydra config
# Default keyword arguments that train_model forwards to DecisionTreeClassifier.
optimal_hypers = {"random_state": 42, "max_depth": 15, "min_samples_leaf": 40}


def train_model(
    train_df, hypers=optimal_hypers, model_path="data/dt_clf_subgenres_only.skops"
):
    """Fit a decision tree on ``train_df`` and save it with skops.

    Parameters
    ----------
    train_df
        DataFrame holding the feature columns plus a ``"genre"`` label column.
    hypers
        Keyword arguments for ``DecisionTreeClassifier``; defaults to
        ``optimal_hypers``. The mapping is only unpacked, never modified.
    model_path
        File the fitted classifier is written to. Defaults to the path that
        was previously hard-coded.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.

    Notes
    -----
    Only the classifier is saved and returned. The preprocessor fitted here
    is discarded, so callers must fit an equivalent one on the same training
    features before predicting.
    """
    features = train_df.drop("genre", axis=1)
    target = train_df["genre"].to_numpy()

    # Fit the preprocessor on the training features and encode them in one step.
    encoded_features = get_preprocessor().fit_transform(features)

    # training
    model = DecisionTreeClassifier(**hypers)
    model.fit(encoded_features, target)

    # saving the model
    with open(model_path, "wb") as model_f:
        sio.dump(model, model_f)

    return model


def infer(model, X_train, X_test):
    """Predict labels for ``X_test`` with ``model``.

    A fresh preprocessor is fitted on ``X_train`` and then used to encode
    ``X_test`` before it is handed to ``model.predict``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train
        Training features used only to fit the preprocessor.
    X_test
        Features to predict on.

    Returns
    -------
    The value returned by ``model.predict`` for the encoded ``X_test``.
    """
    preprocessor = get_preprocessor()
    preprocessor.fit(X_train)
    encoded_test = preprocessor.transform(X_test)
    return model.predict(encoded_test)

In [29]:
# Fit the classifier on the training split; train_model also writes the fitted model to disk.
model = train_model(train_df)

In [31]:
# infer fits its preprocessor on the training features, so both splits are
# passed in; the "genre" label column is dropped from each of them first.
predictions = infer(
    model, train_df.drop("genre", axis=1), test_df.drop("genre", axis=1)
)

In [38]:
# Accuracy of the predicted genres against the true test labels.
accuracy = accuracy_score(y_pred=predictions, y_true=test_df["genre"])
accuracy

0.020075187969924812