In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

In [2]:
# Location of the input CSV on disk.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook only
# runs elsewhere after editing it — consider a path relative to the repository.
directorio = r"C:\Users\Juan\Documents\Digital House\Git Digital House\DHDS_IV\midi_stats_v3_drop_cols_.csv"

In [3]:
# Load the dataset from the CSV at `directorio` into a DataFrame.
data = pd.read_csv(directorio)

In [4]:
# Replace every missing value with 0 (no rows or columns are dropped).
# NOTE(review): fillna applies to all columns, so a text column with gaps would receive
# the integer 0 alongside its strings — confirm that is intended.
data = data.fillna(value = 0)

In [5]:
# Remove the record(s) whose cant_pedales_seg value blows up (greater than 9999999).
outlier_rows = data.loc[data["cant_pedales_seg"] > 9999999].index
data = data.drop(index = outlier_rows)

In [6]:
# Round every float64 column to 4 decimal places.
# DataFrame.round performs this in one vectorized call instead of running a Python
# lambda once per cell; columns of any other dtype are not listed and stay untouched.
# NOTE(review): this uses NumPy rounding rather than the built-in round(); values within
# floating-point error of a tie may differ in the 4th decimal — presumably irrelevant here.
float_columns = data.columns[(data.dtypes == np.float64).to_numpy()]
data = data.round(dict.fromkeys(float_columns, 4))

In [7]:
# Display setting: render at most 20 columns when showing a DataFrame.
pd.set_option("display.max_columns", 20)
# data.head(3)

In [8]:
# Feature matrix and target vector.
# The target column and the other listed columns are excluded from the features.
excluded_columns = ["Genero", "tema", "Grupo", "info_tracks", "Unnamed: 0"]
x = data.drop(columns = excluded_columns)
y = data.loc[:, "Genero"]

In [9]:
#Se importan las clases.
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

In [10]:
# Transformer used to pick the columns each pipeline branch should process.
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Keep only the configured columns of a pandas DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Used as the key in ``X[columns]`` when transforming.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *_):
        """Learn nothing; return the instance itself. Extra positional arguments are ignored."""
        return self

    def transform(self, X, *_):
        """Return ``X[self.columns]`` wrapped in a new DataFrame.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Este Transformador solo funciona en DF de Pandas")
        selected = X[self.columns]
        return pd.DataFrame(selected)

In [11]:
# Partition the feature columns: object-dtype columns are categorical, the rest numeric.
categorical_columns, non_categorical_columns = [], []
for column_name in x.columns:
    if x[column_name].dtypes == 'object':
        categorical_columns.append(column_name)
    else:
        non_categorical_columns.append(column_name)

In [12]:
# Category lists for the one-hot encoder: one array of distinct values per categorical column,
# in the same order as `categorical_columns`.
categorical_columns = [name for name in x.columns if x[name].dtypes == 'object']
encoder_categories = [x[name].unique() for name in categorical_columns]

In [13]:
# Pipeline branch for the categorical columns: select them, then one-hot encode.
cs_categorical = ColumnSelector(categorical_columns)
# Smoke check that the selector works on the full frame; the preview itself is discarded
# because it is not the last expression of the cell.
cs_categorical.transform(data).head(3)
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output` and 1.4
# removed the old name. Try the current keyword first and fall back for older releases,
# so the notebook runs on either side of the rename. Both request a dense array.
try:
    one_hot_encoder = OneHotEncoder(categories = encoder_categories, sparse_output = False)
except TypeError:
    one_hot_encoder = OneHotEncoder(categories = encoder_categories, sparse = False)
categorical_pipe = make_pipeline(ColumnSelector(categorical_columns), one_hot_encoder)

In [14]:
# Pipeline branch for the numeric columns: select them, then standardise.
cs_non_categorical = ColumnSelector(non_categorical_columns)
cs_non_categorical.transform(data).head(3)  # preview only; the value is discarded
numeric_steps = [ColumnSelector(non_categorical_columns), StandardScaler()]
non_categorical_pipe = make_pipeline(*numeric_steps)

In [15]:
# Join both branches: make_union builds a FeatureUnion that concatenates the outputs of the
# categorical and the numeric pipelines.
union = make_union(categorical_pipe, non_categorical_pipe)

In [16]:
# Split features and target into train and test sets.
# stratify = y keeps the class proportions of y in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify = y, random_state = 1)

In [17]:
# Full model: the preprocessing union followed by a 4-nearest-neighbours classifier.
knn_classifier = KNeighborsClassifier(n_neighbors = 4)
pipeline = Pipeline(steps = [('union', union), ('knn', knn_classifier)])

In [18]:
# Fit the whole pipeline (preprocessing + KNN) on the training split.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['first_time_signature',
                                                                                          'tonalidad',
                                                                                          'tonalidad_escala'])),
                                                                 ('onehotencoder',
                                                                  OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2/4', '12/8', '2/2', '6/4',
       '2/8', '1/4', '1/8', '3/8', '8/4', '9/16', '4/2', '1/2', '12/16',
       '1/...
                                                                                          'cant_eventos_por_pedal',
                                                

In [19]:
# Cross-validation settings: 3 shuffled stratified folds with a fixed seed.
folds = StratifiedKFold(
    n_splits = 3,
    shuffle = True,
    random_state = 42,
)

# Search space: number of neighbours for the 'knn' pipeline step, 1 through 4.
neighbor_counts = range(1, 5)
param_grid = [{'knn__n_neighbors': neighbor_counts}]

In [20]:
# Build the grid search over `param_grid`, cross-validating with `folds`.
# n_jobs = -1 asks scikit-learn to use all available processors.
grid_search = GridSearchCV(pipeline, param_grid, cv = folds, n_jobs = -1)

In [21]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('union',
                                        FeatureUnion(transformer_list=[('pipeline-1',
                                                                        Pipeline(steps=[('columnselector',
                                                                                         ColumnSelector(columns=['first_time_signature',
                                                                                                                 'tonalidad',
                                                                                                                 'tonalidad_escala'])),
                                                                                        ('onehotencoder',
                                                                                         OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2...
              

In [22]:
# Mean cross-validated score of the best parameter setting found by the grid search.
grid_search.best_score_

0.8148454746136865

In [23]:
# Best estimator among the parameter values that were searched.
grid_search.best_estimator_

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['first_time_signature',
                                                                                          'tonalidad',
                                                                                          'tonalidad_escala'])),
                                                                 ('onehotencoder',
                                                                  OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2/4', '12/8', '2/2', '6/4',
       '2/8', '1/4', '1/8', '3/8', '8/4', '9/16', '4/2', '1/2', '12/16',
       '1/...
                                                                                          'cant_eventos_por_pedal',
                                                

In [24]:
# Predict the test split with the best estimator found by the grid search.
y_pred = grid_search.best_estimator_.predict(X_test)

In [25]:
# Model results on the test split: per-class report followed by overall accuracy.
# zero_division = 0 states explicitly that precision/F1 are reported as 0 for classes the
# model never predicts; the default ("warn") reports the same 0 but emits an
# UndefinedMetricWarning on every run.
print(classification_report(y_test, y_pred, zero_division = 0))
print(accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

 ClassicRock       0.82      0.86      0.84       507
   Classical       0.81      0.97      0.88       361
Dance_Techno       0.00      0.00      0.00         5
       Forro       1.00      0.50      0.67         2
       Hymns       0.92      0.99      0.95        80
        Jazz       0.84      0.97      0.90        78
     NEW-AGE       0.70      0.50      0.58        42
        Punk       1.00      0.15      0.26        20
      REGGAE       0.00      0.00      0.00        11
 Samba&Bossa       0.85      0.65      0.73        17
       TANGO       1.00      0.70      0.82        20
      latina       0.86      0.09      0.17        65

    accuracy                           0.82      1208
   macro avg       0.73      0.53      0.57      1208
weighted avg       0.82      0.82      0.80      1208

0.8245033112582781


  _warn_prf(average, modifier, msg_start, len(result))
