In [45]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

In [46]:
# Location of the input CSV.
# The original absolute path is kept as the default so existing runs behave the
# same; set the MIDI_STATS_CSV environment variable to load the file from
# another location without editing the notebook.
import os

directorio = os.environ.get(
    "MIDI_STATS_CSV",
    r"C:\Users\Juan\Documents\Digital House\Git Digital House\DHDS_IV\midi_stats_v3_drop_cols_.csv",
)

In [47]:
# Load the dataset from the configured CSV path.
data = pd.read_csv(filepath_or_buffer=directorio)

In [48]:
# Missing values are replaced with 0 (no rows or columns are dropped here).
data = data.fillna(0)

In [49]:
# Remove the record whose cant_pedales_seg blows up towards infinity
# (any row above 9999999 is dropped, in place).
exceeds_limit = data.cant_pedales_seg > 9999999
data.drop(index=data.index[exceeds_limit], inplace=True)

In [50]:
# Round every float64 column to 4 decimal places.
# DataFrame.round is vectorised, so it replaces the per-element Python lambda
# that was applied column by column; non-float columns are left untouched
# because only the float64 columns are listed in the decimals mapping.
# NOTE(review): NumPy-based rounding may differ from Python's round() in the
# last kept digit for values sitting on a rounding tie.
float_columns = data.select_dtypes(include=np.float64).columns
data = data.round(dict.fromkeys(float_columns, 4))

In [51]:
# Display setting: render at most 20 columns when showing a DataFrame.
pd.set_option("display.max_columns", 20)

In [52]:
# Feature matrix (x) and target vector (y).
# The columns listed below are left out of the features; "Genero" is the target.
non_feature_columns = ["Genero", "tema", "Grupo", "info_tracks", "Unnamed: 0"]
x = data.drop(columns=non_feature_columns)
y = data.loc[:, "Genero"]

In [53]:
#Se importan las clases.
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

In [54]:
# Transformer used inside the pipelines to pick the columns each branch works on.
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen set of DataFrame columns.

    Parameters
    ----------
    columns : label or list of labels
        Used as-is to index the incoming DataFrame in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, *_):
        """Learn nothing and return the selector itself; extra arguments are ignored."""
        return self

    def transform(self, X, *_):
        """Return the configured columns of ``X`` wrapped in a DataFrame.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Este Transformador solo funciona en DF de Pandas")
        return pd.DataFrame(X[self.columns])

In [55]:
# Split the feature names into categorical (object dtype) and non-categorical ones.
categorical_columns = [name for name, dtype in x.dtypes.items() if dtype == 'object']
_categorical_lookup = set(categorical_columns)
non_categorical_columns = [name for name in x.columns if name not in _categorical_lookup]

In [56]:
# Categories handed to the one-hot encoder: the distinct values of each
# categorical column, read from the full feature matrix x.
categorical_columns = [name for name, dtype in x.dtypes.items() if dtype == 'object']
encoder_categories = [x[name].unique() for name in categorical_columns]

In [57]:
# Pipeline branch for the categorical features: select them, then one-hot encode.
cs_categorical = ColumnSelector(categorical_columns)
# Quick check that the selector works on the raw frame; the preview is not the
# last expression of the cell, so its result is discarded.
cs_categorical.transform(data).head(3)
one_hot_encoder = OneHotEncoder(categories=encoder_categories, sparse=False)
categorical_pipe = make_pipeline(
    ColumnSelector(categorical_columns),
    one_hot_encoder,
)

In [58]:
# Pipeline branch for the numerical features: select them, then standardise.
cs_non_categorical = ColumnSelector(non_categorical_columns)
# Quick check that the selector works on the raw frame; the preview is not the
# last expression of the cell, so its result is discarded.
cs_non_categorical.transform(data).head(3)
standard_scaler = StandardScaler()
non_categorical_pipe = make_pipeline(
    ColumnSelector(non_categorical_columns),
    standard_scaler,
)

In [59]:
# Join the categorical and numerical branches into a single feature union.
union = make_union(
    categorical_pipe,
    non_categorical_pipe,
)

In [60]:
# Train/test split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=1,
)

In [61]:
# Full pipeline: the feature union feeds a decision tree classifier.
tree_clf = DecisionTreeClassifier(random_state=42, criterion="gini")
pipeline = Pipeline(steps=[('union', union), ('clf', tree_clf)])

In [62]:
# Fit the pipeline on the training split (the fitted pipeline is the cell output).
pipeline.fit(X_train, y_train)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['first_time_signature',
                                                                                          'tonalidad',
                                                                                          'tonalidad_escala'])),
                                                                 ('onehotencoder',
                                                                  OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2/4', '12/8', '2/2', '6/4',
       '2/8', '1/4', '1/8', '3/8', '8/4', '9/16', '4/2', '1/2', '12/16',
       '1/...
                                                                                          'cant_eventos_por_pedal',
                                                

In [63]:
# Cross-validation scheme: 3 stratified, shuffled folds with a fixed seed.
folds = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)

# Hyper-parameter grid for the 'clf' step of the pipeline.
# None has been removed from min_samples_leaf and min_samples_split:
# DecisionTreeClassifier only accepts an int or a float for these two
# parameters, so every candidate containing None could only fail to fit and be
# scored as NaN, wasting 224 of the 560 candidates without ever being selected.
# None remains valid for max_depth (unlimited depth).
# NOTE(review): max_depth skips 7 -- confirm whether that gap is intentional.
param_grid = { "clf__criterion" : ["gini", "entropy"],
                "clf__min_samples_leaf": [5, 10, 15, 20],
                "clf__max_depth" : [1,2,3,4,5,6,8,9,10,11,12,13,14,None],
                "clf__min_samples_split": [2, 3, 4]}

In [64]:
# Grid search over the pipeline, using the stratified folds and all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=folds,
    n_jobs=-1,
)

In [79]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('union',
                                        FeatureUnion(transformer_list=[('pipeline-1',
                                                                        Pipeline(steps=[('columnselector',
                                                                                         ColumnSelector(columns=['first_time_signature',
                                                                                                                 'tonalidad',
                                                                                                                 'tonalidad_escala'])),
                                                                                        ('onehotencoder',
                                                                                         OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2...
              

In [80]:
# Best mean cross-validated score found by the grid search.
grid_search.best_score_

0.859271523178808

In [81]:
# Parameter combination that achieved the best score.
grid_search.best_params_

{'clf__criterion': 'entropy',
 'clf__max_depth': 8,
 'clf__min_samples_leaf': 15,
 'clf__min_samples_split': 2}

In [82]:
# Best estimator among the parameter combinations that were searched.
grid_search.best_estimator_

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['first_time_signature',
                                                                                          'tonalidad',
                                                                                          'tonalidad_escala'])),
                                                                 ('onehotencoder',
                                                                  OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2/4', '12/8', '2/2', '6/4',
       '2/8', '1/4', '1/8', '3/8', '8/4', '9/16', '4/2', '1/2', '12/16',
       '1/...
                                                                                          'avg_simult_Piano',
                                                      

In [83]:
# Predictions on the training split; GridSearchCV.predict delegates to the
# refitted best estimator.
y_pred_train = grid_search.predict(X_train)

In [84]:
# Predictions on the test split; GridSearchCV.predict delegates to the
# refitted best estimator.
y_pred_test = grid_search.predict(X_test)

In [85]:
# Model results: per-class report followed by overall accuracy,
# first for the training split and then for the test split.
for y_true, y_hat in ((y_train, y_pred_train), (y_test, y_pred_test)):
    print(classification_report(y_true, y_hat))
    print(accuracy_score(y_true, y_hat))

              precision    recall  f1-score   support

 ClassicRock       0.88      0.94      0.91      1520
   Classical       0.95      0.98      0.96      1083
Dance_Techno       0.00      0.00      0.00        16
       Forro       0.00      0.00      0.00         5
       Hymns       1.00      0.98      0.99       242
        Jazz       0.94      0.93      0.94       235
     NEW-AGE       0.63      0.60      0.61       127
        Punk       0.78      0.50      0.61        58
      REGGAE       0.50      0.25      0.33        32
 Samba&Bossa       0.93      0.48      0.63        52
       TANGO       0.91      0.73      0.81        59
      latina       0.66      0.56      0.61       195

    accuracy                           0.89      3624
   macro avg       0.68      0.58      0.62      3624
weighted avg       0.88      0.89      0.88      3624

0.891832229580574
              precision    recall  f1-score   support

 ClassicRock       0.84      0.89      0.87       507
   Cla

  _warn_prf(average, modifier, msg_start, len(result))


In [86]:
# Confusion matrix of the grid-search model on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

array([[450,   9,   0,   0,   1,   3,  13,  10,   1,   1,   0,  19],
       [  3, 349,   0,   0,   0,   0,   7,   0,   0,   0,   1,   1],
       [  4,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1],
       [  1,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0],
       [  2,   2,   0,   0,  75,   0,   1,   0,   0,   0,   0,   0],
       [  7,   0,   0,   0,   0,  71,   0,   0,   0,   0,   0,   0],
       [ 10,   4,   0,   0,   0,   0,  25,   0,   1,   0,   0,   2],
       [ 13,   0,   0,   0,   0,   0,   0,   7,   0,   0,   0,   0],
       [  4,   0,   0,   0,   0,   0,   3,   0,   3,   0,   0,   1],
       [  6,   1,   0,   0,   0,   0,   1,   0,   0,   6,   1,   2],
       [  2,   5,   0,   0,   0,   0,   0,   0,   0,   0,  12,   1],
       [ 31,   1,   0,   0,   0,   2,   2,   0,   3,   0,   0,  26]],
      dtype=int64)

In [87]:
# Feature engineering - categorical columns.
# Object-dtype columns of x and, for each of them, its distinct values
# (used as the explicit category lists of the one-hot encoder).
categorical_columns = [name for name, dtype in x.dtypes.items() if dtype == 'object']
encoder_categories = [x[name].unique() for name in categorical_columns]

In [66]:
# One-hot encoding of the categorical columns: fitted on the training split,
# then applied to both splits.
encoder = OneHotEncoder(categories=encoder_categories, sparse=False)
X_train_encoded = encoder.fit_transform(X_train[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

# Same generated column names for both frames. No index is passed, so each
# frame gets a default RangeIndex rather than the index of X_train / X_test.
encoded_names = encoder.get_feature_names(categorical_columns)
X_train_categorical = pd.DataFrame(data=X_train_encoded, columns=encoded_names)
X_test_categorical = pd.DataFrame(data=X_test_encoded, columns=encoded_names)

In [67]:
# Feature engineering - numerical columns: every training column that is not categorical,
# in the original column order.
is_categorical = X_train.columns.isin(categorical_columns)
non_categorical_columns = X_train.columns[~is_categorical].tolist()

In [68]:
# Standardise the numerical columns of X_train; the scaler is fitted on the training split only.
std_sclr = StandardScaler()
X_train_numerical = std_sclr.fit_transform(X_train[non_categorical_columns])
# fit returns the scaler itself; this name is the one used to scale X_test.
std_sclr_trained = std_sclr
X_train_numerical_scaled = pd.DataFrame(data=X_train_numerical, columns=non_categorical_columns)

In [69]:
# Standardise the numerical columns of X_test with the scaler fitted on the training split.
X_test_numerical = std_sclr_trained.transform(X_test[non_categorical_columns])
X_test_numerical_scaled = pd.DataFrame(
    data=X_test_numerical,
    columns=non_categorical_columns,
)

In [70]:
# Training features: encoded categorical columns followed by the scaled numerical ones.
train_parts = [X_train_categorical, X_train_numerical_scaled]
X_train_transf = pd.concat(train_parts, axis="columns")

In [71]:
# Test features: encoded categorical columns followed by the scaled numerical ones.
test_parts = [X_test_categorical, X_test_numerical_scaled]
X_test_transf = pd.concat(test_parts, axis="columns")

In [88]:
# Stand-alone decision tree, configured with the same hyper-parameter values
# that the grid search reported as best.
tree_params = {
    "random_state": 42,
    "criterion": "entropy",
    "max_depth": 8,
    "min_samples_leaf": 15,
    "min_samples_split": 2,
}
model = DecisionTreeClassifier(**tree_params)

In [89]:
# Fit the tree on the manually transformed training features.
model.fit(X_train_transf, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=8, min_samples_leaf=15,
                       random_state=42)

In [90]:
# Predictions of the stand-alone tree on the transformed train and test features.
y_pred_train_transf, y_pred_test_transf = (
    model.predict(features) for features in (X_train_transf, X_test_transf)
)

In [91]:
# Model results: per-class report followed by overall accuracy,
# first for the training split and then for the test split.
for y_true, y_hat in ((y_train, y_pred_train_transf), (y_test, y_pred_test_transf)):
    print(classification_report(y_true, y_hat))
    print(accuracy_score(y_true, y_hat))

              precision    recall  f1-score   support

 ClassicRock       0.88      0.94      0.91      1520
   Classical       0.95      0.98      0.96      1083
Dance_Techno       0.00      0.00      0.00        16
       Forro       0.00      0.00      0.00         5
       Hymns       1.00      0.98      0.99       242
        Jazz       0.94      0.93      0.94       235
     NEW-AGE       0.63      0.60      0.61       127
        Punk       0.78      0.50      0.61        58
      REGGAE       0.50      0.25      0.33        32
 Samba&Bossa       0.93      0.48      0.63        52
       TANGO       0.91      0.73      0.81        59
      latina       0.66      0.56      0.61       195

    accuracy                           0.89      3624
   macro avg       0.68      0.58      0.62      3624
weighted avg       0.88      0.89      0.88      3624

0.891832229580574
              precision    recall  f1-score   support

 ClassicRock       0.84      0.89      0.87       507
   Cla

  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
# Confusion matrix of the stand-alone tree on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred_test_transf)

array([[450,   9,   0,   0,   1,   3,  13,  10,   1,   1,   0,  19],
       [  3, 349,   0,   0,   0,   0,   7,   0,   0,   0,   1,   1],
       [  4,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1],
       [  1,   0,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0],
       [  2,   2,   0,   0,  75,   0,   1,   0,   0,   0,   0,   0],
       [  7,   0,   0,   0,   0,  71,   0,   0,   0,   0,   0,   0],
       [ 10,   4,   0,   0,   0,   0,  25,   0,   1,   0,   0,   2],
       [ 13,   0,   0,   0,   0,   0,   0,   7,   0,   0,   0,   0],
       [  4,   0,   0,   0,   0,   0,   3,   0,   3,   0,   0,   1],
       [  6,   1,   0,   0,   0,   0,   1,   0,   0,   6,   1,   2],
       [  2,   5,   0,   0,   0,   0,   0,   0,   0,   0,  12,   1],
       [ 31,   1,   0,   0,   0,   2,   2,   0,   3,   0,   0,  26]],
      dtype=int64)

In [96]:
# Feature analysis: importance of every transformed column, most important first.
importance_table = pd.DataFrame(
    {'atributo': X_train_transf.columns, 'importancia': model.feature_importances_}
)
feature_importance = importance_table.sort_values(by='importancia', ascending=False)
feature_importance

Unnamed: 0,atributo,importancia
382,inst_Drum,0.407011
336,inst_Electric Piano,0.130613
245,"('Acoustic Bass', 0.0)",0.073501
233,length_note_1.125,0.047181
522,"('ride', 0.938)",0.031301
...,...,...
212,chord_2_4_7m,0.000000
211,"('Effects', 0.0)",0.000000
210,inst_Effects,0.000000
209,avg_simult_Synth Effects,0.000000
