In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

In [2]:
#Directorio.
directorio = r"C:\Users\Juan\Documents\Digital House\Git Digital House\DHDS_IV\midi_stats_v3_drop_cols_.csv"

In [3]:
#Se levanta el dataset.
data = pd.read_csv(directorio)

In [4]:
#Se eliminan los valores nulos.
data = data.fillna(value = 0)

In [5]:
#Se elimina registro que tiende a infinito.
data.drop(data[data.cant_pedales_seg > 9999999].index, inplace = True)

In [6]:
#Redondeo de floats.
for col in data.loc[:, data.dtypes == np.float64].columns:
    data[col] = data[col].apply(lambda x: round(x, 4))

In [7]:
#Visualización de columnas.
pd.options.display.max_columns = 20
#data.head(3)

In [8]:
#Matrices de features y target.
x = data.drop(columns = ["Genero", "tema", "Grupo"], axis = 1)
y = data["Genero"]

In [9]:
#Se importan las clases.
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

In [10]:
#Se crea la clase para seleccionar las columnas para el procesamiento en los pipelines.
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def transform(self, X, *_):
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(X[self.columns])
        else:
            raise TypeError("Este Transformador solo funciona en DF de Pandas")
    
    def fit(self, X, *_):
        return self

In [11]:
#Columnas categóricas / numéricas.
categorical_columns = [col for col in x.columns if x[col].dtypes == 'object']
non_categorical_columns = [col for col in x.columns if col not in categorical_columns]

In [12]:
#Código para el encoder.
categorical_columns = [col for col in x.columns if x[col].dtypes == 'object']
encoder_categories = []
for col in categorical_columns:    
    col_categories = x[col].unique()
    encoder_categories.append(col_categories)

In [13]:
#Pipe para las categóricas.
cs_categorical = ColumnSelector(categorical_columns)
cs_categorical.transform(data).head(3)
categorical_pipe = make_pipeline(ColumnSelector(categorical_columns), OneHotEncoder(categories = encoder_categories, sparse=False))

In [14]:
#Pipe para las numéricas.
cs_non_categorical = ColumnSelector(non_categorical_columns)
cs_non_categorical.transform(data).head(3)
non_categorical_pipe = make_pipeline(ColumnSelector(non_categorical_columns), StandardScaler())

In [15]:
#Unión de los pipes.
union = make_union(categorical_pipe, non_categorical_pipe)

In [16]:
#División del dataset.
X_train, X_test, y_train, y_test = train_test_split(x, y, stratify = y, random_state = 1)

In [17]:
#Generación del pipeline.
pipeline = Pipeline([('union', union), ('clf', DecisionTreeClassifier(random_state = 42, criterion = "gini"))])

In [18]:
#Fiteo del pipeline.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['first_time_signature',
                                                                                          'tonalidad',
                                                                                          'tonalidad_escala',
                                                                                          'info_tracks'])),
                                                                 ('onehotencoder',
                                                                  OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2/4', '12/8', '2/2', '6/4',
       '2/8', '1/4', '1/8', '3/8', '8/4', '9/16', '4/2', '1/...
                                                                                  

In [24]:
#Parámetros para el cross-validation.
folds = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 42)

param_grid = { "clf__criterion" : ["gini", "entropy"],
                "clf__min_samples_leaf": [5,10,15,20,None], 
                "clf__max_depth" : [1,2,3,4,5,6,8,9,10,11,12,13,14,None],
                "clf__min_samples_split": [2, 3, 4,None]}

In [25]:
#Se genera el gridsearch.
grid_search = GridSearchCV(pipeline, param_grid, cv = folds, n_jobs = -1)

In [26]:
#Se fitea el gridsearch.
grid_search.fit(X_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('union',
                                        FeatureUnion(transformer_list=[('pipeline-1',
                                                                        Pipeline(steps=[('columnselector',
                                                                                         ColumnSelector(columns=['first_time_signature',
                                                                                                                 'tonalidad',
                                                                                                                 'tonalidad_escala',
                                                                                                                 'info_tracks'])),
                                                                                        ('onehotencoder',
                                                 

In [27]:
#Mejor score del gridsearch.
grid_search.best_score_

0.9955849889624724

In [28]:
#Mejores parámetros del gridsearch.
grid_search.best_params_

{'clf__criterion': 'gini',
 'clf__max_depth': 9,
 'clf__min_samples_leaf': 5,
 'clf__min_samples_split': 2}

In [29]:
#Mejor estimador dentro de los parámetros que se pasaron.
grid_search.best_estimator_

Pipeline(steps=[('union',
                 FeatureUnion(transformer_list=[('pipeline-1',
                                                 Pipeline(steps=[('columnselector',
                                                                  ColumnSelector(columns=['first_time_signature',
                                                                                          'tonalidad',
                                                                                          'tonalidad_escala',
                                                                                          'info_tracks'])),
                                                                 ('onehotencoder',
                                                                  OneHotEncoder(categories=[array(['3/4', '6/8', '4/4', '3/2', '9/8', '2/4', '12/8', '2/2', '6/4',
       '2/8', '1/4', '1/8', '3/8', '8/4', '9/16', '4/2', '1/...
                                                                                  

In [30]:
#Predicción del gridsearch en train.
y_pred_train = grid_search.best_estimator_.predict(X_train)

In [31]:
#Predicción del gridsearch en test.
y_pred_test = grid_search.best_estimator_.predict(X_test)

In [32]:
#Resultados del modelo.
print(classification_report(y_train, y_pred_train))
print(accuracy_score(y_train, y_pred_train))
print(classification_report(y_test, y_pred_test))
print(accuracy_score(y_test, y_pred_test))

              precision    recall  f1-score   support

 ClassicRock       1.00      1.00      1.00      1520
   Classical       1.00      1.00      1.00      1083
Dance_Techno       1.00      1.00      1.00        16
       Forro       1.00      1.00      1.00         5
       Hymns       1.00      1.00      1.00       242
        Jazz       1.00      1.00      1.00       235
     NEW-AGE       1.00      1.00      1.00       127
        Punk       1.00      1.00      1.00        58
      REGGAE       1.00      1.00      1.00        32
 Samba&Bossa       1.00      1.00      1.00        52
       TANGO       1.00      1.00      1.00        59
      latina       1.00      1.00      1.00       195

    accuracy                           1.00      3624
   macro avg       1.00      1.00      1.00      3624
weighted avg       1.00      1.00      1.00      3624

1.0
              precision    recall  f1-score   support

 ClassicRock       1.00      1.00      1.00       507
   Classical       1

In [33]:
confusion_matrix(y_test, y_pred_test)

array([[506,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0, 361,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   4,   0,   1,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   2,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,  80,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,  78,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,  42,   0,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  20,   0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,  10,   1,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  17,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  20,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  65]],
      dtype=int64)

In [35]:
model = DecisionTreeClassifier(random_state = 42, criterion = "gini", max_depth = 9, min_samples_leaf = 5, min_samples_split = 2)

In [36]:
model.fit(X_train, y_train)

ValueError: could not convert string to float: '4/4'

In [34]:
pd.DataFrame({'atributo':x.columns, 'importancia':grid_search.best_estimator_.feature_importances_}).sort_values('importancia', ascending = False)

AttributeError: 'Pipeline' object has no attribute 'feature_importances_'