In [17]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt  # NOTE(review): binds the top-level matplotlib package to `plt`; plotting code usually expects matplotlib.pyplot - confirm

In [18]:
# Location of the input CSV.
# The path can be overridden with the MIDI_STATS_CSV environment variable so the
# notebook runs on other machines; when the variable is unset, the original
# hard-coded local path is used unchanged.
directorio = os.environ.get(
    "MIDI_STATS_CSV",
    r"C:\Users\Juan\Documents\Digital House\Git Digital House\DHDS_IV\midi_stats_v3_drop_cols_.csv",
)

In [19]:
# Load the dataset from the CSV file referenced by `directorio`.
data = pd.read_csv(filepath_or_buffer = directorio)

In [20]:
# Missing values are not dropped: every null cell is replaced with 0.
data = data.fillna(0)

In [21]:
# Remove the record(s) whose cant_pedales_seg exceeds 9,999,999
# (original note: the value of that record tends to infinity).
outlier_index = data.loc[data["cant_pedales_seg"] > 9999999].index
data = data.drop(index = outlier_index)

In [22]:
# Round every float64 column to 4 decimal places.
# DataFrame.round is vectorised, so the whole frame is rounded in one call
# instead of invoking Python's round() once per cell through Series.apply.
# Columns of any other dtype are not listed in the mapping and stay untouched.
float_columns = data.select_dtypes(include = [np.float64]).columns
data = data.round({col: 4 for col in float_columns})

In [23]:
# Display configuration: show at most 20 columns when rendering a DataFrame.
pd.set_option("display.max_columns", 20)
# data.head(3)  # uncomment to preview the first rows

In [24]:
# Feature matrix and target vector.
# The target ("Genero") and the columns listed below are excluded from the features.
excluded_columns = ["Genero", "tema", "Grupo", "info_tracks", "Unnamed: 0"]
x = data.drop(columns = excluded_columns)
y = data["Genero"]

In [49]:
#Se importan las clases.
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFECV
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.base import BaseEstimator, TransformerMixin

In [26]:
# Split the feature names into categorical (object dtype) and non-categorical columns.
categorical_columns = [name for name, dtype in x.dtypes.items() if dtype == 'object']
categorical_lookup = set(categorical_columns)
non_categorical_columns = [name for name in x.columns if name not in categorical_lookup]

In [27]:
# Encoder inputs: the object-dtype columns of `x` and, for each of them,
# the array of distinct values observed in the full feature matrix.
categorical_columns = [name for name, dtype in x.dtypes.items() if dtype == 'object']
encoder_categories = [x[name].unique() for name in categorical_columns]

In [28]:
# Split the dataset into train and test partitions, stratified on the target,
# with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state = 1,
    stratify = y,
)

In [29]:
# Feature engineering - categorical columns.
# Collects the object-dtype columns of `x` and the distinct values of each one.
# NOTE(review): this repeats the computation of the earlier encoder-setup cell;
# `x` does not appear to be modified in between, so the values should match - confirm.
categorical_columns = [name for name, dtype in x.dtypes.items() if dtype == 'object']
encoder_categories = [x[name].unique() for name in categorical_columns]
# encoder_categories  # uncomment to inspect

In [30]:
# One-hot encoding of the categorical columns.
# The encoder is fitted on the training partition only and then applied to both
# partitions. The encoded arrays are wrapped in DataFrames without passing an
# index, so both frames get a fresh default index.
# NOTE(review): `sparse` and `get_feature_names` were renamed in later
# scikit-learn releases (`sparse_output`, `get_feature_names_out`) - confirm
# against the installed version before upgrading.
encoder = OneHotEncoder(categories = encoder_categories, sparse = False).fit(X_train[categorical_columns])
encoded_column_names = encoder.get_feature_names(categorical_columns)

X_train_encoded = encoder.transform(X_train[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

X_train_categorical = pd.DataFrame(X_train_encoded, columns = encoded_column_names)
X_test_categorical = pd.DataFrame(X_test_encoded, columns = encoded_column_names)
# X_train_categorical.head(3)  # uncomment to preview
# X_test_categorical.head(3)   # uncomment to preview

In [31]:
# Feature engineering - numerical columns:
# every training column that is not in the categorical list.
categorical_lookup = set(categorical_columns)
non_categorical_columns = [name for name in X_train.columns if name not in categorical_lookup]
# non_categorical_columns  # uncomment to inspect

In [32]:
# Standardisation of the training partition.
# The scaler is fitted on the numerical training columns and then used to
# transform those same columns; the result is wrapped in a DataFrame that
# keeps the column names and gets a fresh default index.
std_sclr = StandardScaler()
train_numeric = X_train[non_categorical_columns]
std_sclr_trained = std_sclr.fit(train_numeric)
X_train_numerical = std_sclr_trained.transform(train_numeric)
X_train_numerical_scaled = pd.DataFrame(data = X_train_numerical, columns = non_categorical_columns)
# X_train_numerical_scaled.head()  # uncomment to preview

In [33]:
# Standardisation of the test partition, reusing the scaler fitted on the training data.
test_numeric = X_test[non_categorical_columns]
X_test_numerical = std_sclr_trained.transform(test_numeric)
X_test_numerical_scaled = pd.DataFrame(data = X_test_numerical, columns = non_categorical_columns)
# X_test_numerical_scaled.head()  # uncomment to preview

In [34]:
# Training features: encoded categorical columns followed by the scaled
# numerical columns, joined side by side.
train_parts = [X_train_categorical, X_train_numerical_scaled]
X_train_transf = pd.concat(train_parts, axis = "columns")
# Shape checks (uncomment to inspect):
# print(X_train_categorical.shape)
# print(X_train_numerical_scaled.shape)
# print(X_train_transf.shape)

In [35]:
# Test features: encoded categorical columns followed by the scaled
# numerical columns, joined side by side.
test_parts = [X_test_categorical, X_test_numerical_scaled]
X_test_transf = pd.concat(test_parts, axis = "columns")
# Shape checks (uncomment to inspect):
# print(X_test_categorical.shape)
# print(X_test_numerical_scaled.shape)
# print(X_test_transf.shape)

In [36]:
# Instantiate the decision tree used as the base learner.
clftree = DecisionTreeClassifier(
    criterion = "entropy",
    max_depth = 8,
    min_samples_split = 2,
    min_samples_leaf = 15,
    random_state = 42,
)

In [37]:
# Instantiate the bagging ensemble: 1000 bootstrapped copies of `clftree`,
# trained using all available cores (n_jobs = -1) with a fixed seed.
bag_clf = BaggingClassifier(
    base_estimator = clftree,
    n_estimators = 1000,
    bootstrap = True,
    random_state = 42,
    n_jobs = -1,
)

In [39]:
# Fit the ensemble on the transformed training features.
bag_clf.fit(X = X_train_transf, y = y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(criterion='entropy',
                                                        max_depth=8,
                                                        min_samples_leaf=15,
                                                        random_state=42),
                  n_estimators=1000, n_jobs=-1, random_state=42)

In [41]:
# Predictions on the training and test partitions, in that order.
y_pred_train_transf, y_pred_test_transf = (
    bag_clf.predict(features) for features in (X_train_transf, X_test_transf)
)

In [42]:
# Model results: classification report followed by accuracy, first for the
# training partition and then for the test partition.
# Some classes never receive a prediction, which makes precision undefined and
# triggers an UndefinedMetricWarning; zero_division = 0 reports those metrics as
# 0 (the same value the default prints) without emitting the warning.
for y_true, y_pred in ((y_train, y_pred_train_transf), (y_test, y_pred_test_transf)):
    print(classification_report(y_true, y_pred, zero_division = 0))
    print(accuracy_score(y_true, y_pred))

              precision    recall  f1-score   support

 ClassicRock       0.86      0.98      0.92      1520
   Classical       0.93      0.99      0.96      1083
Dance_Techno       0.00      0.00      0.00        16
       Forro       0.00      0.00      0.00         5
       Hymns       0.96      0.99      0.98       242
        Jazz       0.99      0.93      0.96       235
     NEW-AGE       0.83      0.54      0.65       127
        Punk       1.00      0.45      0.62        58
      REGGAE       0.00      0.00      0.00        32
 Samba&Bossa       1.00      0.54      0.70        52
       TANGO       0.92      0.75      0.82        59
      latina       0.93      0.47      0.62       195

    accuracy                           0.90      3624
   macro avg       0.70      0.55      0.60      3624
weighted avg       0.89      0.90      0.89      3624

0.9036975717439294
              precision    recall  f1-score   support

 ClassicRock       0.83      0.94      0.88       507
   Cl

  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# Confusion matrix for the test partition (first argument: true labels,
# second argument: predicted labels).
confusion_matrix(y_true = y_test, y_pred = y_pred_test_transf)

array([[479,  18,   0,   0,   2,   2,   4,   0,   0,   1,   0,   1],
       [  1, 355,   0,   0,   0,   0,   4,   0,   0,   0,   1,   0],
       [  5,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  2,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0,   3,   0,   0,  77,   0,   0,   0,   0,   0,   0,   0],
       [  5,   1,   0,   0,   0,  72,   0,   0,   0,   0,   0,   0],
       [ 12,   7,   0,   0,   1,   0,  22,   0,   0,   0,   0,   0],
       [ 14,   0,   0,   0,   0,   0,   0,   6,   0,   0,   0,   0],
       [ 11,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  6,   1,   0,   0,   0,   0,   0,   0,   0,   8,   1,   1],
       [  2,   5,   0,   0,   0,   0,   1,   0,   0,   0,  12,   0],
       [ 42,   1,   0,   0,   0,   2,   1,   0,   0,   1,   0,  18]],
      dtype=int64)

In [55]:
# Feature analysis: average the feature importances of every fitted tree in the
# ensemble and list the attributes from most to least important.
per_tree_importances = [tree.feature_importances_ for tree in bag_clf.estimators_]
feature_importance = pd.DataFrame({
    'atributo': X_train_transf.columns,
    'importancia': np.mean(per_tree_importances, axis = 0),
}).sort_values(by = 'importancia', ascending = False)
feature_importance

Unnamed: 0,atributo,importancia
382,inst_Drum,0.408691
336,inst_Electric Piano,0.086279
337,"('Electric Piano', 0.0)",0.045388
522,"('ride', 0.938)",0.045296
245,"('Acoustic Bass', 0.0)",0.040230
...,...,...
110,"('Harpsicord', 0.375)",0.000000
109,"('Harpsicord', 0.5)",0.000000
387,"('Tom', 0.375)",0.000000
389,"('Tom', 0.625)",0.000000
