### RandomForest

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("../data/processed/processed_no_dummies.csv", index_col=0)

In [3]:
df.head(5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,...,key,popularity,explicit,version_type_encoded,decade,energy_danceability_valence,acoustic_intensity,popularity_energy_ratio,valence_energy_dif,decade_range
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],220560,11/1/66,1966,0.525,0.6,0.54,0.00305,...,9,26,0,1,1960,0.177228,-6.196575,48.147257,0.007,50s-60s
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],157840,2/28/83,1983,0.228,0.368,0.48,0.707,...,8,21,0,1,1980,0.059704,-2.64594,43.749089,-0.142,70s-80s
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],226200,8/20/83,1983,0.0998,0.272,0.684,0.0145,...,8,41,0,0,1980,0.051907,-0.970854,59.940644,-0.405,70s-80s
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,...,8,37,0,2,1980,0.06268,-1.723275,67.888663,-0.235,70s-80s
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,...,8,35,0,2,1980,0.06268,-1.723275,64.219005,-0.235,70s-80s


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 139848 entries, 0 to 169906
Data columns (total 26 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           139848 non-null  object 
 1   name                         139848 non-null  object 
 2   artists                      139848 non-null  object 
 3   duration_ms                  139848 non-null  int64  
 4   release_date                 139848 non-null  object 
 5   year                         139848 non-null  int64  
 6   acousticness                 139848 non-null  float64
 7   danceability                 139848 non-null  float64
 8   energy                       139848 non-null  float64
 9   instrumentalness             139848 non-null  float64
 10  liveness                     139848 non-null  float64
 11  loudness                     139848 non-null  float64
 12  speechiness                  139848 non-null  float64
 13  temp

In [5]:
df = df[(df['decade'] >= 1950) & (df['decade'] < 2010)]

In [6]:
df['popularity_normalized'] = (df['popularity'] - df.groupby('decade')['popularity'].transform('mean')) / df.groupby('decade')['popularity'].transform('std')

In [7]:
df.head(5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,...,popularity,explicit,version_type_encoded,decade,energy_danceability_valence,acoustic_intensity,popularity_energy_ratio,valence_energy_dif,decade_range,popularity_normalized
0,0gNNToCW3qjabgTyBSjt3H,!Que Vida! - Mono Version,['Love'],220560,11/1/66,1966,0.525,0.6,0.54,0.00305,...,26,0,1,1960,0.177228,-6.196575,48.147257,0.007,50s-60s,0.030105
1,0tMgFpOrXZR6irEOLNWwJL,"""40""",['U2'],157840,2/28/83,1983,0.228,0.368,0.48,0.707,...,21,0,1,1980,0.059704,-2.64594,43.749089,-0.142,70s-80s,-1.498349
2,2ZywW3VyVx6rrlrX75n3JB,"""40"" - Live",['U2'],226200,8/20/83,1983,0.0998,0.272,0.684,0.0145,...,41,0,0,1980,0.051907,-0.970854,59.940644,-0.405,70s-80s,0.374453
3,6DdWA7D1o5TU2kXWyCLcch,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,...,37,0,2,1980,0.06268,-1.723275,67.888663,-0.235,70s-80s,-0.000108
4,3vMmwsAiLDCfyc1jl76lQE,"""40"" - Remastered 2008",['U2'],157667,2/28/83,1983,0.185,0.371,0.545,0.582,...,35,0,2,1980,0.06268,-1.723275,64.219005,-0.235,70s-80s,-0.187388


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 119948 entries, 0 to 169906
Data columns (total 27 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           119948 non-null  object 
 1   name                         119948 non-null  object 
 2   artists                      119948 non-null  object 
 3   duration_ms                  119948 non-null  int64  
 4   release_date                 119948 non-null  object 
 5   year                         119948 non-null  int64  
 6   acousticness                 119948 non-null  float64
 7   danceability                 119948 non-null  float64
 8   energy                       119948 non-null  float64
 9   instrumentalness             119948 non-null  float64
 10  liveness                     119948 non-null  float64
 11  loudness                     119948 non-null  float64
 12  speechiness                  119948 non-null  float64
 13  temp

In [9]:
df.columns

Index(['id', 'name', 'artists', 'duration_ms', 'release_date', 'year',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'version_type_encoded', 'decade',
       'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'decade_range',
       'popularity_normalized'],
      dtype='object')

## RandomForestClassifier

### Baseline: todas las variables

In [22]:
X = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'explicit', 'popularity_energy_ratio', 'popularity_normalized']]
y = df['decade_range']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(119948, 15)
(23989, 15)
(95959, 15)
(23989,)
(95959,)


In [24]:
model_1 = RandomForestClassifier(random_state=42)
model_1.fit(X_train, y_train)

In [25]:
y_pred_1 = model_1.predict(X_test)
accuracy_1 = accuracy_score(y_test, y_pred_1)
precision_1 = precision_score(y_test, y_pred_1, average='macro')
recall_1 = recall_score(y_test, y_pred_1, average='macro')
f1_1 = f1_score(y_test, y_pred_1, average='macro')

print("Accuracy del modelo RandomForest:", accuracy_1)
print("Precisión (Precision):", precision_1)
print("Sensibilidad (Recall):", recall_1)
print("F1-Score:", f1_1)

Accuracy del modelo RandomForest: 0.8428599714461384
Precisión (Precision): 0.8437161911137361
Sensibilidad (Recall): 0.8428644090990433
F1-Score: 0.8429865281317572


### RandomForest 2

In [62]:
X_2 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity_normalized', 'explicit']]
y = df['decade_range']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y, train_size=0.2, random_state=42)
model_2 = RandomForestClassifier(random_state=42)
model_2.fit(X_train, y_train)

print(X_2.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(119948, 14)
(23989, 14)
(95959, 14)
(23989,)
(95959,)


In [64]:
y_pred_2 = model_2.predict(X_test)
accuracy_2 = accuracy_score(y_test, y_pred_2)
precision_2 = precision_score(y_test, y_pred_2, average='macro')
recall_2 = recall_score(y_test, y_pred_2, average='macro')
f1_2 = f1_score(y_test, y_pred_2, average='macro')

print("Accuracy del modelo RandomForest 2:", accuracy_2)
print("Precisión (Precision):", precision_2)
print("Sensibilidad (Recall):", recall_2)
print("F1-Score:", f1_2)

Accuracy del modelo RandomForest 2: 0.6912952406757052
Precisión (Precision): 0.6907186833431335
Sensibilidad (Recall): 0.6913092441969292
F1-Score: 0.6881024598627129


### RandomForest 3

In [19]:
X_3 = df[['duration_ms', 'instrumentalness',
       'liveness', 'speechiness', 'tempo', 'mode',
       'key', 'explicit', 'version_type_encoded',
       'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif']]
y = df['decade_range']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y, train_size=0.2, random_state=42)
model_3 = RandomForestClassifier(random_state=42)
model_3.fit(X_train, y_train)

print(X_3.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(119948, 13)
(23989, 13)
(95959, 13)
(23989,)
(95959,)


In [21]:
y_pred_3 = model_3.predict(X_test)
accuracy_3 = accuracy_score(y_test, y_pred_3)
precision_3 = precision_score(y_test, y_pred_3, average='macro')
recall_3 = recall_score(y_test, y_pred_3, average='macro')
f1_3 = f1_score(y_test, y_pred_3, average='macro')

print("Accuracy del modelo RandomForest 3:", accuracy_3)
print("Precisión (Precision):", precision_3)
print("Sensibilidad (Recall):", recall_3)
print("F1-Score:", f1_3)

Accuracy del modelo RandomForest 3: 0.7160141310351296
Precisión (Precision): 0.7140278246212115
Sensibilidad (Recall): 0.7160112949069176
F1-Score: 0.7147654262363862


### DecisionTree 4

In [22]:
X_4 = df[['duration_ms', 'acousticness',
       'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif']]
y = df['decade_range']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y, train_size=0.2, random_state=42)
model_4 = RandomForestClassifier(random_state=42)
model_4.fit(X_train, y_train)

print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(119948, 19)
(23989, 6)
(95959, 6)
(23989,)
(95959,)


In [24]:
y_pred_4 = model_4.predict(X_test)
accuracy_4 = accuracy_score(y_test, y_pred_4)
precision_4 = precision_score(y_test, y_pred_4, average='macro')
recall_4 = recall_score(y_test, y_pred_4, average='macro')
f1_4 = f1_score(y_test, y_pred_4, average='macro')

print("Accuracy del modelo Decision Tree 2:", accuracy_4)
print("Precisión (Precision):", precision_4)
print("Sensibilidad (Recall):", recall_4)
print("F1-Score:", f1_4)

Accuracy del modelo Decision Tree 2: 0.7120332642065882
Precisión (Precision): 0.710831272709772
Sensibilidad (Recall): 0.7120229485106934
F1-Score: 0.7112783595808335


### RandomForest 5: SelecKBest

In [25]:
X_5 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'version_type_encoded',
       'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif']]
y = df['decade_range']

In [26]:
k_values = range(1, X_5.shape[1] + 1)

In [39]:
k = 15
selector = SelectKBest(score_func=f_classif, k=k)
X_new = selector.fit_transform(X_5, y)
selected_features = X_5.columns[selector.get_support()]

In [40]:
print(f"\nCaracterísticas seleccionadas con SelectKBest (top {k}):")
print(selected_features)


Características seleccionadas con SelectKBest (top 15):
Index(['duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'explicit', 'energy_danceability_valence',
       'acoustic_intensity', 'valence_energy_dif'],
      dtype='object')


In [41]:
X_k = df[['duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'popularity', 'explicit', 'energy_danceability_valence',
       'acoustic_intensity', 'valence_energy_dif']]
y = df['decade_range']

In [42]:
X_train_k, X_test_k, y_train, y_test = train_test_split(X_k, y, train_size=0.2, random_state=42)
model_5 = RandomForestClassifier(random_state=42)
model_5.fit(X_train_k, y_train)

In [43]:
y_pred_5 = model_5.predict(X_test_k)
accuracy_5 = accuracy_score(y_test, y_pred_5)
precision_5 = precision_score(y_test, y_pred_5, average='macro')
recall_5 = recall_score(y_test, y_pred_5, average='macro')
f1_5 = f1_score(y_test, y_pred_5, average='macro')

print("Accuracy del modelo Decision Tree:", accuracy_5)
print("Precisión (Precision):", precision_5)
print("Sensibilidad (Recall):", recall_5)
print("F1-Score:", f1_5)

Accuracy del modelo Decision Tree: 0.753707312498046
Precisión (Precision): 0.7562596030998856
Sensibilidad (Recall): 0.753696958017783
F1-Score: 0.7546291511580875


### RandomForest 6: Hiperparametrización

In [47]:
X_6 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit']]
y = df['decade_range']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y, train_size=0.2, random_state=42)
model_2 = RandomForestClassifier(random_state=42)
model_2.fit(X_train, y_train)

print(X_6.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(119948, 14)
(23989, 14)
(95959, 14)
(23989,)
(95959,)


In [49]:
param_grid = {
    "n_estimators" : [20, 50, 100],
    "max_depth": np.arange(3,8),
    "max_features": np.arange(2,5),
    "max_leaf_nodes": [10,15,20]}

grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=20; total time=   0.1s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=20; total time=   0.2s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=20; total time=   0.3s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=20; total time=   0.2s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=20; total time=   0.2s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=50; total time=   0.6s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=50; total time=   0.7s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=50; total time=   0.7s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=50; total time=   0.7s
[CV] END max_depth=3, max_features=2, max_leaf_nodes=10, n_estimators=50; total time=   0.7s
[CV] EN

  _data = np.array(data, dtype=dtype, copy=copy,


In [50]:
best_model = grid_search.best_estimator_
y_pred_6 = best_model.predict(X_test)
accuracy_6 = accuracy_score(y_test, y_pred_6)
precision_6 = precision_score(y_test, y_pred_6, average='macro')
recall_6 = recall_score(y_test, y_pred_6, average='macro')
f1_6 = f1_score(y_test, y_pred_6, average='macro')

print("Accuracy del modelo RandomForest:", accuracy_6)
print("Precisión (Precision):", precision_6)
print("Sensibilidad (Recall):", recall_6)
print("F1-Score:", f1_6)

Accuracy del modelo RandomForest: 0.7315207536552069
Precisión (Precision): 0.7331739713291262
Sensibilidad (Recall): 0.7315123471728437
F1-Score: 0.7322094320075087
