In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.feature_selection import SelectKBest, f_classif

In [9]:
df = pd.read_csv("../../data/train/train.csv")

In [10]:
df.head(5)

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,...,version_Remix,version_type_encoded,decade,decade_range,energy_danceability_valence,acoustic_intensity,popularity_energy_ratio,valence_energy_dif,decade_label,popularity_normalized
0,1xUJ1mdLAysxNBtO9w3J2g,Stompin At The Savoy,['Benny Goodman'],195533,1/1/90,1990,0.836,0.525,0.222,0.0799,...,0,1,1990,90s-00s,0.075175,-9.25034,225.21508,0.423,2,0.370632
1,62fX8EW16l8St2yL8rMer9,In My Room - Remastered,['The Beach Boys'],134133,9/16/63,1963,0.184,0.341,0.416,6.3e-05,...,0,2,1960,50s-60s,0.070786,-1.761616,129.804572,0.083,0,2.548527
2,0vTrXo9zLWib36Us8BQ4sN,State Of Independence - Single Version,['Jon & Vangelis'],262107,1/1/84,1984,0.148,0.752,0.652,0.0455,...,0,1,1980,70s-80s,0.428526,-1.94176,59.815034,0.222,1,0.272035
3,23IoCDFHoz1uM5XELOSg1U,Chandni Raaten Pyar Ki Baaten,"['Hemant Kumar', 'Lata Mangeshkar']",208973,12/1/52,1952,0.99,0.311,0.253,0.000156,...,0,1,1950,50s-60s,0.033204,-11.81466,43.476542,0.169,0,-0.478555
4,5ZrtPphrWFzzamKn3jSnv9,"Adagio for Strings, Op. 11","['Samuel Barber', 'Philadelphia Orchestra', 'E...",463347,1955,1955,0.91,0.0626,0.129,0.78,...,0,1,1950,50s-60s,0.000281,-19.53224,302.302147,-0.0942,0,1.492568


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95958 entries, 0 to 95957
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           95958 non-null  object 
 1   name                         95958 non-null  object 
 2   artists                      95958 non-null  object 
 3   duration_ms                  95958 non-null  int64  
 4   release_date                 95958 non-null  object 
 5   year                         95958 non-null  int64  
 6   acousticness                 95958 non-null  float64
 7   danceability                 95958 non-null  float64
 8   energy                       95958 non-null  float64
 9   instrumentalness             95958 non-null  float64
 10  liveness                     95958 non-null  float64
 11  loudness                     95958 non-null  float64
 12  speechiness                  95958 non-null  float64
 13  tempo           

In [12]:
df.columns

Index(['id', 'name', 'artists', 'duration_ms', 'release_date', 'year',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'artists_frequency', 'version_type',
       'base_name', 'version_Live', 'version_Original', 'version_Remaster',
       'version_Remix', 'version_type_encoded', 'decade', 'decade_range',
       'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'decade_label',
       'popularity_normalized'],
      dtype='object')

### Todas las variables

In [13]:
X = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'version_type_encoded', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif',
       'popularity_normalized']]
y = df['decade_label']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.2, random_state=42)

print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(95958, 18)
(19191, 18)
(76767, 18)
(19191,)
(76767,)


In [17]:
model_1 = DecisionTreeClassifier(random_state=42)
model_1.fit(X_train, y_train)

In [20]:
y_pred_1 = model_1.predict(X_test)
accuracy_1 = accuracy_score(y_test, y_pred_1)
precision_1 = precision_score(y_test, y_pred_1, average='macro')
recall_1 = recall_score(y_test, y_pred_1, average='macro')
f1_1 = f1_score(y_test, y_pred_1, average='macro')

print("Accuracy:", accuracy_1)
print("Precisión (Precision):", precision_1)
print("Sensibilidad (Recall):", recall_1)
print("F1-Score:", f1_1)

Accuracy: 0.8999309599176729
Precisión (Precision): 0.8997918481956538
Sensibilidad (Recall): 0.8999029093148913
F1-Score: 0.8998446535779238


### Variables originales

In [27]:
X_2 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity_normalized']]
y_2 = df['decade_label']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_2, y, train_size=0.2, random_state=42)
model_2 = DecisionTreeClassifier(random_state=42)
model_2.fit(X_train, y_train)

print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(95958, 18)
(19191, 13)
(76767, 13)
(19191,)
(76767,)


In [29]:
y_pred_2 = model_2.predict(X_test)
accuracy_2 = accuracy_score(y_test, y_pred_2)
precision_2 = precision_score(y_test, y_pred_2, average='macro')
recall_2 = recall_score(y_test, y_pred_2, average='macro')
f1_2 = f1_score(y_test, y_pred_2, average='macro')

print("Accuracy:", accuracy_2)
print("Precisión (Precision):", precision_2)
print("Sensibilidad (Recall):", recall_2)
print("F1-Score:", f1_2)

Accuracy: 0.6260632823999895
Precisión (Precision): 0.6266393432327227
Sensibilidad (Recall): 0.6260129233902666
F1-Score: 0.6263130486486196


### SelectKBest

In [30]:
X_3 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'version_type_encoded', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif',
       'popularity_normalized']]
y_3 = df['decade_label']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_3, y_3, test_size=0.2, random_state=42)

In [80]:
k = 12
selector = SelectKBest(score_func=f_classif, k=k)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

In [81]:
selected_features = X_3.columns[selector.get_support()]
print(f"Características seleccionadas (top {k}):")
print(selected_features)

Características seleccionadas (top 12):
Index(['duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'loudness', 'speechiness', 'tempo', 'valence',
       'energy_danceability_valence', 'acoustic_intensity',
       'valence_energy_dif'],
      dtype='object')


In [82]:
model_3 = DecisionTreeClassifier(random_state=42)
model_3.fit(X_train_selected, y_train)

In [83]:
y_pred_3 = model_3.predict(X_test_selected)
accuracy_3 = accuracy_score(y_test, y_pred_3)
precision_3 = precision_score(y_test, y_pred_3, average='macro')
recall_3 = recall_score(y_test, y_pred_3, average='macro')
f1_3 = f1_score(y_test, y_pred_3, average='macro')

print("Accuracy:", accuracy_3)
print("Precisión (Precision):", precision_3)
print("Sensibilidad (Recall):", recall_3)
print("F1-Score:", f1_3)

Accuracy: 0.5798249270529388
Precisión (Precision): 0.5801726980729978
Sensibilidad (Recall): 0.5794833537115224
F1-Score: 0.579797233526118


### Feature importance

In [86]:
X_4 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'version_type_encoded', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif',
       'popularity_normalized']]
y_4 = df['decade_label']

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X_4, y_4, test_size=0.2, random_state=42)

model_4 = DecisionTreeClassifier(random_state=42)
model_4.fit(X_train, y_train)

In [92]:
importances = model_4.feature_importances_
importance_df = pd.DataFrame({'Feature': X_4.columns, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

top_features = importance_df.head(10)['Feature'].values
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

importance_df

Unnamed: 0,Feature,Importance
15,popularity_energy_ratio,0.318057
17,popularity_normalized,0.220957
3,energy,0.198062
1,acousticness,0.137942
6,loudness,0.066382
0,duration_ms,0.018302
14,acoustic_intensity,0.010897
7,speechiness,0.004829
2,danceability,0.003977
5,liveness,0.003443


In [93]:
umbral_importancia = 0.03

important_features = importance_df[importance_df['Importance'] >= umbral_importancia]['Feature']

X_filtered = X_4[important_features]

print("Características seleccionadas:")
print(important_features)


Características seleccionadas:
15    popularity_energy_ratio
17      popularity_normalized
3                      energy
1                acousticness
6                    loudness
Name: Feature, dtype: object


In [94]:
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(X_filtered, y_4, test_size=0.2, random_state=42, stratify=y_4)

model_filtered = DecisionTreeClassifier(random_state=42)
model_filtered.fit(X_train_filtered, y_train_filtered)

In [95]:
y_pred_filtered = model_filtered.predict(X_test_filtered)

accuracy_4 = accuracy_score(y_test_filtered, y_pred_filtered)
precision_4 = precision_score(y_test_filtered, y_pred_filtered, average='macro')
recall_4 = recall_score(y_test_filtered, y_pred_filtered, average='macro')
f1_4 = f1_score(y_test_filtered, y_pred_filtered, average='macro')

print("Accuracy:", accuracy_4)
print("Precisión (Precision):", precision_4)
print("Sensibilidad (Recall):", recall_4)
print("F1-Score:", f1_4)

Accuracy: 0.9704564401834097
Precisión (Precision): 0.9704715932522173
Sensibilidad (Recall): 0.9704620358781811
F1-Score: 0.9704666220533467


El modelo está overfitteando demasiado porque las variables son redundantes.

### Variables originales y nuevas sin redundancias

In [101]:
X_5 = df[['duration_ms', 'instrumentalness',
       'liveness', 'speechiness', 'tempo', 'mode',
       'key', 'energy_danceability_valence', 'acoustic_intensity',
       'valence_energy_dif', 'popularity_normalized']]
y_5 = df['decade_label']

In [102]:
X_train, X_test, y_train, y_test = train_test_split(X_5, y, train_size=0.2, random_state=42)
model_5 = DecisionTreeClassifier(random_state=42)
model_5.fit(X_train, y_train)

print(X_5.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(95958, 11)
(19191, 11)
(76767, 11)
(19191,)
(76767,)


In [103]:
y_pred_5 = model_5.predict(X_test)
accuracy_5 = accuracy_score(y_test, y_pred_5)
precision_5 = precision_score(y_test, y_pred_5, average='macro')
recall_5 = recall_score(y_test, y_pred_5, average='macro')
f1_5 = f1_score(y_test, y_pred_5, average='macro')

print("Accuracy:", accuracy_5)
print("Precisión (Precision):", precision_5)
print("Sensibilidad (Recall):", recall_5)
print("F1-Score:", f1_5)

Accuracy: 0.6772571547670223
Precisión (Precision): 0.677781467931395
Sensibilidad (Recall): 0.6772209900656586
F1-Score: 0.677472941392128


<mark>**Conclusión:**</mark> Se observa como las nuevas variables creadas aportan minimamente al modelo (de 0,62 de accuracy a 0,67). Los modelos con las variables originales tiene el mismo rendimiento y es igual de complejo. Por lo tanto, por su facilidad de interpretación y por ser más consistente a datos originales seleccionamos únicamente las variables originales.

### Hiperparametrización 1

In [100]:
X_6 = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity_normalized']]
y_6 = df['decade_label']

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X_6, y, train_size=0.2, random_state=42)
model_6 = DecisionTreeClassifier(random_state=42)
model_6.fit(X_train, y_train)

print(X_6.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(95958, 13)
(19191, 13)
(76767, 13)
(19191,)
(76767,)


In [105]:
param_grid = {'max_depth': [5, 10, 15, None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [106]:
best_model = grid_search.best_estimator_
y_pred_6 = best_model.predict(X_test)
accuracy_6 = accuracy_score(y_test, y_pred_6)
precision_6 = precision_score(y_test, y_pred_6, average='macro')
recall_6 = recall_score(y_test, y_pred_6, average='macro')
f1_6 = f1_score(y_test, y_pred_6, average='macro')

print("Accuracy:", accuracy_6)
print("Precisión (Precision):", precision_6)
print("Sensibilidad (Recall):", recall_6)
print("F1-Score:", f1_6)

Accuracy: 0.6588117289981372
Precisión (Precision): 0.6587914585651163
Sensibilidad (Recall): 0.65875353320438
F1-Score: 0.6563713398096723


### Hiperparametrización 2

In [109]:
X_7 = df[['duration_ms', 'instrumentalness',
       'liveness', 'speechiness', 'tempo', 'mode',
       'key', 'energy_danceability_valence', 'acoustic_intensity',
       'valence_energy_dif', 'popularity_normalized']]
y_7 = df['decade_label']

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X_7, y, train_size=0.2, random_state=42)
model_7 = DecisionTreeClassifier(random_state=42)
model_7.fit(X_train, y_train)

print(X_7.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(95958, 11)
(19191, 11)
(76767, 11)
(19191,)
(76767,)


In [111]:
param_grid = {'max_depth': [5, 10, 15, None],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

grid_search_2 = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='accuracy')
grid_search_2.fit(X_train, y_train)

In [112]:
best_model = grid_search_2.best_estimator_
y_pred_7 = best_model.predict(X_test)
accuracy_7 = accuracy_score(y_test, y_pred_7)
precision_7 = precision_score(y_test, y_pred_7, average='macro')
recall_7 = recall_score(y_test, y_pred_7, average='macro')
f1_7 = f1_score(y_test, y_pred_7, average='macro')

print("Accuracy:", accuracy_7)
print("Precisión (Precision):", precision_7)
print("Sensibilidad (Recall):", recall_7)
print("F1-Score:", f1_7)

Accuracy: 0.6810608725103234
Precisión (Precision): 0.6811084577013263
Sensibilidad (Recall): 0.6810201914543216
F1-Score: 0.6807280821767087
