In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC, LinearSVC
import warnings
from sklearn.exceptions import ConvergenceWarning 

In [2]:
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
df = pd.read_csv("../../data/train/train.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95958 entries, 0 to 95957
Data columns (total 35 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           95958 non-null  object 
 1   name                         95958 non-null  object 
 2   artists                      95958 non-null  object 
 3   duration_ms                  95958 non-null  int64  
 4   release_date                 95958 non-null  object 
 5   year                         95958 non-null  int64  
 6   acousticness                 95958 non-null  float64
 7   danceability                 95958 non-null  float64
 8   energy                       95958 non-null  float64
 9   instrumentalness             95958 non-null  float64
 10  liveness                     95958 non-null  float64
 11  loudness                     95958 non-null  float64
 12  speechiness                  95958 non-null  float64
 13  tempo           

In [5]:
df.head()

Unnamed: 0,id,name,artists,duration_ms,release_date,year,acousticness,danceability,energy,instrumentalness,...,version_Remix,version_type_encoded,decade,decade_range,energy_danceability_valence,acoustic_intensity,popularity_energy_ratio,valence_energy_dif,decade_label,popularity_normalized
0,1xUJ1mdLAysxNBtO9w3J2g,Stompin At The Savoy,['Benny Goodman'],195533,1/1/90,1990,0.836,0.525,0.222,0.0799,...,0,1,1990,90s-00s,0.075175,-9.25034,225.21508,0.423,2,0.370632
1,62fX8EW16l8St2yL8rMer9,In My Room - Remastered,['The Beach Boys'],134133,9/16/63,1963,0.184,0.341,0.416,6.3e-05,...,0,2,1960,50s-60s,0.070786,-1.761616,129.804572,0.083,0,2.548527
2,0vTrXo9zLWib36Us8BQ4sN,State Of Independence - Single Version,['Jon & Vangelis'],262107,1/1/84,1984,0.148,0.752,0.652,0.0455,...,0,1,1980,70s-80s,0.428526,-1.94176,59.815034,0.222,1,0.272035
3,23IoCDFHoz1uM5XELOSg1U,Chandni Raaten Pyar Ki Baaten,"['Hemant Kumar', 'Lata Mangeshkar']",208973,12/1/52,1952,0.99,0.311,0.253,0.000156,...,0,1,1950,50s-60s,0.033204,-11.81466,43.476542,0.169,0,-0.478555
4,5ZrtPphrWFzzamKn3jSnv9,"Adagio for Strings, Op. 11","['Samuel Barber', 'Philadelphia Orchestra', 'E...",463347,1955,1955,0.91,0.0626,0.129,0.78,...,0,1,1950,50s-60s,0.000281,-19.53224,302.302147,-0.0942,0,1.492568


In [6]:
df.columns

Index(['id', 'name', 'artists', 'duration_ms', 'release_date', 'year',
       'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'artists_frequency', 'version_type',
       'base_name', 'version_Live', 'version_Original', 'version_Remaster',
       'version_Remix', 'version_type_encoded', 'decade', 'decade_range',
       'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'decade_label',
       'popularity_normalized'],
      dtype='object')

### SVM

#### Baseline

In [7]:
X = df[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'artists_frequency',
       'version_Live', 'version_Original', 'version_Remaster',
       'version_Remix', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'popularity_normalized']]
y = df['decade_label']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(95958, 24)
(76766, 24)
(19192, 24)
(76766,)
(19192,)


In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
k = 12
selector = SelectKBest(score_func=f_classif, k=k)

X_train_scaled_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_scaled_selected = selector.transform(X_test_scaled)

selected_features = X.columns[selector.get_support()]
selected_features

Index(['duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'loudness', 'popularity', 'explicit',
       'artists_frequency', 'energy_danceability_valence',
       'acoustic_intensity', 'valence_energy_dif'],
      dtype='object')

Vamos como entre las variables elegidas no hay ninguna variable dummy, por lo que podemos interpretar que no tienen una relevancia importante en el modelo. 

In [11]:
svm_model = LinearSVC(random_state=42, max_iter=20000)
svm_model.fit(X_train_scaled_selected, y_train)

In [12]:
y_pred = svm_model.predict(X_test_scaled_selected)

accuracy_1 = accuracy_score(y_test, y_pred)
precision_1 = precision_score(y_test, y_pred, average='macro')
recall_1 = recall_score(y_test, y_pred, average='macro')
f1_1 = f1_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy_1)
print("Precision:", precision_1)
print("Recall:", recall_1)
print("F1-Score:", f1_1)

Accuracy: 0.7167048770320967
Precision: 0.7102353865231539
Recall: 0.7157955589626153
F1-Score: 0.7083665119535851


#### SVM 2

Probamos a crear una muestra debido al tiempo de procesamiento del SVC.

In [13]:
sample_size = 20000
df_sample = df.sample(n=sample_size, random_state=42)

In [14]:
X_sample = df_sample[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'artists_frequency',
       'version_Live', 'version_Original', 'version_Remaster',
       'version_Remix', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'popularity_normalized']]
y_sample = df_sample['decade_label']

In [15]:
X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42, stratify=y_sample)

print(X_sample.shape)
print(X_train_sample.shape)
print(X_test_sample.shape)
print(y_train_sample.shape)
print(y_test_sample.shape)

(20000, 24)
(16000, 24)
(4000, 24)
(16000,)
(4000,)


In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sample)
X_test_scaled = scaler.transform(X_test_sample)

In [17]:
k = 12
selector = SelectKBest(score_func=f_classif, k=k)

X_train_scaled_selected = selector.fit_transform(X_train_scaled, y_train_sample)
X_test_scaled_selected = selector.transform(X_test_scaled)

selected_features = X_sample.columns[selector.get_support()]
selected_features

Index(['duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'loudness', 'popularity', 'explicit',
       'artists_frequency', 'energy_danceability_valence',
       'acoustic_intensity', 'valence_energy_dif'],
      dtype='object')

In [18]:
svm_model = SVC(random_state=42, max_iter=20000)
svm_model.fit(X_train_scaled_selected, y_train_sample)

In [19]:
y_pred_2 = svm_model.predict(X_test_scaled_selected)

accuracy_2 = accuracy_score(y_test_sample, y_pred_2)
precision_2 = precision_score(y_test_sample, y_pred_2, average='macro')
recall_2 = recall_score(y_test_sample, y_pred_2, average='macro')
f1_2 = f1_score(y_test_sample, y_pred_2, average='macro')

print("Accuracy:", accuracy_2)
print("Precision:", precision_2)
print("Recall:", recall_2)
print("F1-Score:", f1_2)

Accuracy: 0.768
Precision: 0.7705668326886316
Recall: 0.7677469676240251
F1-Score: 0.7687561905525978


#### SVM 3

In [20]:
sample_size = 20000
df_sample_2 = df.sample(n=sample_size, random_state=42)

In [21]:
X_sample_2 = df_sample_2[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'explicit', 'version_type_encoded', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'popularity_normalized']]
y_sample_2 = df_sample_2['decade_label']

In [22]:
X_train_sample_2, X_test_sample_2, y_train_sample_2, y_test_sample_2 = train_test_split(X_sample_2, y_sample_2, test_size=0.2, random_state=42, stratify=y_sample)

print(X_sample_2.shape)
print(X_train_sample_2.shape)
print(X_test_sample_2.shape)
print(y_train_sample_2.shape)
print(y_test_sample_2.shape)

(20000, 19)
(16000, 19)
(4000, 19)
(16000,)
(4000,)


In [23]:
scaler = StandardScaler()
X_train_scaled_2 = scaler.fit_transform(X_train_sample_2)
X_test_scaled_2 = scaler.transform(X_test_sample_2)

In [None]:
# k = 14
# selector = SelectKBest(score_func=f_classif, k=k)

# X_train_scaled_selected_2 = selector.fit_transform(X_train_scaled_2, y_train_sample_2)
# X_test_scaled_selected_2 = selector.transform(X_test_scaled_2)

# selected_features = X_sample_2.columns[selector.get_support()]
# selected_features

In [25]:
svm_model_2 = SVC(kernel='rbf', random_state=42, max_iter=20000)
svm_model_2.fit(X_train_scaled_2, y_train_sample_2)

In [26]:
y_pred_3 = svm_model_2.predict(X_test_scaled_2)

accuracy_3 = accuracy_score(y_test_sample_2, y_pred_3)
precision_3 = precision_score(y_test_sample_2, y_pred_3, average='macro')
recall_3 = recall_score(y_test_sample_2, y_pred_3, average='macro')
f1_3 = f1_score(y_test_sample_2, y_pred_3, average='macro')

print("Accuracy:", accuracy_3)
print("Precision:", precision_3)
print("Recall:", recall_3)
print("F1-Score:", f1_3)

Accuracy: 0.72975
Precision: 0.7257123966490715
Recall: 0.7292456211264176
F1-Score: 0.7258240492941862


### SVM Outliers e hiperparametrización

In [93]:
sample_size = 20000
df_sample_5 = df.sample(n=sample_size, random_state=42)

In [94]:
X_5 = df_sample_5[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'popularity', 'explicit', 'artists_frequency',
       'version_Live', 'version_Original', 'version_Remaster',
       'version_Remix', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'popularity_normalized']]
y_5 = df_sample_5['decade_label']

In [95]:
# Cálculo de outliers
Q1 = X_5.quantile(0.25)
Q3 = X_5.quantile(0.75)
IQR = Q3 - Q1

outlier_mask = (X_5 < (Q1 - 1.5 * IQR)) | (X_5 > (Q3 + 1.5 * IQR))

print("Número de outliers por columna:")
print(outlier_mask.sum().sort_values(ascending=False))


print("\nPorcentaje de outliers por columna:")
print(((outlier_mask.sum() / len(X)) * 100).sort_values(ascending=False))


Número de outliers por columna:
instrumentalness               4259
speechiness                    2529
artists_frequency              1962
popularity_energy_ratio        1840
version_Original               1522
liveness                       1517
explicit                        974
duration_ms                     971
version_Live                    722
version_Remaster                705
loudness                        489
popularity_normalized           423
acoustic_intensity              394
valence_energy_dif              327
tempo                           172
energy_danceability_valence     114
version_Remix                    95
popularity                       16
danceability                      9
energy                            0
acousticness                      0
mode                              0
key                               0
valence                           0
dtype: int64

Porcentaje de outliers por columna:
instrumentalness               3.550705
speechiness   

In [100]:
X_scaled_5 = X_5.copy()
scaler = RobustScaler()
X_scaled_5 = scaler.fit_transform(X_scaled_5)
X_scaled_5 = pd.DataFrame(X_scaled_5, columns=X_5.columns)


In [101]:
X_train_5, X_test_5, y_train_5, y_test_5 = train_test_split(X_scaled_5, y_5, test_size=0.2, random_state=42, stratify=y_5)

print(X_5.shape)
print(X_train_5.shape)
print(X_test_5.shape)
print(y_train_5.shape)
print(y_test_5.shape)

(20000, 24)
(16000, 24)
(4000, 24)
(16000,)
(4000,)


In [102]:
k = 14
selector = SelectKBest(score_func=f_classif, k=k)

X_train_scaled_selected_5 = selector.fit_transform(X_train_5, y_train_5)
X_test_scaled_selected_5 = selector.transform(X_test_5)
selected_features

Index(['duration_ms', 'acousticness', 'danceability', 'energy',
       'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo',
       'valence', 'explicit', 'energy_danceability_valence',
       'acoustic_intensity', 'valence_energy_dif'],
      dtype='object')

In [103]:
svm_model_5 = SVC(kernel='rbf', random_state=42, max_iter=20000)
svm_model_5.fit(X_train_scaled_selected_5, y_train_5) 

In [104]:
y_pred_5 = svm_model_5.predict(X_test_scaled_selected_5)

accuracy_5 = accuracy_score(y_test_5, y_pred_5)
precision_5 = precision_score(y_test_5, y_pred_5, average='macro')
recall_5 = recall_score(y_test_5, y_pred_5, average='macro')
f1_5 = f1_score(y_test_5, y_pred_5, average='macro')

print("Accuracy:", accuracy_5)
print("Precision:", precision_5)
print("Recall:", recall_5)
print("F1-Score:", f1_5)

Accuracy: 0.7405
Precision: 0.7411663764965262
Recall: 0.7408554752290101
F1-Score: 0.7410072038909679


In [105]:
param_grid = {
    'C': [0.1, 1, 10],    
    'gamma': [0.01, 0.1, 1], 
    'kernel': ['rbf']       
}

In [108]:
grid_search = GridSearchCV(
    SVC(random_state=42, max_iter=10000),
    param_grid=param_grid,
    cv=3,                             
    scoring='accuracy',                  
    verbose=1                            
)

In [109]:
grid_search.fit(X_train_scaled_selected_5, y_train_5)

Fitting 3 folds for each of 9 candidates, totalling 27 fits




In [110]:
print("Mejores Hiperparámetros:", grid_search.best_params_)

Mejores Hiperparámetros: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}


In [111]:
best_svm_model = grid_search.best_estimator_
y_pred_best = best_svm_model.predict(X_test_scaled_selected_5)

In [112]:
accuracy_best = accuracy_score(y_test_5, y_pred_best)
precision_best = precision_score(y_test_5, y_pred_best, average='macro')
recall_best = recall_score(y_test_5, y_pred_best, average='macro')
f1_best = f1_score(y_test_5, y_pred_best, average='macro')

In [113]:
print("\nResultados del Modelo Hiperparametrizado:")
print("Accuracy:", accuracy_best)
print("Precision:", precision_best)
print("Recall:", recall_best)
print("F1-Score:", f1_best)


Resultados del Modelo Hiperparametrizado:
Accuracy: 0.7515
Precision: 0.7531262140648827
Recall: 0.751838412285495
F1-Score: 0.7524310360702987
