In [21]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest

In [22]:
df = pd.read_csv("../data/processed/processed_dummies.csv", index_col=0)

In [23]:
df = df[(df['decade'] >= 1950) & (df['decade'] < 2010)]

In [24]:
df['popularity_normalized'] = (df['popularity'] - df.groupby('decade')['popularity'].transform('mean')) / df.groupby('decade')['popularity'].transform('std')

In [25]:
sample_size = 20000
df_sample = df.sample(n=sample_size, random_state=42)

In [26]:
X = df_sample[['duration_ms', 'acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'mode',
       'key', 'explicit', 'artists_frequency',
       'version_Live', 'version_Original', 'version_Remaster',
       'version_Remix', 'energy_danceability_valence', 'acoustic_intensity',
       'popularity_energy_ratio', 'valence_energy_dif', 'popularity_normalized']]
y = df_sample['decade_label']

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X.shape)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(20000, 23)
(16000, 23)
(4000, 23)
(16000,)
(4000,)


In [28]:
#SVM Pipeline
svm = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("pca", PCA(n_components=10)),
    ("svm", SVC())
])

svm_param = {
    "svm__C": [0.1, 1, 10],
    "svm__kernel": ["linear", "rbf"],
    "svm__gamma": ["scale", "auto"],
}

#KNN Pipeline
knn = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("selectkbest", SelectKBest()),
    ("pca", PCA(n_components=10)),
    ("knn", KNeighborsClassifier())
])

knn_param = {
    "knn__n_neighbors": [3, 5, 10],
    "knn__weights": ["uniform", "distance"],
    "knn__p": [1, 2],
}

#GridSearchCV
gs_svm = GridSearchCV(svm, svm_param, cv=3, scoring="accuracy", verbose=1, n_jobs=-1)
gs_knn = GridSearchCV(knn, knn_param, cv=3, scoring="accuracy", verbose=1, n_jobs=-1)


grids = {
    "gs_svm": gs_svm,
    "gs_knn": gs_knn,
}


In [29]:
results = {}


for name, grid in grids.items():
    print(f"Entrenando {name}...")
    grid.fit(X_train, y_train)
    results[name] = grid
    print(f"Mejores parámetros para {name}: {grid.best_params_}")
    print(f"Mejor score para {name}: {grid.best_score_}")

Entrenando gs_svm...
Fitting 3 folds for each of 12 candidates, totalling 36 fits


Mejores parámetros para gs_svm: {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'rbf'}
Mejor score para gs_svm: 0.6545623996512905
Entrenando gs_knn...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Mejores parámetros para gs_knn: {'knn__n_neighbors': 10, 'knn__p': 1, 'knn__weights': 'distance'}
Mejor score para gs_knn: 0.6308119932411412


In [30]:
svm_best = results["gs_svm"].best_estimator_
y_pred_svm = svm_best.predict(X_test)
print("Resultados para SVM:")
print(f"Accuracy en test: {accuracy_score(y_test, y_pred_svm):.4f}")
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred_svm))

Resultados para SVM:
Accuracy en test: 0.6757
Reporte de clasificación:
              precision    recall  f1-score   support

           1       0.67      0.80      0.73      1344
           2       0.62      0.60      0.61      1307
           3       0.75      0.63      0.68      1349

    accuracy                           0.68      4000
   macro avg       0.68      0.68      0.67      4000
weighted avg       0.68      0.68      0.67      4000



In [31]:
knn_best = results["gs_knn"].best_estimator_
y_pred_knn = knn_best.predict(X_test)
print("Resultados para KNN:")
print(f"Accuracy en test: {accuracy_score(y_test, y_pred_knn):.4f}")
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred_knn))

Resultados para KNN:
Accuracy en test: 0.6500
Reporte de clasificación:
              precision    recall  f1-score   support

           1       0.68      0.76      0.71      1344
           2       0.59      0.56      0.57      1307
           3       0.68      0.63      0.65      1349

    accuracy                           0.65      4000
   macro avg       0.65      0.65      0.65      4000
weighted avg       0.65      0.65      0.65      4000



### Guardar modelos

In [32]:
import pickle

with open('SVMpipeline.pkl', 'wb') as f:
    pickle.dump(svm_best, f)

In [33]:
with open('KNNpipeline.pkl', 'wb') as f:
    pickle.dump(knn_best, f)

In [36]:
import yaml

svm_params = results["gs_svm"].best_params_
knn_params = results["gs_knn"].best_params_

In [38]:
with open("svm_params_pipeline.yaml", "w") as svm_file:
    yaml.dump(svm_params, svm_file)

In [39]:
with open("knn_params_pipeline.yaml", "w") as knn_file:
    yaml.dump(knn_params, knn_file)