**Predicción de popularidad de canciones en Spotify**
Maria Lucia Velasquez - Sebastián Mora

In [None]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Load the dataset.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` is its exact replacement: malformed lines are skipped
# and a warning is emitted for each of them.
data = pd.read_csv("SpotifySongP.csv", on_bad_lines='warn', delimiter=',')

# Drop every row with a missing value in any column. `.copy()` makes
# `df_clean` an independent frame so the column assignments below do not
# write into a view of `data`.
df_clean = data.dropna(how='any').copy()

# NOTE(review): row label 27886 was removed by hand in the original analysis;
# the reason is not recorded here -- confirm against the raw CSV.
# `errors='ignore'` keeps the script from raising KeyError when that label
# is not present (e.g. with a different version of the file).
df_clean = df_clean.drop(27886, errors='ignore')

# Force the target to be numeric. Any non-numeric entry (for example the
# stray 'ScreaMER' value) becomes NaN and is dropped, instead of filtering
# one known bad string by name.
df_clean['popularity'] = pd.to_numeric(df_clean['popularity'], errors='coerce')
df_clean = df_clean.dropna(subset=['popularity'])

# Most-frequent imputation of the target. Rows with a missing target were
# dropped above, so this leaves the values unchanged; it is kept so that
# `imputer` stays defined for any later code that refers to it.
# `.ravel()` flattens the (n, 1) result into the 1-D shape a column expects.
imputer = SimpleImputer(strategy='most_frequent')
df_clean['popularity'] = imputer.fit_transform(df_clean[['popularity']]).ravel()

# Split into features (X) and target (y).
X = df_clean[['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']]
y = df_clean['popularity']

# Candidate models as (display name, estimator, hyper-parameter grid).
# An empty grid makes GridSearchCV evaluate the estimator's defaults only.
models = [
    ('Linear Regression', LinearRegression(), {}),
    ('Ridge', Ridge(), {'alpha': [0.1, 1.0, 10.0]}),
    ('Lasso', Lasso(), {'alpha': [0.1, 1.0, 10.0]}),
    ('Random Forest', RandomForestRegressor(), {'n_estimators': [50, 100, 200]})
]

# 5-fold cross-validated grid search for each model. The scorer is the
# negated MSE, so values closer to zero are better.
best_models = []
best_scores = []
best_params = []

for name, model, params in models:
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X, y)
    best_models.append(name)
    best_scores.append(grid_search.best_score_)
    best_params.append(grid_search.best_params_)

# Report the best score and parameters found for each model.
for name, score, params in zip(best_models, best_scores, best_params):
    print(f"Modelo: {name}")
    print(f"Neg-MSE: {score}")
    # The score is a negated MSE, so take its magnitude before the root.
    print(f"RMSE: {abs(score) ** 0.5}")
    print(f"Mejores parámetros: {params}")
    print()





  data = pd.read_csv("SpotifySongP.csv", error_bad_lines=False, delimiter=',')


Modelo: Linear Regression
Neg-MSE: -409.23225691079836
RMSE: 20.229489783748832
Mejores parámetros: {}

Modelo: Ridge
Neg-MSE: -409.2145178262232
RMSE: 20.229051332828814
Mejores parámetros: {'alpha': 10.0}

Modelo: Lasso
Neg-MSE: -410.9271983131269
RMSE: 20.271339332000906
Mejores parámetros: {'alpha': 0.1}

Modelo: Random Forest
Neg-MSE: -383.54391206106504
RMSE: 19.584277164630432
Mejores parámetros: {'n_estimators': 200}

