<a href="https://colab.research.google.com/github/Pedropicapapa5/MCD-AA2025/blob/main/Tarea6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pronóstico

In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the Spotify features CSV into a DataFrame.
# NOTE(review): the path points inside /content/drive/MyDrive, so this presumably
# requires Google Drive to be mounted before the cell runs — confirm.
dataset_path = '/content/drive/MyDrive/Spotify Features 3.0.csv'
df = pd.read_csv(dataset_path)

# Show the first five rows as a quick check of the parsed columns.
df.head(n=5)

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,year_release
0,Alternative,3 Doors Down,Kryptonite,6ZOBP3NvffbU4SZcrnt1k6,75,0.00664,0.545,233933,0.865,1.1e-05,B,0.168,-5.708,Minor,0.0286,99.01,0.543,2000
1,Alternative,Counting Crows,Mr. Jones,5DiXcVovI0FcY2s0icWWUu,75,0.211,0.581,272507,0.817,1e-06,C,0.246,-6.542,Major,0.0382,141.607,0.779,1993
2,Alternative,ChocQuibTown,Pa Olvidarte (feat. Manuel Turizo) - Remix,2ktoFujjJtantyMoVdBJlX,76,0.0441,0.724,256418,0.791,7e-06,G,0.239,-5.411,Major,0.169,176.011,0.856,2018
3,Dance,Ariana Grande,"break up with your girlfriend, i'm bored",4kV4N9D1iKVxx1KLvtTpjS,99,0.0421,0.726,190440,0.554,0.0,F,0.106,-5.29,Minor,0.0917,169.999,0.335,2019
4,Dance,Ariana Grande,7 rings,14msK75pk3pA33pzPVNtBF,100,0.578,0.725,178640,0.321,0.0,C#,0.0884,-10.744,Minor,0.323,70.142,0.319,2019


In [4]:
def mape(y_true, y_pred, eps=1e-8):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.
    eps : float, default 1e-8
        Lower bound applied to ``|y_true|`` in the denominator so that targets
        equal (or extremely close) to zero do not cause a division by zero.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / max(|y_true|, eps)) * 100``.
    """
    # Coerce both inputs so lists, tuples and pandas objects behave the same
    # and the arithmetic below is always positional and floating point.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Floor the magnitude of the target rather than adding eps to its signed
    # value: `y_true + eps` is exactly zero for y_true == -eps and shrinks the
    # denominator for every negative target.
    denominator = np.maximum(np.abs(y_true), eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

In [8]:
# 1) LassoCV, fitted on the standardized training features (X_train_scaled).
# Columns used as predictors; `popularity` is the regression target.
numerical_features = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
    'year_release',
]
X = df[numerical_features]
y = df['popularity']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Lasso with 5-fold cross-validation, then predictions on the held-out rows.
lasso = LassoCV(cv=5, random_state=42, n_jobs=-1).fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)

In [9]:
# Lasso metrics on the held-out test set.
# Absolute and percentage errors first, then the squared-error family and R2.
mae_l = mean_absolute_error(y_true=y_test, y_pred=y_pred_lasso)
mape_l = mape(y_true=y_test, y_pred=y_pred_lasso)
mse_l = mean_squared_error(y_true=y_test, y_pred=y_pred_lasso)
rmse_l = np.sqrt(mse_l)  # RMSE is the square root of the MSE computed above
r2_l = r2_score(y_true=y_test, y_pred=y_pred_lasso)

In [11]:
# Build the two report lines first (selected alpha, then test metrics), then print them.
lasso_header = "LASSOCV (alpha=%.4f)" % lasso.alpha_
lasso_metrics_line = "MAE: %.4f, RMSE: %.4f, MAPE: %.2f%%, R2: %.4f" % (
    mae_l,
    rmse_l,
    mape_l,
    r2_l,
)
print(lasso_header)
print(lasso_metrics_line)

LASSOCV (alpha=0.0753)
MAE: 4.4992, RMSE: 5.8375, MAPE: 5.32%, R2: 0.0653


In [12]:
# Coefficients: label each fitted Lasso coefficient with its feature name,
# then order them by absolute value, largest first.
lasso_coefs = pd.Series(data=lasso.coef_, index=X.columns)
coef_df = lasso_coefs.sort_values(ascending=False, key=abs)

print("\nCoeficientes LASSO (abs desc):")
print(coef_df.head(n=10))


Coeficientes LASSO (abs desc):
year_release        1.294196
acousticness        0.638613
duration_ms        -0.430406
danceability        0.326354
tempo              -0.285847
liveness            0.278862
loudness            0.224570
instrumentalness   -0.182731
speechiness        -0.036245
energy              0.000000
dtype: float64


In [13]:
# Random forest regressor: 300 trees, fixed seed, all CPU cores.
# It is fitted on the unscaled feature matrix (X_train, not X_train_scaled).
rf = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred_rf = rf.predict(X_test)

In [14]:
# Random forest metrics on the held-out test set.
# Absolute and percentage errors first, then the squared-error family and R2.
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
mape_rf = mape(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # RMSE is the square root of the MSE computed above
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)

In [15]:
# Build the random forest metrics line first, then print the header and the line.
rf_metrics_line = "MAE: %.4f, RMSE: %.4f, MAPE: %.2f%%, R2: %.4f" % (
    mae_rf,
    rmse_rf,
    mape_rf,
    r2_rf,
)
print("\nRandomForest")
print(rf_metrics_line)


RandomForest
MAE: 2.0996, RMSE: 3.3228, MAPE: 2.50%, R2: 0.6972


In [16]:
# Random forest feature importances: label each one with its feature name,
# then sort from most to least important.
rf_importances = pd.Series(data=rf.feature_importances_, index=X.columns)
imp = rf_importances.sort_values(ascending=False)

print("\nTop importancias RF:")
print(imp.head(n=10))


Top importancias RF:
year_release    0.220592
tempo           0.118957
duration_ms     0.084835
liveness        0.084316
danceability    0.082811
valence         0.081368
loudness        0.080891
acousticness    0.076232
energy          0.072300
speechiness     0.064776
dtype: float64


In [18]:
# Results: one row per model with its test-set metrics.
results = pd.DataFrame([
    {'model': 'Lasso', 'MAE': mae_l, 'RMSE': rmse_l, 'MAPE': mape_l, 'R2': r2_l},
    {'model': 'RandomForest', 'MAE': mae_rf, 'RMSE': rmse_rf, 'MAPE': mape_rf, 'R2': r2_rf},
])

# Write the summary table plus the per-feature Lasso coefficients and
# random forest importances to CSV files in the working directory.
results.to_csv('supervised_results_summary.csv', index=False)
coef_df.to_csv('lasso_coefficients.csv')
imp.to_csv('rf_feature_importances.csv')

print("\nArchivos guardados: supervised_results_summary.csv, lasso_coefficients.csv, rf_feature_importances.csv")


Archivos guardados: supervised_results_summary.csv, lasso_coefficients.csv, rf_feature_importances.csv
