<a href="https://colab.research.google.com/github/Pedropicapapa5/MCD-AA2025/blob/main/Tarea4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Selección de características

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
sns.set(style='whitegrid')
plt.rcParams['figure.figsize'] = (8,5)

In [5]:
df = pd.read_csv('/content/drive/MyDrive/Spotify Features 3.0.csv')
print('Dimensiones:', df.shape)
df.head()

Dimensiones: (1047, 18)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,year_release
0,Alternative,3 Doors Down,Kryptonite,6ZOBP3NvffbU4SZcrnt1k6,75,0.00664,0.545,233933,0.865,1.1e-05,B,0.168,-5.708,Minor,0.0286,99.01,0.543,2000
1,Alternative,Counting Crows,Mr. Jones,5DiXcVovI0FcY2s0icWWUu,75,0.211,0.581,272507,0.817,1e-06,C,0.246,-6.542,Major,0.0382,141.607,0.779,1993
2,Alternative,ChocQuibTown,Pa Olvidarte (feat. Manuel Turizo) - Remix,2ktoFujjJtantyMoVdBJlX,76,0.0441,0.724,256418,0.791,7e-06,G,0.239,-5.411,Major,0.169,176.011,0.856,2018
3,Dance,Ariana Grande,"break up with your girlfriend, i'm bored",4kV4N9D1iKVxx1KLvtTpjS,99,0.0421,0.726,190440,0.554,0.0,F,0.106,-5.29,Minor,0.0917,169.999,0.335,2019
4,Dance,Ariana Grande,7 rings,14msK75pk3pA33pzPVNtBF,100,0.578,0.725,178640,0.321,0.0,C#,0.0884,-10.744,Minor,0.323,70.142,0.319,2019


In [9]:
# Columnas
cols = ['genre','artist_name','track_name','track_id','popularity','acousticness','danceability',
'duration_ms','energy','instrumentalness','key','liveness','loudness','mode',
'speechiness','tempo','valence','year_release']
cols = [c for c in cols if c in df.columns]
df = df[cols].copy()

In [10]:
# Creacion duracion en min
if 'duration_ms' in df.columns:
  df['duration_min'] = df['duration_ms']/60000.0
  df = df.drop(columns=['duration_ms'])

In [12]:
# Columnas no numericas
non_numeric = ['genre','artist_name','track_name','track_id']
for c in non_numeric:
  if c not in df.columns:
    non_numeric.remove(c)

In [13]:
# Variables numéricas
num_vars = df.select_dtypes(include=[np.number]).columns.tolist()
num_vars.remove('popularity')
print('Variables numéricas:', num_vars)

Variables numéricas: ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'year_release']


In [14]:
# One-hot encode genre if present
if 'genre' in df.columns:
    df = pd.get_dummies(df, columns=['genre'], drop_first=True)

In [16]:
# Variables X y Y
X = df.drop(columns=['popularity','artist_name','track_name','track_id'], errors='ignore')
# Columnas object
X = X.select_dtypes(include=[np.number])
y = df['popularity']
print('X shape:', X.shape)

X shape: (1047, 10)


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [23]:
## Método filtro: SelectKBest con ANOVA F-test para regresión
k = 7
selector = SelectKBest(score_func=f_regression, k=k)
selector.fit(X_train, y_train)
scores = selector.scores_
pvalues = selector.pvalues_
selected_mask_kbest = selector.get_support()
features_kbest = X.columns[selected_mask_kbest].tolist()

print('Top features (SelectKBest f_regression):')
for f,s,p in zip(X.columns, scores, pvalues):
  print(f'{f}: F={s:.3f}, p={p:.4g}')

print('Seleccionadas:', features_kbest)

Top features (SelectKBest f_regression):
acousticness: F=5.501, p=0.01925
danceability: F=12.164, p=0.0005143
energy: F=0.436, p=0.5091
instrumentalness: F=1.629, p=0.2022
liveness: F=1.930, p=0.1651
loudness: F=6.913, p=0.008725
speechiness: F=0.043, p=0.835
tempo: F=5.648, p=0.01771
valence: F=0.539, p=0.4632
year_release: F=56.507, p=1.534e-13
Seleccionadas: ['acousticness', 'danceability', 'instrumentalness', 'liveness', 'loudness', 'tempo', 'year_release']


In [26]:
## Método embebido: LassoCV (L1) — selección via coeficientes
lasso = LassoCV(cv=5, random_state=42, n_jobs=-1)
lasso.fit(X_train_scaled, y_train)
coef = lasso.coef_
selected_mask_lasso = coef != 0
features_lasso = X.columns[selected_mask_lasso].tolist()
print('Alpha elegido por LassoCV:', lasso.alpha_)
print('Características seleccionadas por Lasso:', features_lasso)
# Mostrar coeficientes
for f,c in zip(X.columns, coef):
  print(f'{f}: coef={c:.4f}')

Alpha elegido por LassoCV: 0.0015940898167779622
Características seleccionadas por Lasso: ['acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'year_release']
acousticness: coef=0.8538
danceability: coef=0.4396
energy: coef=0.1105
instrumentalness: coef=-0.3152
liveness: coef=0.3485
loudness: coef=0.1737
speechiness: coef=-0.1972
tempo: coef=-0.4079
valence: coef=0.1350
year_release: coef=1.4942


In [27]:
## Basado en árboles: RandomForest + Permutation Importance
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)
print('Importancias (MDI) — Random Forest:')
print(feat_imp)

Importancias (MDI) — Random Forest:
year_release        0.221049
tempo               0.130785
liveness            0.100031
acousticness        0.091983
loudness            0.088422
danceability        0.087175
valence             0.085905
speechiness         0.080530
energy              0.075713
instrumentalness    0.038407
dtype: float64


In [29]:
# Permutation importance sobre el set de test (más seguro)
perm = permutation_importance(rf, X_test, y_test, n_repeats=30, random_state=42, n_jobs=-1)
perm_imp = pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False)
print('Importancias por Permutation Importance:')
print(perm_imp)

Importancias por Permutation Importance:
year_release        0.721936
tempo               0.157514
liveness            0.136389
acousticness        0.091544
danceability        0.082536
valence             0.080780
speechiness         0.080516
energy              0.063640
loudness            0.062509
instrumentalness    0.020756
dtype: float64


In [30]:
## Comparación y determinación de características relevantes
# DataFrame
res = pd.DataFrame({'SelectKBest': [f in features_kbest for f in X.columns],
                    'Lasso': [f in features_lasso for f in X.columns],
                    'RF_MDI': feat_imp.reindex(X.columns).values,
                    'RF_Permutation': perm_imp.reindex(X.columns).values
                    }, index=X.columns)

In [31]:
# RF importance binary with threshold
res['RF_MDI_bin'] = res['RF_MDI'] >= np.percentile(res['RF_MDI'][res['RF_MDI']>0], 75) if any(res['RF_MDI']>0) else False
res['RF_Permutation_bin'] = res['RF_Permutation'] >= np.percentile(res['RF_Permutation'][res['RF_Permutation']>0], 75) if any(res['RF_Permutation']>0) else False

# Conteo de votos (filtrado por métodos)
res['votes'] = res[['SelectKBest','Lasso','RF_MDI_bin','RF_Permutation_bin']].sum(axis=1)
res_sorted = res.sort_values('votes', ascending=False)
res_sorted

# Features seleccionadas por mayoría (>=2 votes)
selected_final = res_sorted[res_sorted['votes']>=2].index.tolist()
print('Características finales seleccionadas (votos>=2):', selected_final)

Características finales seleccionadas (votos>=2): ['tempo', 'liveness', 'year_release', 'acousticness', 'danceability', 'instrumentalness', 'loudness']


Resultados
- Por qué las características seleccionadas (ej. danceability, energy, loudness, valence) son relevantes:
  - Estas variables están directamente relacionadas con la percepción rítmica y energética de una pista, que influyen en su atractivo.
- Por qué otras características quedaron fuera:
  - Variables con baja varianza, alta correlación con otras (multicolinealidad) o sin relación univariada con popularity fueron descartadas por SelectKBest.
  - Lasso penalizó coeficientes para variables que no aportaban predicción incremental.
  - Random Forest y permutation importance penalizaron variables que no causaban pérdida de desempeño cuando se permutaban.

Recomendaciones:
- Revisar correlación entre features seleccionadas para evitar multicolinealidad en modelos lineales.
- Validar la selección mediante validación cruzada y pruebas en datos fuera de muestra.

Referencias
- Scikit-learn: SelectKBest, f_regression, permutation_importance, SelectFromModel.
- Breiman, L. (2001). Random Forests.
- Lasso: Tibshirani (1996) — L1 regularization for feature selection.
- Molnar, C. (Interpretable ML) — Permutation importance discussion.