In [24]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error


In [25]:
# Load the survey responses from the CSV file located next to the notebook.
ruta_csv = "mxmh_survey_results.csv"
datos = pd.read_csv(ruta_csv)

# Cell output: the column labels of the loaded frame.
datos.columns

Index(['Timestamp', 'Age', 'Primary streaming service', 'Hours per day',
       'While working', 'Instrumentalist', 'Composer', 'Fav genre',
       'Exploratory', 'Foreign languages', 'BPM', 'Frequency [Classical]',
       'Frequency [Country]', 'Frequency [EDM]', 'Frequency [Folk]',
       'Frequency [Gospel]', 'Frequency [Hip hop]', 'Frequency [Jazz]',
       'Frequency [K pop]', 'Frequency [Latin]', 'Frequency [Lofi]',
       'Frequency [Metal]', 'Frequency [Pop]', 'Frequency [R&B]',
       'Frequency [Rap]', 'Frequency [Rock]', 'Frequency [Video game music]',
       'Anxiety', 'Depression', 'Insomnia', 'OCD', 'Music effects',
       'Permissions'],
      dtype='object')

In [26]:
# Print the column names, non-null counts and dtypes of the frame as loaded.
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     736 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     735 non-null    object 
 3   Hours per day                 736 non-null    float64
 4   While working                 733 non-null    object 
 5   Instrumentalist               732 non-null    object 
 6   Composer                      735 non-null    object 
 7   Fav genre                     736 non-null    object 
 8   Exploratory                   736 non-null    object 
 9   Foreign languages             732 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         736 non-null    object 
 12  Frequency [Country]           736 non-null    object 
 13  Frequ

Hours per day, se queda en los datos sin mayor cambio

Se cambió `While working` de Yes/No a algo binario: 1 y 0

In [27]:
# Encode the Yes/No answers as 1/0; answers that are missing stay NaN.
# The dtype guard keeps the cell safe to re-run: mapping an already-encoded
# (numeric) column again would find no 'Yes'/'No' keys and turn every value
# into NaN.
if not pd.api.types.is_numeric_dtype(datos['While working']):
    datos['While working'] = datos['While working'].map({'Yes': 1, 'No': 0})

Eliminar las filas con NaN en `Age`

In [28]:
# Keep only the rows where the age was recorded.
columnas_obligatorias = ['Age']
datos = datos.dropna(subset=columnas_obligatorias)


'Never' 'Nunca' = 0
'Rarely' 'Rara vez' = 1
'Sometimes' 'A veces' = 2
'Very frequently' 'Muy frecuentemente' = 3


'Rara vez' 'Nunca' 'Muy frecuentemente' 'A veces'

In [29]:
# Columns holding the listening-frequency answer for each genre.
columnas_frecuencias = [
    "Frequency [Classical]", "Frequency [Country]", "Frequency [EDM]", "Frequency [Folk]",
    "Frequency [Gospel]", "Frequency [Hip hop]", "Frequency [Jazz]", "Frequency [K pop]",
    "Frequency [Latin]", "Frequency [Lofi]", "Frequency [Metal]", "Frequency [Pop]",
    "Frequency [R&B]", "Frequency [Rap]", "Frequency [Rock]", "Frequency [Video game music]"
]

# Print the distinct answers found in every frequency column so the label set
# can be inspected. (A bare `.unique()` call on the Classical column used to
# open this cell; it was not the last expression, so its result was never
# displayed, and the loop below already covers that column.)
for columna in columnas_frecuencias:
    print(f"{columna}: {datos[columna].unique()}")

Frequency [Classical]: ['Rarely' 'Sometimes' 'Never' 'Very frequently']
Frequency [Country]: ['Never' 'Sometimes' 'Very frequently' 'Rarely']
Frequency [EDM]: ['Rarely' 'Never' 'Very frequently' 'Sometimes']
Frequency [Folk]: ['Never' 'Rarely' 'Sometimes' 'Very frequently']
Frequency [Gospel]: ['Never' 'Sometimes' 'Rarely' 'Very frequently']
Frequency [Hip hop]: ['Sometimes' 'Rarely' 'Never' 'Very frequently']
Frequency [Jazz]: ['Never' 'Very frequently' 'Rarely' 'Sometimes']
Frequency [K pop]: ['Very frequently' 'Rarely' 'Sometimes' 'Never']
Frequency [Latin]: ['Very frequently' 'Sometimes' 'Never' 'Rarely']
Frequency [Lofi]: ['Rarely' 'Sometimes' 'Very frequently' 'Never']
Frequency [Metal]: ['Never' 'Sometimes' 'Rarely' 'Very frequently']
Frequency [Pop]: ['Very frequently' 'Sometimes' 'Rarely' 'Never']
Frequency [R&B]: ['Sometimes' 'Never' 'Very frequently' 'Rarely']
Frequency [Rap]: ['Very frequently' 'Rarely' 'Never' 'Sometimes']
Frequency [Rock]: ['Never' 'Very frequently' 'Rare

In [30]:

# Ordinal encoding of the frequency answers, from least (0) to most (3) frequent.
mapeo_frecuencia = {
    'Never': 0,
    'Rarely': 1,
    'Sometimes': 2,
    'Very frequently': 3
}

for columna in columnas_frecuencias:
    # Skip columns that are already numeric: applying the label mapping a second
    # time (e.g. when the cell is re-run) would match no key and replace every
    # value with NaN.
    if pd.api.types.is_numeric_dtype(datos[columna]):
        continue
    datos[columna] = datos[columna].map(mapeo_frecuencia)

In [31]:
# Print the distinct values of every frequency column after encoding.
for nombre, serie in datos[columnas_frecuencias].items():
    print(f"{nombre}: {serie.unique()}")

Frequency [Classical]: [1 2 0 3]
Frequency [Country]: [0 2 3 1]
Frequency [EDM]: [1 0 3 2]
Frequency [Folk]: [0 1 2 3]
Frequency [Gospel]: [0 2 1 3]
Frequency [Hip hop]: [2 1 0 3]
Frequency [Jazz]: [0 3 1 2]
Frequency [K pop]: [3 1 2 0]
Frequency [Latin]: [3 2 0 1]
Frequency [Lofi]: [1 2 3 0]
Frequency [Metal]: [0 2 1 3]
Frequency [Pop]: [3 2 1 0]
Frequency [R&B]: [2 0 3 1]
Frequency [Rap]: [3 1 0 2]
Frequency [Rock]: [0 3 1 2]
Frequency [Video game music]: [2 1 3 0]


In [32]:
# Print non-null counts and dtypes again to check the result of the cleaning steps.
datos.info()

<class 'pandas.core.frame.DataFrame'>
Index: 735 entries, 0 to 735
Data columns (total 33 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Timestamp                     735 non-null    object 
 1   Age                           735 non-null    float64
 2   Primary streaming service     734 non-null    object 
 3   Hours per day                 735 non-null    float64
 4   While working                 732 non-null    float64
 5   Instrumentalist               731 non-null    object 
 6   Composer                      734 non-null    object 
 7   Fav genre                     735 non-null    object 
 8   Exploratory                   735 non-null    object 
 9   Foreign languages             731 non-null    object 
 10  BPM                           629 non-null    float64
 11  Frequency [Classical]         735 non-null    int64  
 12  Frequency [Country]           735 non-null    int64  
 13  Frequency 

In [33]:
# Columns that are left out of the modelling frame.
columnas_a_eliminar = [
    'Timestamp',
    'Primary streaming service',
    'While working',
    'Instrumentalist',
    'Composer',
    'Fav genre',
    'Exploratory',
    'Foreign languages',
    'BPM',
    'Music effects',
    'Permissions',
]

# Build the reduced frame (the `datos` frame itself is not modified) and
# print its summary.
datos_musica = datos.drop(columnas_a_eliminar, axis=1)
datos_musica.info()

<class 'pandas.core.frame.DataFrame'>
Index: 735 entries, 0 to 735
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Age                           735 non-null    float64
 1   Hours per day                 735 non-null    float64
 2   Frequency [Classical]         735 non-null    int64  
 3   Frequency [Country]           735 non-null    int64  
 4   Frequency [EDM]               735 non-null    int64  
 5   Frequency [Folk]              735 non-null    int64  
 6   Frequency [Gospel]            735 non-null    int64  
 7   Frequency [Hip hop]           735 non-null    int64  
 8   Frequency [Jazz]              735 non-null    int64  
 9   Frequency [K pop]             735 non-null    int64  
 10  Frequency [Latin]             735 non-null    int64  
 11  Frequency [Lofi]              735 non-null    int64  
 12  Frequency [Metal]             735 non-null    int64  
 13  Frequency 

In [34]:
# Anxiety, Depression, Insomnia and OCD are split off as targets; the
# remaining columns form the feature matrix.
columnas_objetivo = ["Anxiety", "Depression", "Insomnia", "OCD"]
X = datos_musica.drop(columns=columnas_objetivo)
yA, yD, yI, yO = (datos_musica[objetivo] for objetivo in columnas_objetivo)

# Cell output: shapes of the feature matrix and of each target vector.
tuple(parte.shape for parte in (X, yA, yD, yI, yO))


((735, 18), (735,), (735,), (735,), (735,))

In [35]:
# Hold out 30% of the rows for testing; the target used here is yA (Anxiety).
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    yA,
    test_size=0.3,
    random_state=42,
)

Entrenar

In [36]:
# Hyper-parameters of the first forest: 100 trees, depth capped at 5 and at
# least 10 samples per leaf, with a fixed seed.
parametros_rf = dict(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=10,
    random_state=42,
)
rf = RandomForestRegressor(**parametros_rf)

# Fit on the training split; the fitted estimator is the cell output.
rf.fit(X_train, y_train)


Evaluación

In [37]:
# Predict on the training split first, then on the held-out split.
y_pred_train, y_pred_test = (
    rf.predict(particion) for particion in (X_train, X_test)
)

In [38]:

# Report R² on each split; a large gap between the two indicates overfitting.
for etiqueta, reales, predichos in (
    ("entrenamiento R**2:", y_train, y_pred_train),
    ("Test R**2:", y_test, y_pred_test),
):
    print(etiqueta, r2_score(reales, predichos))


entrenamiento R**2: 0.24064381101420385
Test R**2: 0.026615757293565134


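Nuevamente el modelo no predice nada importante fuera de los datos de entrenamiento (R² ≈ 0.24 en entrenamiento frente a ≈ 0.03 en prueba). Al igual que el modelo de KNN, que solo acertaba cuando se tenía a sí mismo como vecino, el Random Forest no se ajusta a estos datos.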

In [41]:
# Scaler and forest chained into a single estimator so the search treats
# them as one model.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42)),
])

# Candidate values for the forest; the `rf__` prefix addresses the 'rf' step
# of the pipeline.
param_grid = dict(
    rf__n_estimators=[50, 100, 200],
    rf__max_depth=[None, 5, 10, 20],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold search scored by R², using all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Mejor R² (CV):", grid_search.best_score_)
print("Mejores parámetros:", grid_search.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Mejor R² (CV): 0.023986114212614918
Mejores parámetros: {'rf__max_depth': 5, 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}


In [42]:
# Refit a plain forest with the hyper-parameters selected by the grid search.
# The values are read from `grid_search.best_params_` instead of being copied
# by hand from the printed output, so this cell cannot drift from the search
# result. The keys carry the pipeline step prefix ("rf__"), which is stripped
# before passing them to the estimator.
mejores_parametros = {
    nombre.split("__", 1)[1]: valor
    for nombre, valor in grid_search.best_params_.items()
}
rf = RandomForestRegressor(random_state=42, **mejores_parametros)

# Train on the training split and predict on both splits.
rf.fit(X_train, y_train)
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

# Same report as for the first forest, so the two runs can be compared.
print("entrenamiento R**2:", r2_score(y_train, y_pred_train))
print("Test R**2:", r2_score(y_test, y_pred_test))


entrenamiento R**2: 0.3332654487783724
Test R**2: 0.0047187250940637915


No se puede ver una mejora significativa