In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

#Regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import QuantileRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import BayesianRidge
from sklearn.cross_decomposition import PLSRegression
from lifelines import CoxPHFitter
from sklearn.neighbors import KNeighborsRegressor
from pygam import LinearGAM, s


import sys,os
current_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(current_dir, "..")))
from utils.utils import clean_data

# Regression

## Load Data

In [2]:
df=pd.read_csv("../data/mxmh_survey_results.csv")
df=clean_data(df)

__________________
Drop col : BPM
__________________
Drop Na
Total rows : 736
Total rows after drop na : 718
__________________
Replace str by float in each columns


## Prepare models

In [3]:
LR=LinearRegression()
ridge = Ridge(alpha=1.0)
lasso=Lasso(alpha=0.1)
elas_net= ElasticNet(alpha=0.1, l1_ratio=0.5)
PR=PoissonRegressor(alpha=1.0)
QR=QuantileRegressor(alpha=0.1, quantile=0.5)
SVR=SVR(kernel='rbf', C=1.0, epsilon=0.1)
RFR=RandomForestRegressor(n_estimators=100)
xgb=xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
lgb=lgb.LGBMRegressor(n_estimators=100)
BR = BayesianRidge()
PLS= PLSRegression(n_components=2)
gam = LinearGAM(s(0) + s(1))
KNR=KNeighborsRegressor(n_neighbors=5)

models={'linear_regression':LR,'ridge':ridge,'lasso':lasso,'elastic_net':elas_net,'poisson_regressor':PR,'quantile_regressor':QR,'svr':SVR,
        'random_forest_regressor':RFR,'xgb':xgb,'lgb':lgb,'bayesian_ridge':BR,'PLS_regression':PLS,'linear_gam':gam,'KNeighborsRegressor':KNR}

## Regression for Anxiety

In [4]:
df_anxiety = df.drop(columns=['Timestamp', 'Depression', 'Insomnia','OCD','Permissions'])

In [5]:
df_anxiety

Unnamed: 0,Age,Primary streaming service,Hours per day,While working,Instrumentalist,Composer,Fav genre,Exploratory,Foreign languages,Frequency [Classical],...,Frequency [Latin],Frequency [Lofi],Frequency [Metal],Frequency [Pop],Frequency [R&B],Frequency [Rap],Frequency [Rock],Frequency [Video game music],Anxiety,Music effects
2,18.0,2,4.0,0,0,0,2,0,1,0,...,0,2,2,1,0,1,1,3,7.0,0
3,61.0,4,2.5,1,0,1,3,1,1,2,...,3,2,0,2,2,0,0,0,9.0,1
4,18.0,2,4.0,1,0,0,4,1,0,0,...,2,2,0,2,3,3,0,1,7.0,1
5,18.0,2,5.0,1,1,1,3,1,1,1,...,1,3,1,3,3,3,3,0,8.0,1
6,18.0,4,3.0,1,1,0,2,1,1,2,...,1,1,1,1,1,0,0,2,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,17.0,2,2.0,1,1,0,1,1,1,3,...,2,1,1,3,0,1,3,0,7.0,1
732,18.0,2,1.0,1,1,0,9,1,1,1,...,0,1,0,3,0,0,2,2,3.0,1
733,19.0,5,6.0,1,0,1,10,1,0,1,...,1,2,1,2,2,2,1,1,2.0,1
734,19.0,2,5.0,1,1,0,11,0,0,3,...,0,0,0,0,0,0,0,2,2.0,1


In [6]:



# Split Dataframe into (x,Y)
X = df_anxiety.drop(columns=['Anxiety'])
Y = df_anxiety['Anxiety']

# Afficher X et Y
print("X (caractéristiques) :")
print(X)
print("\nY (cible) :")
print(Y)

# Diviser en ensembles d'entraînement et de test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Optionnel : Standardiser les caractéristiques
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nX_train standardisé :")
print(X_train_scaled)


X (caractéristiques) :
      Age  Primary streaming service  Hours per day  While working  \
2    18.0                          2            4.0              0   
3    61.0                          4            2.5              1   
4    18.0                          2            4.0              1   
5    18.0                          2            5.0              1   
6    18.0                          4            3.0              1   
..    ...                        ...            ...            ...   
731  17.0                          2            2.0              1   
732  18.0                          2            1.0              1   
733  19.0                          5            6.0              1   
734  19.0                          2            5.0              1   
735  29.0                          4            2.0              1   

     Instrumentalist  Composer  Fav genre  Exploratory  Foreign languages  \
2                  0         0          2            0     

In [14]:
dict_result={}
for name,model in models.items():
    dict_result[name]={"MSE":None,"R2":None}
    # Entraîner le modèle
    model.fit(X_train_scaled, Y_train)

    # Faire des prédictions sur les données de test
    Y_pred = model.predict(X_test_scaled)

    # Évaluer les performances du modèle
    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    dict_result[name]["MSE"]=mse
    dict_result[name]["R2"]=r2

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 186
[LightGBM] [Info] Number of data points in the train set: 574, number of used features: 26
[LightGBM] [Info] Start training from score 5.825784


In [15]:
dict_result

{'linear_regression': {'MSE': 8.83664201884823, 'R2': -0.029212441337410855},
 'ridge': {'MSE': 8.832816835327414, 'R2': -0.028766918427053145},
 'lasso': {'MSE': 8.219090846216083, 'R2': 0.04271435491390507},
 'elastic_net': {'MSE': 8.387114137730713, 'R2': 0.02314451586277433},
 'poisson_regressor': {'MSE': 8.59301793872048, 'R2': -0.0008373036163122904},
 'quantile_regressor': {'MSE': 8.598958333333334, 'R2': -0.001529187255930431},
 'svr': {'MSE': 8.857705521662501, 'R2': -0.03166572835620718},
 'random_forest_regressor': {'MSE': 8.380663888888888,
  'R2': 0.02389578272903059},
 'xgb': {'MSE': 9.438170179432275, 'R2': -0.09927302151791229},
 'lgb': {'MSE': 9.23834542789305, 'R2': -0.07599923494461236},
 'bayesian_ridge': {'MSE': 8.412651165829374, 'R2': 0.02017019292676958},
 'PLS_regression': {'MSE': 8.629997791392055, 'R2': -0.005144383655000029},
 'linear_gam': {'MSE': 8.397698605401263, 'R2': 0.02191173243800426},
 'KNeighborsRegressor': {'MSE': 8.736458333333335,
  'R2': -0.01