In [1]:
import pandas as pd
import numpy as np
import math

import os

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, learning_curve, KFold
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LassoCV, RidgeCV, ElasticNetCV
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer, MinMaxScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


In [2]:
# Notebook display settings: never truncate columns, show up to 200 rows.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 200)

In [3]:
# Load the raw track dataset (path is relative to the notebook's directory).
df = pd.read_csv('../data/data_110k_lignes.csv')

# Keep a single row per track. The previous
# `df["track_id"].drop_duplicates(inplace=True)` acted on a temporary Series
# and left `df` untouched; de-duplicating the frame itself and rebinding the
# result is what actually removes the repeated tracks (first occurrence kept).
df = df.drop_duplicates(subset="track_id")

# Preview as the cell's last expression so the notebook actually renders it.
df.head(10)

In [4]:
# Discard every text (object-typed) column except `genre`.
col_drop = list(df.select_dtypes(include='object').columns)
col_drop.remove('genre')
df.drop(columns=col_drop, inplace=True)

# Encode the two flag columns as integers: 0 where the value equals False,
# 1 for anything else.
for flag in ('explicit', 'is_local'):
    df[flag] = (df[flag] != False).astype('int64')

df

Unnamed: 0.1,Unnamed: 0,popularity,duration_ms,explicit,is_local,track_number,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms.1,time_signature
0,0,66,230666,0,0,1,acoustic,0.676,0.461,1,-6.746,0,0.1430,0.0322,0.000001,0.3580,0.7150,87.917,230667,4
1,1,64,265843,0,0,1,acoustic,0.650,0.271,9,-11.081,1,0.0314,0.7940,0.000069,0.1960,0.2100,78.036,265843,4
2,2,53,180493,0,0,1,acoustic,0.758,0.722,1,-6.252,1,0.0511,0.1650,0.000006,0.1060,0.6600,146.031,180493,4
3,3,55,224333,0,0,4,acoustic,0.678,0.277,11,-10.537,1,0.0602,0.8430,0.000004,0.1040,0.2380,76.935,224333,4
4,4,54,215213,0,0,3,acoustic,0.566,0.733,4,-5.302,1,0.0699,0.0830,0.000002,0.3620,0.6360,178.828,215213,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110153,110153,3,308066,0,0,42,world-music,0.453,0.704,1,-4.620,1,0.0351,0.2260,0.000000,0.1180,0.2710,144.004,308067,4
110154,110154,5,230673,0,0,57,world-music,0.431,0.734,1,-4.561,1,0.0383,0.1210,0.000000,0.1040,0.3630,159.974,230673,4
110155,110155,5,339994,0,0,89,world-music,0.362,0.728,0,-8.105,1,0.0460,0.0270,0.000000,0.0681,0.0557,163.942,339994,4
110156,110156,4,198873,0,0,60,world-music,0.415,0.700,4,-6.059,1,0.0803,0.4520,0.000000,0.1560,0.5320,86.048,198873,4


In [5]:
# Show the dtype of every remaining column.
df.dtypes

Unnamed: 0            int64
popularity            int64
duration_ms           int64
explicit              int64
is_local              int64
track_number          int64
genre                object
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms.1         int64
time_signature        int64
dtype: object

In [6]:
# Separate the regression target (popularity) from the feature matrix.
y = df.loc[:, 'popularity']
X = df.drop(columns=['popularity'])

In [7]:
# Shuffled 80/20 hold-out split, stratified on `genre`; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    stratify=X["genre"],
    random_state=42,
)
X_train.head()

Unnamed: 0.1,Unnamed: 0,duration_ms,explicit,is_local,track_number,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms.1,time_signature
2165,2165,197253,0,0,3,grunge,0.471,0.803,4,-3.126,0,0.0384,5.8e-05,7e-06,0.11,0.237,142.92,197253,4
97311,97311,173360,1,0,1,latino,0.827,0.496,2,-11.985,1,0.199,0.501,0.0121,0.11,0.451,124.008,173360,4
39236,39236,192682,0,0,1,turkish,0.76,0.738,4,-6.901,0,0.0467,0.151,2.3e-05,0.352,0.464,94.054,192683,4
490,490,152500,0,0,1,brazil,0.67,0.946,1,-6.361,1,0.074,0.00193,0.463,0.384,0.0792,125.991,152500,4
7916,7916,222171,0,0,1,happy,0.4,0.954,8,-3.259,1,0.0597,0.000145,0.0559,0.311,0.31,174.943,222171,4


In [8]:
# Baseline: DummyRegressor with its default settings, scored on the hold-out
# split as a reference point for the real models.
dummy = DummyRegressor().fit(X_train, y_train)
dummy.score(X_test, y_test)

-1.7716689591829038e-05

In [9]:
# List the column names of the cleaned frame (target included).
df.columns

Index(['Unnamed: 0', 'popularity', 'duration_ms', 'explicit', 'is_local',
       'track_number', 'genre', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms.1', 'time_signature'],
      dtype='object')

In [10]:
# Column groups are derived from the feature matrix X rather than df, so the
# target ('popularity') can never end up in a feature list.
num_columns = [col for col in X.columns if X[col].dtype != "object"]
cat_columns = [col for col in X.columns if X[col].dtype == "object"]

std = StandardScaler()
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# NOTE(review): only the categorical columns are listed here and
# ColumnTransformer's default is remainder="drop", so every numeric column is
# discarded and `num_columns` is currently unused — confirm this is intended.
encoder = ColumnTransformer(
    transformers=[
        ('ohe', ohe, cat_columns)
    ],
    n_jobs=-1,
)

In [11]:
# Candidate linear regressors, each instantiated with its default hyper-parameters.
models = [estimator() for estimator in (Lasso, Ridge, ElasticNet)]

In [12]:

# Fit one pipeline per candidate regressor and score it on the hold-out split.
#   list_models     -> formatted [name, R2 %, RMSE] rows for the summary table
#   dict_models_fit -> model class name -> fitted pipeline
list_models = []
dict_models_fit = {}

for model in models:
    model_name = model.__class__.__name__

    # NOTE: `encoder` and `std` are the same objects in every pipeline
    # (Pipeline does not clone its steps), so they are re-fitted in place on
    # each iteration.
    pipe = Pipeline([
        ('enc', encoder),
        ("std", std),
        ('poly', PolynomialFeatures(2)),
        ('model', model),
    ])

    the_model = pipe.fit(X_train, y_train)
    dict_models_fit[model_name] = the_model

    # Predict once and derive every metric from that single pass; calling
    # pipe.score() as well would transform and predict the test set a second
    # time. For a regressor, score() is the R2 of its predictions.
    prediction = pipe.predict(X_test)
    score = round(r2_score(y_test, prediction), 4)
    score_rmse = math.sqrt(mean_squared_error(y_test, prediction))
    score_mae = mean_absolute_error(y_test, prediction)

    list_models.append([f"{model_name}", f"{round(score *100, 2)}%", f"{round(score_rmse,2)}"])

    print(f"Nom du modèle : {model_name}, R2 score : {round(score *100, 2)}%, RMSE score : {round(score_rmse,2)}, MAE : {round(score_mae,2)}")

Nom du modèle : Lasso, R2 score : 70.32%, RMSE score : 10.03, MAE : 7.73
Nom du modèle : Ridge, R2 score : 70.6%, RMSE score : 9.98, MAE : 7.64
Nom du modèle : ElasticNet, R2 score : 70.51%, RMSE score : 10.0, MAE : 7.68


In [13]:
# Summary table: one row per fitted model.
df_list_models = pd.DataFrame(list_models, columns=['modele', "r2_score", "score_rmse"])

# `r2_score` holds formatted strings such as "70.6%". Sorting them as text is
# lexicographic ("9.5%" would rank above "70.6%"), so order the rows by the
# parsed numeric value instead. The displayed strings are left unchanged.
df_list_models.sort_values(
    'r2_score',
    ascending=False,
    key=lambda col: pd.to_numeric(col.astype(str).str.rstrip('%')),
)

Unnamed: 0,modele,r2_score,score_rmse
1,Ridge,70.6%,9.98
2,ElasticNet,70.51%,10.0
0,Lasso,70.32%,10.03


In [14]:
# Summary statistics of the target, to put the RMSE / MAE values in context.
df['popularity'].describe()

count    110158.00000
mean         30.64068
std          18.44315
min           0.00000
25%          16.00000
50%          31.00000
75%          44.00000
max          92.00000
Name: popularity, dtype: float64

# MLflow

In [15]:
#from run_experiment import run_experiment
import mlflow

In [16]:
# Name of the MLflow experiment that the runs below are logged under.
experiment_name = "Brief"

In [17]:
# Reuse the experiment if it already exists, otherwise create it.
# get_experiment_by_name() returns None for an unknown name; checking that
# explicitly avoids a broad `except AttributeError` that could hide other bugs.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# Evaluation frame = hold-out features plus the target in a "label" column.
# .assign() returns a new frame, so X_test is NOT modified (the previous
# `eval_data = X_test` alias wrote the target into X_test itself).
eval_data = X_test.assign(label=y_test)

# One MLflow run per fitted pipeline: log its parameters, store the model as
# an artifact, then run MLflow's default regressor evaluation on eval_data.
for run_name, fitted_pipeline in dict_models_fit.items():
    with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name=run_name):
        mlflow.log_params(fitted_pipeline.get_params())
        mlflow.sklearn.log_model(fitted_pipeline, experiment_name)
        model_uri = mlflow.get_artifact_uri(experiment_name)

        result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="label",
            model_type="regressor",
            evaluators=["default"],
        )
    

2023/02/22 10:26:00 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/02/22 10:26:03 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/02/22 10:26:07 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
