In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV, StratifiedKFold,cross_validate,learning_curve
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, PolynomialFeatures, RobustScaler
from sklearn.compose import make_column_transformer
from sklearn.metrics import mean_absolute_error,r2_score
from statsmodels.formula.api import ols
from statsmodels.api import OLS
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif
import pickle


# evaluate an xgboost regression model on the housing dataset
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

In [2]:
df=pd.read_csv('tracks.csv')
df=df.drop(['artists','id_artists','id','name'], axis=1)
df['release_date'] = pd.to_datetime(df['release_date']).dt.strftime('%Y').astype(int)
df

Unnamed: 0,popularity,duration_ms,explicit,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,6,126903,0,1922,0.645,0.4450,0,-13.338,1,0.4510,0.674,0.744000,0.1510,0.1270,104.851,3
1,0,98200,0,1922,0.695,0.2630,0,-22.136,1,0.9570,0.797,0.000000,0.1480,0.6550,102.009,1
2,0,181640,0,1922,0.434,0.1770,1,-21.180,1,0.0512,0.994,0.021800,0.2120,0.4570,130.418,5
3,0,176907,0,1922,0.321,0.0946,7,-27.961,1,0.0504,0.995,0.918000,0.1040,0.3970,169.980,3
4,0,163080,0,1922,0.402,0.1580,3,-16.900,0,0.0390,0.989,0.130000,0.3110,0.1960,103.220,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,50,258267,0,2020,0.560,0.5180,0,-7.471,0,0.0292,0.785,0.000000,0.0648,0.2110,131.896,4
586668,72,153293,0,2020,0.765,0.6630,0,-5.223,1,0.0652,0.141,0.000297,0.0924,0.6860,150.091,4
586669,70,187601,0,2020,0.535,0.3140,7,-12.823,0,0.0408,0.895,0.000150,0.0874,0.0663,145.095,4
586670,58,142003,0,2021,0.696,0.6150,10,-6.212,1,0.0345,0.206,0.000003,0.3050,0.4380,90.029,4


In [3]:
X = df.drop(['popularity','explicit','release_date'], axis=1)
y = df['popularity']
X_train, X_test, y_train, y_test = train_test_split(X, y,shuffle=True, train_size=0.80, random_state=42)

In [4]:
lr_model = make_pipeline(StandardScaler(),PolynomialFeatures(2),LinearRegression())
lr_model.fit(X_train,y_train)

In [5]:
y_pred=lr_model.predict(X_train)

In [6]:
mae_score=mean_absolute_error(y_train, y_pred)
mae_score

12.517505517208978

In [7]:
r2_score=r2_score(y_train, y_pred)
r2_score

0.28538241297825695

In [8]:
model = XGBRegressor()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 11.701 (0.036)


In [9]:
model.fit(X_train,y_train)

In [10]:
import joblib
joblib.dump(model,'xgb.pkl')

['xgb.pkl']

In [11]:
y_pred_xgb=model.predict(X_train)

In [13]:
r2_score=r2_score(y_train, y_pred_xgb)

TypeError: 'numpy.float64' object is not callable

In [14]:
N, train_score, val_score = learning_curve(lr_model, X_train, y_train,cv=5)


In [None]:
print(N)
plt.plot(N, train_score.mean(axis=1), label='train')
plt.plot(N, pd.DataFrame(val_score).mean(axis=1), label='validation')
plt.xlabel('train_sizes')
plt.legend()

[ 37546 122027 206507 290988 375469]


<matplotlib.legend.Legend at 0x7fb0a9100580>

In [None]:
import mlflow
import mlflow.sklearn

experiment_id = mlflow.get_experiment_by_name("vinyle_project").experiment_id


In [None]:
experiment_id = mlflow.create_experiment("vinyle_project")

In [None]:
eval_data = X_test
eval_data["label"] = y_test

with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name='regression lineaire default params'):
    mlflow.log_params(lr_model.get_params())
    mlflow.sklearn.log_model(lr_model, "model")
    model_uri = mlflow.get_artifact_uri("model")

    result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="label",
            model_type="regressor",
            evaluators=["default"],
    )

2023/02/21 15:26:21 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/02/21 15:26:22 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Permutation is used.
Permutation explainer: 2001it [02:53, 10.86it/s]                          
Unable to serialize underlying model using MLflow, will use SHAP serialization
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored


In [None]:
with mlflow.start_run(experiment_id=experiment_id,nested=True, run_name='xgbregressor default params'):
    mlflow.log_params(model.get_params())
    mlflow.sklearn.log_model(model, "model")
    model_uri = mlflow.get_artifact_uri("model")

    result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="label",
            model_type="regressor",
            evaluators=["default"],
    )

2023/02/21 15:07:32 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2023/02/21 15:07:33 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Tree is used.
Unable to serialize underlying model using MLflow, will use SHAP serialization
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
