# import packages

In [1]:
import pandas as pd

from sklearn.compose import make_column_selector, ColumnTransformer, make_column_transformer
# sklearn.compose: The sklearn.compose module is a submodule of the sklearn library for machine learning in Python. It provides functions for creating complex preprocessing and modeling pipelines.
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,RobustScaler
#sklearn.preprocessing: The sklearn.preprocessing module is a submodule of the sklearn library that provides functions for preprocessing data, such as scaling and normalizing features, imputing missing values, and encoding categorical variables.
from sklearn.linear_model import Ridge,LinearRegression,Lasso, ElasticNet
# sklearn.linear_model: The sklearn.linear_model module is a submodule of the sklearn library that provides functions for fitting linear models for regression and classification.
from sklearn.pipeline import make_pipeline
# sklearn.pipeline: The sklearn.pipeline module is a submodule of the sklearn library that provides functions for creating and working with pipelines of transformers and models.
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve, RandomizedSearchCV, cross_val_score, KFold
# sklearn.model_selection: The sklearn.model_selection module is a submodule of the sklearn library that provides functions for splitting data into training and test sets, evaluating models using cross-validation, and hyperparameter tuning.
from sklearn.dummy import DummyRegressor
# sklearn.dummy: The sklearn.dummy module is a submodule of the sklearn library that provides simple dummy models for regression and classification.


from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.ensemble import RandomForestClassifier
import functions as fun

# Read data

In [2]:
# Load the raw track data. Only a missing file is reported with the custom
# message; in every case the error is propagated, because all later cells
# need `df` and would otherwise fail with an unrelated NameError.
try:
    df = pd.read_csv('data/data_110k_lignes.csv')
except FileNotFoundError:
    print("Error: no such file csv")
    raise


In [3]:
# Run the two project helpers from `functions` in sequence: first
# `clean_data`, then `groupby_track_id` on its result.
# NOTE(review): what each helper does exactly is defined in `functions`,
# not in this notebook - confirm there.
df = fun.groupby_track_id(fun.clean_data(df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['explicit'] = df['explicit'].apply(lambda x : 0 if x == False else 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df[df['track_number'] > 100].index, axis=0, inplace=True)


# Preparing the data for modelling

In [4]:
# Target to predict, and the feature table made of every other column.
y = df['popularity']
X = df.drop(columns='popularity')

In [5]:
# Keep every column whose dtype is not `object`, plus the 'genre' column
# regardless of its dtype.
kept_columns = [
    name
    for name in X.columns
    if X[name].dtype != 'object' or name == 'genre'
]
X = X[kept_columns]

In [6]:
def make_pipeline_to_ML(X, y, train_size=0.8, random_state=42):
    """Split the data and build the (unfitted) column preprocessor.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns are routed to a transformer by dtype, so a
        DataFrame is expected when the preprocessor is later fitted.
    y : array-like
        Target values aligned with the rows of ``X``.
    train_size : float or int, default 0.8
        Forwarded to ``train_test_split``; the default reproduces the
        previously hard-coded 80/20 split.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the shuffled split is
        reproducible; the default is the previously hard-coded seed.

    Returns
    -------
    tuple
        ``(preprocessor, X_train, X_test, y_train, y_test)``. The
        preprocessor is a ``ColumnTransformer`` that has not been fitted.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, train_size=train_size, random_state=random_state
    )

    # Numeric columns are scaled without centering (with_mean=False);
    # presumably to stay compatible with sparse one-hot output - TODO confirm.
    numerical_features = make_column_selector(dtype_include=np.number)
    numerical_pipeline = make_pipeline(StandardScaler(with_mean=False))

    # Every non-numeric column is one-hot encoded; categories not seen during
    # fit are ignored at transform time instead of raising.
    categorical_features = make_column_selector(dtype_exclude=np.number)
    categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = make_column_transformer(
        (numerical_pipeline, numerical_features),
        (categorical_pipeline, categorical_features),
    )
    return preprocessor, X_train, X_test, y_train, y_test


# Build the unfitted preprocessor and the train/test split in one call.
(
    preprocessor,
    X_train,
    X_test,
    y_train,
    y_test,
) = make_pipeline_to_ML(X, y)

In [7]:
from xgboost import XGBRegressor

# Full model: the column preprocessor followed by a histogram-based
# XGBoost regressor with a fixed seed.
xgb_model = make_pipeline(
    preprocessor,
    XGBRegressor(tree_method='hist', random_state=42),
)

# Hyper-parameter grid (4 x 2 x 4 = 32 combinations). Keys use the
# "xgbregressor__" prefix, i.e. the step name make_pipeline derives from
# the estimator's class name.
params = dict(
    xgbregressor__max_depth=[1, 5, 10, 15],
    xgbregressor__n_estimators=[100, 200],
    xgbregressor__learning_rate=[0.1, 0.2, 0.3, 0.4],
)

# Exhaustive 5-fold search over the grid, using all available cores.
grid_seargrid_search_xgboost = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=4,
)

In [8]:
# Run the search on the training split only.
grid_seargrid_search_xgboost.fit(X_train, y_train)

# Keep the winning parameter set for the following cells, then report it
# together with its mean cross-validated score.
best = grid_seargrid_search_xgboost.best_params_
print("Best parameters: ", best)
print("Best score: ", grid_seargrid_search_xgboost.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 2/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=100;, score=0.342 total time=   2.3s
[CV 1/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=100;, score=0.342 total time=   2.3s
[CV 3/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=100;, score=0.340 total time=   2.4s
[CV 5/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=100;, score=0.337 total time=   2.8s
[CV 4/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=100;, score=0.343 total time=   2.7s
[CV 1/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=200;, score=0.461 total time=   3.7s
[CV 2/5] END xgbregressor__learning_rate=0.1, xgbregressor__max_depth=1, xgbregressor__n_estimators=200;, score=0.46

In [9]:
# Grid-search result kept for reference:
# best = {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 10, 'xgbregressor__n_estimators': 200}
# Best score: 0.7434567181535039

In [10]:
# Apply the tuned hyper-parameters to the pipeline. `best` is assigned in the
# grid-search cell, so that cell must have been run in the current kernel.
xgb_model.set_params(**best)

In [11]:
# Refit the pipeline on the whole training split and display its score on
# the held-out test split (fit returns the estimator itself, so the calls
# can be chained).
xgb_model.fit(X_train, y_train).score(X_test, y_test)

0.7849207385384697

In [12]:
import mlflow

experiment_name = "Modelisation"

# Reuse the experiment when it already exists, otherwise create it.
# get_experiment_by_name returns None for an unknown name, so test for that
# explicitly instead of relying on an AttributeError.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# Evaluation table = test features plus the true target in a "label" column.
# `assign` returns a new frame, so X_test itself is left unchanged (plain
# assignment followed by eval_data["label"] = ... would add the target
# column to X_test as well).
eval_data = X_test.assign(label=y_test)

# The run is named after the final estimator of the pipeline.
with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name=xgb_model[-1].__class__.__name__):
    # Log the regressor's hyper-parameters and the whole fitted pipeline.
    mlflow.log_params(xgb_model[-1].get_params())
    mlflow.sklearn.log_model(xgb_model, experiment_name)
    model_uri = mlflow.get_artifact_uri(experiment_name)

    # Evaluate the logged model as a regressor with MLflow's default evaluator.
    result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="label",
            model_type="regressor",
            evaluators=["default"],
    )

2023/02/23 15:55:23 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
  from .autonotebook import tqdm as notebook_tqdm
2023/02/23 15:55:24 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Permutation is used.
Permutation explainer: 2001it [02:40, 11.83it/s]                          
Unable to serialize underlying model using MLflow, will use SHAP serialization
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
