# import packages

In [1]:
import pandas as pd

from sklearn.compose import make_column_selector, ColumnTransformer, make_column_transformer
# sklearn.compose: The sklearn.compose module is a submodule of the sklearn library for machine learning in Python. It provides functions for creating complex preprocessing and modeling pipelines.
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,RobustScaler
#sklearn.preprocessing: The sklearn.preprocessing module is a submodule of the sklearn library that provides functions for preprocessing data, such as scaling and normalizing features, imputing missing values, and encoding categorical variables.
from sklearn.linear_model import Ridge,LinearRegression,Lasso, ElasticNet
# sklearn.linear_model: The sklearn.linear_model module is a submodule of the sklearn library that provides functions for fitting linear models for regression and classification.
from sklearn.pipeline import make_pipeline
# sklearn.pipeline: The sklearn.pipeline module is a submodule of the sklearn library that provides functions for creating and working with pipelines of transformers and models.
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve, RandomizedSearchCV, cross_val_score, KFold
# sklearn.model_selection: The sklearn.model_selection module is a submodule of the sklearn library that provides functions for splitting data into training and test sets, evaluating models using cross-validation, and hyperparameter tuning.
from sklearn.dummy import DummyRegressor
# sklearn.dummy: The sklearn.dummy module is a submodule of the sklearn library that provides simple dummy models for regression and classification.


from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.ensemble import RandomForestClassifier


# Read data

In [2]:
# Load the cleaned dataset. Only a missing file is handled here: the message is
# printed and the error re-raised so the notebook stops at the real cause
# instead of failing later with a NameError on `df`. Any other failure
# (parse error, permissions, ...) propagates with its own traceback.
try:
    df = pd.read_csv('data/data_cleans.csv')
except FileNotFoundError:
    print("Error: no such file csv")
    raise


# Préparation à la modélisation

In [3]:
# Separate the regression target ('popularity') from the explanatory variables.
y = df['popularity']
X = df.drop(columns='popularity')

In [4]:
# Keep every column whose dtype is not 'object', plus the 'genre' column
# (retained even when it is stored as object).
kept_columns = [
    name
    for name, dtype in X.dtypes.items()
    if dtype != 'object' or name == 'genre'
]
X = X[kept_columns]

In [5]:
def make_pipeline_to_ML(X, y, train_size=0.8, random_state=42, degree=2):
    """Split the data and build the (unfitted) preprocessing pipeline.

    Parameters
    ----------
    X : feature table passed to ``train_test_split``; its columns are routed
        by dtype (numeric vs. non-numeric) when the preprocessor is fitted.
    y : target values aligned with ``X``.
    train_size : proportion (or count) of samples placed in the training
        partition. Defaults to 0.8.
    random_state : seed used for the shuffled split. Defaults to 42.
    degree : degree of the polynomial expansion applied after the column
        transformer. Defaults to 2.

    Returns
    -------
    tuple
        ``(preprocessor, X_train, X_test, y_train, y_test)``. The
        preprocessor is returned unfitted; nothing is fitted in this function.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, train_size=train_size, random_state=random_state
    )

    # Columns are routed by dtype: numeric ones are scaled, all others encoded.
    numerical_features = make_column_selector(dtype_include=np.number)
    categorical_features = make_column_selector(dtype_exclude=np.number)

    # with_mean=False: scale without centering.
    numerical_pipeline = make_pipeline(StandardScaler(with_mean=False))
    # handle_unknown='ignore': do not fail on categories unseen during fit.
    categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    column_transformer = make_column_transformer(
        (numerical_pipeline, numerical_features),
        (categorical_pipeline, categorical_features),
    )
    # The polynomial expansion runs on the combined scaled + encoded columns.
    preprocessor = make_pipeline(column_transformer, PolynomialFeatures(degree))
    return preprocessor, X_train, X_test, y_train, y_test


# Build the preprocessing pipeline together with the train/test partitions
split_bundle = make_pipeline_to_ML(X, y)
preprocessor, X_train, X_test, y_train, y_test = split_bundle

In [6]:
# Lasso regression chained after the shared preprocessing pipeline
lasso_model = make_pipeline(preprocessor, Lasso(random_state=42))

# Candidate regularisation strengths for the 'lasso' step of the pipeline
params = {'lasso__alpha': [0.1, 1, 10]}

# Grid-search object (5-fold CV, single job, verbose logging); it is only
# constructed here, not fitted
grid_seargrid_search_lasso = GridSearchCV(
    estimator=lasso_model,
    param_grid=params,
    cv=5,
    n_jobs=1,
    verbose=4,
)

In [7]:
# # Fit the grid search to the data
# grid_seargrid_search_lasso.fit(X_train, y_train)
# # Print the best parameters and the best score
# print("Best parameters: ", grid_seargrid_search_lasso.best_params_)
# print("Best score: ", grid_seargrid_search_lasso.best_score_)
# best=grid_seargrid_search_lasso.best_params_

In [8]:
# Hard-coded hyperparameters; presumably the result of the disabled grid
# search -- TODO confirm
best = dict(lasso__alpha=1)

In [9]:
# Apply the hyperparameters stored in `best` to the Lasso pipeline; the
# returned pipeline is displayed as the cell output.
lasso_model.set_params(**best)

In [10]:
# Fit on the training partition, then display the score on that same
# partition (fit returns the pipeline itself, so the calls can be chained)
lasso_model.fit(X_train, y_train).score(X_train, y_train)

0.7112738080684001

In [11]:
# Score of the fitted pipeline on the held-out test partition
lasso_model.score(X=X_test, y=y_test)

0.71539771866008

In [12]:
import mlflow

experiment_name = "Modelisation"

# get_experiment_by_name returns None when the experiment does not exist yet;
# check for that explicitly rather than relying on an AttributeError.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    experiment_id = experiment.experiment_id

# Build the evaluation frame as a NEW DataFrame. Assigning the label column on
# an alias of X_test would write the target into X_test itself and change what
# every other cell using X_test sees.
eval_data = X_test.assign(label=y_test)

# The run is named after the final estimator of the pipeline.
with mlflow.start_run(experiment_id=experiment_id, nested=True, run_name=lasso_model[-1].__class__.__name__):
    # Log the final estimator's hyperparameters and the whole fitted pipeline.
    mlflow.log_params(lasso_model[-1].get_params())
    mlflow.sklearn.log_model(lasso_model, experiment_name)
    model_uri = mlflow.get_artifact_uri(experiment_name)

    # Evaluate the logged model as a regressor on the held-out data, using the
    # "label" column as the target.
    result = mlflow.evaluate(
            model_uri,
            eval_data,
            targets="label",
            model_type="regressor",
            evaluators=["default"],
    )

2023/02/22 16:27:17 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
  from .autonotebook import tqdm as notebook_tqdm
2023/02/22 16:27:19 INFO mlflow.models.evaluation.default_evaluator: Shap explainer Permutation is used.
Permutation explainer: 2001it [17:42,  1.87it/s]                          
Unable to serialize underlying model using MLflow, will use SHAP serialization
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
No data for colormapping provided via 'c'. Parameters 'vmin', 'vmax' will be ignored
