# import packages

In [34]:
import pandas as pd

from sklearn.compose import make_column_selector, ColumnTransformer, make_column_transformer
# sklearn.compose: The sklearn.compose module is a submodule of the sklearn library for machine learning in Python. It provides functions for creating complex preprocessing and modeling pipelines.
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,RobustScaler
#sklearn.preprocessing: The sklearn.preprocessing module is a submodule of the sklearn library that provides functions for preprocessing data, such as scaling and normalizing features, imputing missing values, and encoding categorical variables.
from sklearn.linear_model import Ridge,LinearRegression,Lasso, ElasticNet
# sklearn.linear_model: The sklearn.linear_model module is a submodule of the sklearn library that provides functions for fitting linear models for regression and classification.
from sklearn.pipeline import make_pipeline
# sklearn.pipeline: The sklearn.pipeline module is a submodule of the sklearn library that provides functions for creating and working with pipelines of transformers and models.
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve, RandomizedSearchCV, cross_val_score, KFold
# sklearn.model_selection: The sklearn.model_selection module is a submodule of the sklearn library that provides functions for splitting data into training and test sets, evaluating models using cross-validation, and hyperparameter tuning.
from sklearn.dummy import DummyRegressor
# sklearn.dummy: The sklearn.dummy module is a submodule of the sklearn library that provides simple dummy models for regression and classification.


from sklearn.impute import SimpleImputer
import numpy as np

from sklearn.ensemble import RandomForestClassifier


# Read data

In [35]:
try:
    df = pd.read_csv('data/data_110k_lignes.csv')
except FileNotFoundError:
    # Only a missing file is reported with this message. Any other failure
    # (malformed CSV, permission problem, keyboard interrupt) now propagates
    # instead of being mislabeled as "no such file".
    # NOTE: `df` is not defined on this path, so the cells below cannot run.
    print("Error: no such file csv")


In [36]:
# Encode the `explicit` flag as an integer: values equal to False become 0,
# everything else becomes 1. The comparison is vectorized over the column
# instead of calling a Python lambda once per row.
df['explicit'] = (df['explicit'] != False).astype('int64')  # noqa: E712
# `is_local` is removed from the frame altogether, so it is not converted
# first (the previous conversion result was discarded immediately).
del df['is_local']

# Remove "Unnamed: 0" column

In [37]:
try:
    df.drop(columns='Unnamed: 0', inplace=True)
except KeyError:
    # pandas raises KeyError when the requested label is absent; only that
    # case is reported, any other error is allowed to surface.
    print("No such column")

# Remove duplicates

In [38]:
# Keep the first occurrence of every fully identical row and discard the rest.
df = df.drop_duplicates(subset=None, keep='first')



# Preparing the data for modeling

In [39]:
# Separate the regression target (`popularity`) from the predictor columns.
y = df['popularity']
X = df.drop(columns=['popularity'])

In [40]:
# Keep every column whose dtype is not `object`, plus the `genre` column
# regardless of its dtype; all other object-typed columns are discarded.
X = X.loc[
    :,
    [name == 'genre' or dtype != 'object' for name, dtype in X.dtypes.items()],
]

In [41]:
def make_pipeline_to_ML(X, y, *, train_size=0.8, random_state=42):
    """Split the data and build the (unfitted) column preprocessor.

    Args:
        X: Feature table; columns are routed by dtype (numeric vs. the rest).
        y: Target values aligned with the rows of ``X``.
        train_size: Share of the rows assigned to the training split.
            Defaults to 0.8, the value previously hard-coded.
        random_state: Seed for the shuffled split. Defaults to 42, the value
            previously hard-coded.

    Returns:
        A tuple ``(preprocessor, X_train, X_test, y_train, y_test)``. The
        preprocessor is returned unfitted so that it can be placed in front
        of an estimator and fitted together with it.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        shuffle=True,
        train_size=train_size,
        random_state=random_state,
    )

    # Columns are selected by dtype at fit time: numeric columns are scaled,
    # every other column is one-hot encoded.
    numerical_features = make_column_selector(dtype_include=np.number)
    categorical_features = make_column_selector(dtype_exclude=np.number)

    # with_mean=False: scale without centering the data.
    numerical_pipeline = make_pipeline(StandardScaler(with_mean=False))
    # handle_unknown='ignore': categories not seen during fit do not raise
    # at transform time.
    categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = make_column_transformer(
        (numerical_pipeline, numerical_features),
        (categorical_pipeline, categorical_features),
    )
    return preprocessor, X_train, X_test, y_train, y_test


# Build the column preprocessor and the train/test split used by the models below.
(
    preprocessor,
    X_train,
    X_test,
    y_train,
    y_test,
) = make_pipeline_to_ML(X, y)

In [71]:
from xgboost import XGBRegressor

# Full model: the shared preprocessor followed by an XGBoost regressor.
xgb_model = make_pipeline(
    preprocessor,
    XGBRegressor(tree_method='hist', random_state=42),
)

# Hyper-parameter grid (3 depths x 2 ensemble sizes x 2 learning rates = 12
# candidates). The `xgbregressor__` prefix addresses the regressor step of
# the pipeline.
params = dict(
    xgbregressor__max_depth=[1, 5, 10],
    xgbregressor__n_estimators=[100, 200],
    xgbregressor__learning_rate=[0.2, 0.3],
)

# Exhaustive search over the grid with 5-fold cross-validation, using all
# available cores and verbose progress output.
grid_seargrid_search_xgboost = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=4,
)

In [72]:
# Run the grid search on the training split only; the test split is not used here.
grid_seargrid_search_xgboost.fit(X=X_train, y=y_train)
# Report the winning parameter combination and its cross-validated score
# (no `scoring` was given, so this is the estimator's default score —
# presumably R^2 for a regressor; confirm against the library docs).
print("Best parameters: ", grid_seargrid_search_xgboost.best_params_)
print("Best score: ", grid_seargrid_search_xgboost.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best parameters:  {'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 200}
Best score:  0.7422000918056552
