### CatBoost model with CatBoostEncoder

In [198]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import category_encoders as ce
from catboost import CatBoostRegressor    # For regression tasks use CatBoostRegressor, for classification use CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error  # For regression tasks
import pickle

In [199]:
# Load the curated ("gold") dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='dataset_gold.csv')

In [200]:
# Features: every column except the target and the columns excluded from modelling.
X = df.drop(
    columns=[
        'boxoffice', 'titre', 'date', 'acteurs', 'awards', 'box_office_total',
        'description', 'nominations', 'note_presse', 'note_spectateurs',
        'nombre_article', 'genres',
    ]
)
# Target: the 'boxoffice' column.
y = df['boxoffice']

# Shuffled 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    shuffle=True,
)

In [201]:
# Columns handed to the CatBoost target encoder.
categorical_features = ['distributeur','réalisateur', 'nationalités', 'langue_d_origine','type_film','acteurs_connus']

# Numerical columns are picked automatically from X by dtype.
# NOTE(review): a column listed in categorical_features that has an int64/float64
# dtype would also be selected here and therefore be fed to both transformers —
# confirm the dtypes of the categorical columns.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

cbe_encoder = ce.cat_boost.CatBoostEncoder()

# Preprocessing: standardise numerical columns, target-encode categorical ones.
# Columns belonging to neither list are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', cbe_encoder, categorical_features)
    ])

# CatBoost regressor optimising RMSE.
model = CatBoostRegressor(iterations=200, depth=6, learning_rate=0.05, loss_function='RMSE')

# Chain preprocessing and model so the encoder and scaler are fitted on the
# training data only.
pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
])

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
y_pred = pipeline.predict(X_test)
y_pred_train = pipeline.predict(X_train)

print("######## R-squared (R2) : ")
print("TRAIN :", r2_score(y_train, y_pred_train))
print("TEST :", r2_score(y_test, y_pred))

# Test-set RMSE. The `squared=False` argument of mean_squared_error was deprecated
# in scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse)


0:	learn: 2941102.4279647	total: 3.23ms	remaining: 644ms
1:	learn: 2906233.0343167	total: 6ms	remaining: 594ms
2:	learn: 2874006.4836710	total: 8.72ms	remaining: 573ms
3:	learn: 2842830.0963544	total: 11.8ms	remaining: 578ms
4:	learn: 2814856.9937035	total: 14.3ms	remaining: 557ms
5:	learn: 2790127.1544625	total: 16.9ms	remaining: 545ms
6:	learn: 2763254.8708581	total: 19.8ms	remaining: 547ms
7:	learn: 2737401.4184366	total: 22.1ms	remaining: 530ms
8:	learn: 2718085.0547510	total: 24.5ms	remaining: 519ms
9:	learn: 2704303.1349188	total: 26.9ms	remaining: 511ms
10:	learn: 2685210.0248839	total: 29.1ms	remaining: 500ms
11:	learn: 2668670.8697206	total: 31.4ms	remaining: 492ms
12:	learn: 2652322.5771713	total: 33.7ms	remaining: 485ms
13:	learn: 2637148.5789458	total: 36.2ms	remaining: 481ms
14:	learn: 2621919.0246408	total: 38.3ms	remaining: 473ms
15:	learn: 2606207.0988165	total: 40.7ms	remaining: 468ms
16:	learn: 2594306.1305221	total: 43.2ms	remaining: 465ms
17:	learn: 2582176.1952627	

In [202]:
# Serialise the fitted pipeline (preprocessing + CatBoost model) to a pickle file.
with open('../Modeling/best_model.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
# # Load the model from the pickle file
# # (only unpickle files you trust: pickle.load can execute arbitrary code)
# with open('chemin/vers/votre/modele_catboost.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# # The loaded model can now be used to make predictions
# y_pred_loaded = loaded_model.predict(X_test)