### CATBOOST Model with CatBoostEncoder

In [118]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import category_encoders as ce
from catboost import CatBoostRegressor    # For regression tasks use CatBoostRegressor, for classification use CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error  # For regression tasks
import pickle
import os
from dotenv import load_dotenv
import pyodbc

In [119]:
# Load the modeling dataset from the CSV file in the working directory.
dataset_path = 'dataset_model_gold.csv'
df = pd.read_csv(dataset_path)

In [120]:
# # Load environment variables
# load_dotenv()
# username = os.getenv('DB_USER')
# password = os.getenv('DB_PASSWORD')
# server = os.getenv('DB_SERVER')
# database = os.getenv('DB_name')
# DB_Driver = os.getenv('DB_Driver')

# # Establish the connection to your database
# connection_string = f'Driver={DB_Driver};Server=tcp:{server},1433;Database={database};Uid={username};Pwd={password};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
# conn = pyodbc.connect(connection_string)

# cursor = conn.cursor()

# query = "SELECT * FROM [dbo].[dataset_model]"

# df = pd.read_sql(query, conn)

# # Close the connection after use
# conn.close()

In [121]:
# Summarize the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7015 entries, 0 to 7014
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   titre                  7015 non-null   object
 1   date                   7015 non-null   object
 2   genre                  7015 non-null   object
 3   duree                  7015 non-null   int64 
 4   realisateur            7015 non-null   object
 5   distributeur           7015 non-null   object
 6   acteurs                7015 non-null   object
 7   nationalites           7015 non-null   object
 8   langue_d_origine       7015 non-null   object
 9   type_film              7015 non-null   object
 10  annee_production       7015 non-null   int64 
 11  nombre_article         7015 non-null   int64 
 12  description            6285 non-null   object
 13  film_id_allocine       7015 non-null   int64 
 14  image                  7015 non-null   object
 15  boxoffice            

In [122]:
# Display every column label of the loaded frame.
df.columns

Index(['titre', 'date', 'genre', 'duree', 'realisateur', 'distributeur',
       'acteurs', 'nationalites', 'langue_d_origine', 'type_film',
       'annee_production', 'nombre_article', 'description', 'film_id_allocine',
       'image', 'boxoffice', 'nombre_acteurs_connus', 'acteurs_connus',
       'realisateur_connu', 'Erotique', 'Comédie musicale', 'Thriller',
       'Comédie', 'Historique', 'Divers', 'Epouvante', 'horreur',
       'Comédie dramatique', 'Animation', 'Bollywood', 'Aventure', 'Western',
       'Famille', 'Romance', 'Guerre', 'Judiciaire', 'Policier', 'Péplum',
       'Expérimental', 'Biopic', 'Musical', 'Sport event', 'Action',
       'Espionnage', 'Arts Martiaux', 'Drame', 'Science fiction',
       'Fantastique'],
      dtype='object')

In [123]:
# Target: the 'boxoffice' column.
y = df['boxoffice']

# Features: every column except the target and the ones excluded below.
excluded_columns = [
    'boxoffice', 'titre', 'date', 'acteurs', 'genre', 'film_id_allocine', 'image',
    'realisateur', 'acteurs_connus', 'description', 'nombre_article',
]
X = df.drop(columns=excluded_columns)

# Shuffled 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [124]:
# X = df.drop(['boxoffice','date','acteurs',
#         'description','nombre_article','Bollywood','titre', 'Sport event', 'Expérimental', 'Musical', 'Péplum', 'Divers','Erotique', 'Science fiction', 'Guerre', 'Judiciaire', 'Arts Martiaux', 'Western'], axis=1)
# y = df['boxoffice']
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)

In [125]:
# Check which columns and dtypes remain in the feature matrix.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7015 entries, 0 to 7014
Data columns (total 37 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   duree                  7015 non-null   int64 
 1   distributeur           7015 non-null   object
 2   nationalites           7015 non-null   object
 3   langue_d_origine       7015 non-null   object
 4   type_film              7015 non-null   object
 5   annee_production       7015 non-null   int64 
 6   nombre_acteurs_connus  7015 non-null   int64 
 7   realisateur_connu      7015 non-null   int64 
 8   Erotique               7015 non-null   int64 
 9   Comédie musicale       7015 non-null   int64 
 10  Thriller               7015 non-null   int64 
 11  Comédie                7015 non-null   int64 
 12  Historique             7015 non-null   int64 
 13  Divers                 7015 non-null   int64 
 14  Epouvante              7015 non-null   int64 
 15  horreur              

In [126]:
# Categorical columns, target-encoded by CatBoostEncoder inside the pipeline.
categorical_features = ['distributeur', 'nationalites', 'langue_d_origine', 'type_film']

# Numeric columns are selected automatically from X by dtype.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

cbe_encoder = ce.cat_boost.CatBoostEncoder()

# Preprocessing: scale the numeric columns, encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', cbe_encoder, categorical_features)
    ])

# CatBoost regressor trained with an RMSE loss.
model = CatBoostRegressor(iterations=200, depth=6, learning_rate=0.05, loss_function='RMSE')

# Full pipeline: preprocessing followed by the regressor, so the saved
# artifact can be applied directly to raw feature frames.
pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
])

# Fit on the training split only.
pipeline.fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
y_pred = pipeline.predict(X_test)
y_pred_train = pipeline.predict(X_train)

print("######## R-squared (R2) : ")
print("TRAIN :", r2_score(y_train, y_pred_train))
print("TEST :", r2_score(y_test, y_pred))

# RMSE on the test split. The square root is taken explicitly instead of
# passing `squared=False`, which was deprecated in scikit-learn 1.4 and removed
# in 1.6; this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse)
y_pred


0:	learn: 2586610.1757456	total: 2.05ms	remaining: 408ms
1:	learn: 2545653.0895540	total: 4.11ms	remaining: 407ms
2:	learn: 2509470.0169095	total: 5.93ms	remaining: 390ms
3:	learn: 2476205.2590030	total: 8.14ms	remaining: 399ms
4:	learn: 2445731.2672962	total: 10.2ms	remaining: 397ms
5:	learn: 2419365.7891386	total: 12.1ms	remaining: 392ms
6:	learn: 2391441.3785849	total: 14.2ms	remaining: 390ms
7:	learn: 2366191.4421480	total: 16.4ms	remaining: 393ms
8:	learn: 2343600.0937076	total: 18.2ms	remaining: 387ms
9:	learn: 2319917.0054840	total: 20.3ms	remaining: 386ms
10:	learn: 2299658.9929215	total: 22.7ms	remaining: 389ms
11:	learn: 2279435.3757823	total: 24.9ms	remaining: 391ms
12:	learn: 2262509.1451081	total: 27.5ms	remaining: 395ms
13:	learn: 2245077.8794738	total: 29.4ms	remaining: 391ms
14:	learn: 2229835.2800154	total: 31.4ms	remaining: 387ms
15:	learn: 2214027.4026939	total: 33.4ms	remaining: 384ms
16:	learn: 2198517.7694230	total: 35.4ms	remaining: 381ms
17:	learn: 2188579.01718

array([5674107.98012263,   33337.11976603,  452059.96219798, ...,
         85927.99636622,  -49157.81555298,  -13417.32664899])

In [130]:
import joblib

# Persist the fitted pipeline next to the notebook.
joblib.dump(value=pipeline, filename='best_model.joblib')

# Write a second copy into the API directory; as the last expression of the
# cell, its return value (the list of written file names) is displayed.
joblib.dump(value=pipeline, filename='../API/best_model.joblib')

['../API/best_model.joblib']

In [128]:
# # To export the CatBoost model to a pickle file
# with open('../Modeling/best_model.pkl', 'wb') as f:
#     pickle.dump(pipeline, f)

In [129]:
# # Load the model from the pickle file
# with open('chemin/vers/votre/modele_catboost.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# # Now you can use loaded_model to make predictions
# y_pred_loaded = loaded_model.predict(X_test)