### CATBOOST Model with CatBoostEncoder

In [155]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import category_encoders as ce
from catboost import CatBoostRegressor    # For regression tasks use CatBoostRegressor, for classification use CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error  # For regression tasks
import pickle
import os
from dotenv import load_dotenv
import pyodbc

In [156]:
# import dataset
df = pd.read_csv('dataset_model_gold.csv')

In [157]:
# # Load environment variables
# load_dotenv()
# username = os.getenv('DB_USER')
# password = os.getenv('DB_PASSWORD')
# server = os.getenv('DB_SERVER')
# database = os.getenv('DB_name')
# DB_Driver = os.getenv('DB_Driver')

# # Établir la connexion à votre base de données
# connection_string = f'Driver={DB_Driver};Server=tcp:{server},1433;Database={database};Uid={username};Pwd={password};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
# conn = pyodbc.connect(connection_string)

# cursor = conn.cursor()

# query = "SELECT * FROM [dbo].[dataset_model]"

# df = pd.read_sql(query, conn)

# # Fermer la connexion après utilisation
# conn.close()

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7015 entries, 0 to 7014
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   titre                  7015 non-null   object
 1   date                   7015 non-null   object
 2   genre                  7015 non-null   object
 3   duree                  7015 non-null   int64 
 4   realisateur            7015 non-null   object
 5   distributeur           7015 non-null   object
 6   acteurs                7015 non-null   object
 7   nationalites           7015 non-null   object
 8   langue_d_origine       7015 non-null   object
 9   type_film              7015 non-null   object
 10  annee_production       7015 non-null   int64 
 11  nombre_article         7015 non-null   int64 
 12  description            6285 non-null   object
 13  film_id_allocine       7015 non-null   int64 
 14  image                  7015 non-null   object
 15  boxoffice            

In [159]:
df.columns

Index(['titre', 'date', 'genre', 'duree', 'realisateur', 'distributeur',
       'acteurs', 'nationalites', 'langue_d_origine', 'type_film',
       'annee_production', 'nombre_article', 'description', 'film_id_allocine',
       'image', 'boxoffice', 'nombre_acteurs_connus', 'acteurs_connus',
       'realisateur_connu'],
      dtype='object')

In [160]:
X = df.drop(['boxoffice','titre','date','acteurs', 'film_id_allocine','image','realisateur','acteurs_connus',
        'description', 'nombre_article'], axis=1)
y = df['boxoffice']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)

In [161]:
# X = df.drop(['boxoffice','date','acteurs',
#         'description','nombre_article','Bollywood','titre', 'Sport event', 'Expérimental', 'Musical', 'Péplum', 'Divers','Erotique', 'Science fiction', 'Guerre', 'Judiciaire', 'Arts Martiaux', 'Western'], axis=1)
# y = df['boxoffice']
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)

In [162]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7015 entries, 0 to 7014
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   genre                  7015 non-null   object
 1   duree                  7015 non-null   int64 
 2   distributeur           7015 non-null   object
 3   nationalites           7015 non-null   object
 4   langue_d_origine       7015 non-null   object
 5   type_film              7015 non-null   object
 6   annee_production       7015 non-null   int64 
 7   nombre_acteurs_connus  7015 non-null   int64 
 8   realisateur_connu      7015 non-null   int64 
dtypes: int64(4), object(5)
memory usage: 493.4+ KB


In [163]:
# Identifier les caractéristiques catégorielles
categorical_features = ['genre','distributeur', 'nationalites', 'langue_d_origine','type_film']

# Sélectionner automatiquement les caractéristiques numériques en filtrant les colonnes du DataFrame X
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

cbe_encoder = ce.cat_boost.CatBoostEncoder()

# Créer un pipeline pour les prétraitements
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', cbe_encoder, categorical_features)
    ])

# Créer le modèle CatBoost
model = CatBoostRegressor(iterations=200, depth=6, learning_rate=0.05, loss_function='RMSE')

# Créer le pipeline
pipeline = Pipeline([
        ('preprocessor', preprocessor), 
        ('model', model)
])

# Entraîner le modèle sur les données d'entraînement
pipeline.fit(X_train, y_train)

# Faire des prédictions sur les données de test
y_pred = pipeline.predict(X_test)

y_pred_train = pipeline.predict(X_train)

# print(pipeline.get_best_score())

print("######## R-squared (R2) : ")
print("TRAIN :", r2_score(y_train, y_pred_train))
print("TEST :", r2_score(y_test, y_pred))

# Évaluer les performances du modèle (par exemple, RMSE pour la régression)
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error (RMSE):", rmse)
y_pred


0:	learn: 2587388.2412188	total: 1.87ms	remaining: 372ms
1:	learn: 2555835.3703623	total: 3.55ms	remaining: 351ms
2:	learn: 2525060.4656005	total: 5.05ms	remaining: 332ms
3:	learn: 2496474.9142517	total: 6.26ms	remaining: 307ms
4:	learn: 2465634.2163603	total: 7.95ms	remaining: 310ms
5:	learn: 2441976.8283534	total: 9.33ms	remaining: 302ms
6:	learn: 2417895.8019514	total: 10.8ms	remaining: 298ms
7:	learn: 2393973.7667222	total: 12.3ms	remaining: 295ms
8:	learn: 2370079.1173529	total: 13.8ms	remaining: 293ms
9:	learn: 2353186.8733068	total: 15.5ms	remaining: 294ms
10:	learn: 2336861.4017286	total: 17.2ms	remaining: 296ms
11:	learn: 2320778.1719538	total: 18.9ms	remaining: 296ms
12:	learn: 2303861.2636827	total: 20.7ms	remaining: 298ms
13:	learn: 2287894.8650615	total: 22.9ms	remaining: 304ms
14:	learn: 2271196.0366354	total: 24.7ms	remaining: 304ms
15:	learn: 2257978.8307658	total: 26.4ms	remaining: 304ms
16:	learn: 2245753.6591597	total: 27.9ms	remaining: 301ms
17:	learn: 2234178.07123

171:	learn: 1792803.2795040	total: 304ms	remaining: 49.5ms
172:	learn: 1791678.2092966	total: 306ms	remaining: 47.8ms
173:	learn: 1790577.5545648	total: 308ms	remaining: 46ms
174:	learn: 1789449.5892491	total: 309ms	remaining: 44.2ms
175:	learn: 1788241.0398822	total: 311ms	remaining: 42.4ms
176:	learn: 1785911.4967534	total: 313ms	remaining: 40.6ms
177:	learn: 1785036.7740348	total: 315ms	remaining: 38.9ms
178:	learn: 1783623.8139910	total: 316ms	remaining: 37.1ms
179:	learn: 1782145.0958295	total: 318ms	remaining: 35.3ms
180:	learn: 1781067.2142716	total: 319ms	remaining: 33.5ms
181:	learn: 1780248.3812871	total: 321ms	remaining: 31.7ms
182:	learn: 1779104.4965336	total: 322ms	remaining: 29.9ms
183:	learn: 1777768.8638060	total: 324ms	remaining: 28.1ms
184:	learn: 1776059.7583917	total: 325ms	remaining: 26.4ms
185:	learn: 1773462.8410616	total: 327ms	remaining: 24.6ms
186:	learn: 1771963.7156901	total: 329ms	remaining: 22.9ms
187:	learn: 1770763.1347837	total: 330ms	remaining: 21.1ms

array([ 8.01831891e+06,  3.63551936e+04,  7.64610787e+05, ...,
        7.99369892e+04, -3.67814183e+04,  5.90020933e+03])

In [164]:
import joblib

joblib.dump(pipeline, 'best_model.joblib')
joblib.dump(pipeline, '../API/best_model.joblib')

['../API/best_model.joblib']

In [165]:
# # Pour exporter le modèle CatBoost vers un fichier pickle
# with open('../Modeling/best_model.pkl', 'wb') as f:
#     pickle.dump(pipeline, f)

In [166]:
# # Charger le modèle à partir du fichier pickle
# with open('chemin/vers/votre/modele_catboost.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# # Maintenant, vous pouvez utiliser loaded_model pour faire des prédictions
# y_pred_loaded = loaded_model.predict(X_test)