### CATBOOST Model with CatBoostEncoder

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import category_encoders as ce
from catboost import CatBoostRegressor    # For regression tasks use CatBoostRegressor, for classification use CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error  # For regression tasks
import pickle
import os
from dotenv import load_dotenv
import pyodbc

In [11]:
# Load the modelling dataset from the CSV file located next to this notebook.
df = pd.read_csv('dataset_model_gold.csv')

In [12]:
# Disabled alternative loader: reads the dataset from a SQL Server table instead of the CSV file.
# Credentials come from environment variables (.env file), never from the notebook itself.
# # Load environment variables
# load_dotenv()
# username = os.getenv('DB_USER')
# password = os.getenv('DB_PASSWORD')
# server = os.getenv('DB_SERVER')
# database = os.getenv('DB_name')
# DB_Driver = os.getenv('DB_Driver')

# # Open the connection to the database
# connection_string = f'Driver={DB_Driver};Server=tcp:{server},1433;Database={database};Uid={username};Pwd={password};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
# conn = pyodbc.connect(connection_string)

# cursor = conn.cursor()

# query = "SELECT * FROM [dbo].[dataset_model]"

# df = pd.read_sql(query, conn)

# # Close the connection once the data has been read
# conn.close()

In [13]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7015 entries, 0 to 7014
Data columns (total 48 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   titre                  7015 non-null   object
 1   date                   7015 non-null   object
 2   genre                  7015 non-null   object
 3   duree                  7015 non-null   int64 
 4   realisateur            7015 non-null   object
 5   distributeur           7015 non-null   object
 6   acteurs                7015 non-null   object
 7   nationalites           7015 non-null   object
 8   langue_d_origine       7015 non-null   object
 9   type_film              7015 non-null   object
 10  annee_production       7015 non-null   int64 
 11  nombre_article         7015 non-null   int64 
 12  description            6285 non-null   object
 13  film_id_allocine       7015 non-null   int64 
 14  image                  7015 non-null   object
 15  boxoffice            

In [15]:
# List every column name available in the dataset.
df.columns

Index(['titre', 'date', 'genre', 'duree', 'realisateur', 'distributeur',
       'acteurs', 'nationalites', 'langue_d_origine', 'type_film',
       'annee_production', 'nombre_article', 'description', 'film_id_allocine',
       'image', 'boxoffice', 'nombre_acteurs_connus', 'acteurs_connus',
       'realisateur_connu', 'Erotique', 'Comédie musicale', 'Thriller',
       'Comédie', 'Historique', 'Divers', 'Epouvante', 'horreur',
       'Comédie dramatique', 'Animation', 'Bollywood', 'Aventure', 'Western',
       'Famille', 'Romance', 'Guerre', 'Judiciaire', 'Policier', 'Péplum',
       'Expérimental', 'Biopic', 'Musical', 'Sport event', 'Action',
       'Espionnage', 'Arts Martiaux', 'Drame', 'Science fiction',
       'Fantastique'],
      dtype='object')

In [16]:
# Target to predict.
y = df['boxoffice']

# Feature matrix: every column except the target and the columns excluded from modelling.
X = df.drop(columns=['boxoffice', 'titre', 'date', 'acteurs', 'description', 'nombre_article'])

# 80 % train / 20 % test, shuffled with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

In [17]:
# Categorical columns, encoded with the target-based CatBoostEncoder.
categorical_features = ['distributeur','realisateur', 'nationalites', 'langue_d_origine','type_film','acteurs_connus']

# Numeric columns are selected automatically from X by dtype.
# NOTE(review): this also picks up `film_id_allocine`, which looks like an identifier
# rather than a predictive feature — confirm whether it should be excluded.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

cbe_encoder = ce.cat_boost.CatBoostEncoder()

# Preprocessing: scale the numeric columns, target-encode the categorical ones.
# Columns of X listed in neither group are discarded (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', cbe_encoder, categorical_features)
    ])

# CatBoost regressor optimised on RMSE.
model = CatBoostRegressor(iterations=200, depth=6, learning_rate=0.05, loss_function='RMSE')

# Full pipeline: preprocessing followed by the model, so it can be fitted and pickled as one object.
pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
])

# Fit on the training split only.
pipeline.fit(X_train, y_train)

# Predict on both splits so that train and test scores can be compared (overfitting check).
y_pred = pipeline.predict(X_test)
y_pred_train = pipeline.predict(X_train)

print("######## R-squared (R2) : ")
print("TRAIN :", r2_score(y_train, y_pred_train))
print("TEST :", r2_score(y_test, y_pred))

# RMSE on the test split. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6; this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("Root Mean Squared Error (RMSE):", rmse)


0:	learn: 2587815.4481257	total: 53ms	remaining: 10.6s
1:	learn: 2552342.3650796	total: 55ms	remaining: 5.45s
2:	learn: 2517546.9083884	total: 57.2ms	remaining: 3.75s
3:	learn: 2485332.2947660	total: 59.1ms	remaining: 2.9s
4:	learn: 2450971.7248216	total: 61.3ms	remaining: 2.39s
5:	learn: 2421712.5821455	total: 63.1ms	remaining: 2.04s
6:	learn: 2400991.8522011	total: 65.3ms	remaining: 1.8s
7:	learn: 2375412.0441729	total: 67.5ms	remaining: 1.62s
8:	learn: 2354377.2351015	total: 70.4ms	remaining: 1.49s
9:	learn: 2333514.2607419	total: 72.8ms	remaining: 1.38s
10:	learn: 2311595.1882966	total: 74.9ms	remaining: 1.29s
11:	learn: 2293397.7771794	total: 77.1ms	remaining: 1.21s
12:	learn: 2278391.7748026	total: 79.3ms	remaining: 1.14s
13:	learn: 2263144.7082477	total: 82ms	remaining: 1.09s
14:	learn: 2247484.8443490	total: 84ms	remaining: 1.04s
15:	learn: 2234691.8017452	total: 86.3ms	remaining: 993ms
16:	learn: 2218025.2469486	total: 88.6ms	remaining: 954ms
17:	learn: 2207080.8459399	total: 

In [18]:
# Serialise the fitted pipeline (preprocessing + CatBoost model) to a pickle file.
with open('../Modeling/best_model.pkl', mode='wb') as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
# Disabled example: reload a pickled model and predict with it.
# Caution: only unpickle files from a trusted source — pickle.load can execute arbitrary code.
# # Load the model from the pickle file
# with open('chemin/vers/votre/modele_catboost.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# # loaded_model can now be used to make predictions
# y_pred_loaded = loaded_model.predict(X_test)