In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the film table and keep only the rows where both the first-week
# box-office figure and the release date are present.
df = pd.read_parquet("allocine_spider_clean.parquet")
rows_to_keep = df['french_first_week_boxoffice'].notna() & df['date'].notna()
df = df[rows_to_keep]
df.head()

Unnamed: 0,actors,critics_score,date,director,editor,genre,langage,length,nationality,viewers_score,french_first_week_boxoffice,french_visa,title,vo_title,url
0,"[George Clooney, Julia Roberts, Jack O'Connell]",3.5,2016-05-12,Jodie Foster,Sony Pictures Releasing France,[Thriller],[Anglais],99.0,[U.S.A.],3.6,305385.0,144018,Money Monster,,/video/player_gen_cmedia=19561526&cfilm=214139...
1,"[Kad Merad, Géraldine Pailhas, Lola Creton]",2.7,2015-01-21,Christophe Lamotte,Rezo Films,"[Drame, Thriller]",[Français],100.0,[France],2.3,29265.0,117439,Disparue en hiver,,/article/fichearticle_gen_carticle=18639324.html
2,"[Nicolas Cage, Kev Adams, Ryan Reynolds]",3.8,2013-04-10,Chris Sanders,Twentieth Century Fox France,"[Aventure, Animation, Comédie, Famille]",[Anglais],98.0,[U.S.A.],3.9,501465.0,135882,Les Croods,The Croods,/video/player_gen_cmedia=19410286&cfilm=146916...
3,"[Dylan O'Brien, Michael Keaton, Taylor Kitsch]",2.3,2017-09-20,Michael Cuesta,Metropolitan FilmExport,"[Action, Thriller]",[Anglais],112.0,[U.S.A.],3.2,104402.0,147308,American Assassin,,/video/player_gen_cmedia=19569927&cfilm=194970...
4,"[Virginie Efira, Anaïs Demoustier, Laurent Sto...",3.5,2015-04-22,Emmanuel Mouret,Pyramide Distribution,"[Comédie, Romance]",[Français],100.0,[France],2.7,64046.0,138835,Caprice,,/diaporamas/cinema/diaporama-18643599/


In [23]:
# Display the dtype of every column of the filtered film table.
df.dtypes

actors                                 object
critics_score                         float64
date                           datetime64[ns]
director                               object
editor                                 object
genre                                  object
langage                                object
length                                float64
nationality                            object
viewers_score                         float64
french_first_week_boxoffice           float64
french_visa                            object
title                                  object
vo_title                               object
url                                    object
dtype: object

In [24]:
# Load the table stored in nationnal_afluence.parquet and display its column dtypes.
# `month` is deliberately left as a regular column here: DataFrame.set_index returns
# a new frame unless its result is assigned, so a bare `set_index('month')` call in
# this cell has no effect on `df_afluence` and was dropped.
df_afluence = pd.read_parquet("nationnal_afluence.parquet")
df_afluence.dtypes

month         datetime64[ns]
box_office           float64
dtype: object

In [25]:
# Preview the first rows of df_afluence.
df_afluence.head()

Unnamed: 0,month,box_office
57,2009-01-01,15.148333
41,2009-02-01,19.348
121,2009-03-01,20.0875
16,2009-04-01,16.975
105,2009-05-01,14.528333


In [26]:
# Keep the first five films aside as a small sample frame.
df_test = df.head(n=5)

In [27]:
# Rebind df_afluence to the table stored in nationnal_afluence_filled.parquet
# (this replaces the frame loaded from nationnal_afluence.parquet earlier).
# NOTE(review): put_national_affluence reads nationnal_afluence.parquet, not this
# "filled" file -- confirm which of the two is meant to feed the feature.
df_afluence = pd.read_parquet("nationnal_afluence_filled.parquet")
df_afluence

Unnamed: 0,month,box_office
0,2009-01-01,15.148333
1,2009-02-01,19.348000
2,2009-03-01,20.087500
3,2009-04-01,16.975000
4,2009-05-01,14.528333
...,...,...
190,2024-11-01,16.405000
191,2024-12-01,19.145000
192,2025-01-01,13.820000
193,2025-02-01,14.495000


In [28]:
# NOTE(review): the lookup below reads "nationnal_afluence.parquet"; a
# "nationnal_afluence_filled.parquet" file also exists -- confirm which one is intended.
_AFFLUENCE_PATH = "nationnal_afluence.parquet"

# path -> Series of box_office values indexed by month; filled lazily so the parquet
# file is read once instead of once per row when the function is used with .apply.
_affluence_cache = {}


def _load_national_affluence(path=_AFFLUENCE_PATH):
    """Return the `box_office` column of the parquet file at `path`, indexed by `month`.

    The result is cached per path for the lifetime of the kernel; clear
    `_affluence_cache` to force a re-read after the file changes on disk.
    """
    if path not in _affluence_cache:
        table = pd.read_parquet(path)
        table['month'] = pd.to_datetime(table['month'])
        _affluence_cache[path] = table.set_index('month')['box_office']
    return _affluence_cache[path]


def put_national_affluence(x, affluence=None):
    """Look up the national `box_office` value for the month containing date `x`.

    Parameters
    ----------
    x : date-like with `.year` and `.month` attributes (e.g. pandas.Timestamp)
        The date to look up. A missing value (NaT / None / NaN) yields None.
    affluence : pandas.Series, optional
        Values indexed by month-start timestamps. When omitted, the series is
        loaded (and cached) from nationnal_afluence.parquet.

    Returns
    -------
    The value stored for the first day of `x`'s month, or None when that month
    is not present in the index.
    """
    if pd.isna(x):
        return None

    series = _load_national_affluence() if affluence is None else affluence
    # The table is keyed by the first day of each month.
    month_start = pd.Timestamp(year=x.year, month=x.month, day=1)

    return series.loc[month_start] if month_start in series.index else None

In [29]:
# Add a `national_affluence` column: for each film, the value returned by
# put_national_affluence for its release date (called once per row).
df['national_affluence']= df['date'].apply(put_national_affluence)

In [30]:
# Show the first row to check that the new national_affluence column is populated.
df.head(1)

Unnamed: 0,actors,critics_score,date,director,editor,genre,langage,length,nationality,viewers_score,french_first_week_boxoffice,french_visa,title,vo_title,url,national_affluence
0,"[George Clooney, Julia Roberts, Jack O'Connell]",3.5,2016-05-12,Jodie Foster,Sony Pictures Releasing France,[Thriller],[Anglais],99.0,[U.S.A.],3.6,305385.0,144018,Money Monster,,/video/player_gen_cmedia=19561526&cfilm=214139...,14.188889


In [31]:
def _fill_missing_list(value):
    """Return ['no value'] for a missing cell, otherwise the cell unchanged."""
    # Only a scalar None/NaN counts as missing; list-like cells are kept as they are.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ['no value']
    return value


# Columns whose cells hold a collection of labels. Missing cells are replaced by a
# real one-element list: `Series.mask(cond, ['no value'])` broadcasts the list like a
# scalar, which leaves the bare string 'no value' in the cell, and a string would
# then be split into single characters by a multi-label encoder.
list_columns_to_fill = ['genre', 'langage', 'nationality', 'actors']
for col in list_columns_to_fill:
    df[col] = df[col].apply(_fill_missing_list)

# Binary flags: does the film's nationality list contain France / the U.S.A.?
df['french_prod'] = df['nationality'].apply(lambda x: 1 if "France" in x else 0)
df['usa_prod'] = df['nationality'].apply(lambda x: 1 if "U.S.A." in x else 0)

# Columns fed to the model.
features_of_interest = [
    'french_prod',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'usa_prod',
    'national_affluence'
]

# Identification columns, not used as features.
info_film = ['french_visa', 'title', 'vo_title', 'url']

# Feature groups, one per preprocessing strategy.
numerical_features = ['length', 'national_affluence']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['genre', 'langage']

target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
# Hold out 10% of the films for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import numpy as np

class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """Apply one MultiLabelBinarizer per column of a DataFrame.

    Each cell of the input frame is expected to be an iterable of labels. Every
    input column is expanded into one indicator column per label seen during
    `fit`, named ``"<column>_<label>"``.
    """

    def __init__(self):
        self.mlbs = {}  # column name -> fitted MultiLabelBinarizer

    def _output_names(self, col):
        """Return the output column names produced for input column `col`."""
        return [f"{col}_{label}" for label in self.mlbs[col].classes_]

    def fit(self, X, y=None):
        """Fit a fresh MultiLabelBinarizer on every column of `X` and return self."""
        # Rebuild the mapping from scratch: refitting on a different set of columns
        # must not leave stale binarizers behind, otherwise get_feature_names_out
        # would report columns that transform no longer produces.
        self.mlbs = {col: MultiLabelBinarizer().fit(X[col]) for col in X.columns}
        return self

    def transform(self, X):
        """Return a DataFrame of indicator columns, sharing the index of `X`."""
        frames = [
            pd.DataFrame(
                self.mlbs[col].transform(X[col]),
                columns=self._output_names(col),
                index=X.index,
            )
            for col in X.columns
        ]
        if not frames:
            # pd.concat raises on an empty list; a frame without columns has
            # nothing to encode.
            return pd.DataFrame(index=X.index)
        return pd.concat(frames, axis=1)

    def get_feature_names_out(self, input_features=None):
        """Return all output column names, in the order the columns were fitted."""
        feature_names = []
        for col in self.mlbs:
            feature_names.extend(self._output_names(col))
        return np.array(feature_names)

# Numerical features: fill gaps with the median, then standardise.
# Median imputation is a blunt choice that should be revisited; whether any values
# are actually missing has not been checked yet.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

class CustomDateTransformer(BaseEstimator, TransformerMixin):
    """Split the `date` column of a DataFrame into year, month, day and day-of-week."""

    def __init__(self):
        self.feature_names_out = ['year', 'month', 'day', 'dayofweek']

    def fit(self, X, y=None):
        """Nothing to learn; return self unchanged."""
        return self

    def transform(self, X):
        """Return a frame with one column per calendar component of X['date']."""
        accessor = X['date'].dt
        components = {}
        for part in ('year', 'month', 'day', 'dayofweek'):
            components[part] = getattr(accessor, part)
        return pd.DataFrame(components)

    def get_feature_names_out(self, input_features=None):
        """Return the names of the produced columns as a NumPy array."""
        names = self.feature_names_out
        return np.array(names)

# Date column: expand into calendar components.
date_transformer = Pipeline([('date_features', CustomDateTransformer())])

# Single-valued categorical columns: still a blunt approach -- most-frequent
# imputation followed by one-hot encoding that ignores unseen categories.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# List-valued categorical columns: one indicator column per label.
list_categorical_transformer = Pipeline([('multi_label', MultiLabelBinarizerTransformer())])

# Build the preprocessor: each feature group goes to its own transformer and
# every column not listed in a group is passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features),
    ],
    remainder='passthrough',
)

In [33]:
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

# Chain the `preprocessor` built in the previous cell with an XGBoost regressor.
# The ColumnTransformer is reused rather than redefined: keeping a second verbatim
# copy here would let the two definitions drift apart.
xgboost_model = Pipeline([
    ('preprocessor', preprocessor),
    ('xgboost', xgb.XGBRegressor())
])

# Fit on the training split, then score the held-out split.
xgboost_model.fit(X_train, y_train)
y_pred = xgboost_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

MAE: 123347.10883381614
RMSE: 500664.9246767122
R²: 0.735939865412149


