In [13]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import pandas as pd

df = pd.read_parquet("allocine_spider_clean.parquet")
df = df[df['french_first_week_boxoffice'].notna()]
df.head()

Unnamed: 0,actors,critics_score,date,director,editor,genre,langage,length,nationality,viewers_score,french_first_week_boxoffice,french_visa,title,vo_title,url
1,"[Margot Nuccetelli, Dave Willetts, Rick Kavanian]",2.4,2022-07-20,Adam Gunn,Alba Films,"[Animation, Famille, Fantastique]",[Anglais],82.0,"[Allemagne, Australie, Belgique, Inde]",2.5,42006.0,157299,"Mia et moi, L’Héroïne de Centopia",Mia And Me - The Movie,/article/fichearticle_gen_carticle=18711883.html
2,"[Éric Nantchouang, Salif Cissé, Édouard Sulpice]",3.9,2021-07-21,Guillaume Brac,Arte,[Comédie],[Français],95.0,[France],3.7,5612.0,149445,À l’abordage,,/article/fichearticle_gen_carticle=18706984.html
3,"[Anthony Hopkins, Olivia Colman, Mark Gatiss]",3.9,2021-05-26,Florian Zeller,Orange Studio Distribution / UGC Distribution,[Drame],[Anglais],98.0,"[Grande-Bretagne, France]",4.3,153927.0,151195,The Father,,/article/fichearticle_gen_carticle=1000021987....
4,"[Josiane Balasko, Didier Bourdon, Marilou Berry]",2.8,2021-12-15,Alexandra Leclère,UGC Distribution,[Comédie],[Français],95.0,[France],2.6,198247.0,-,Mes très chers enfants,,/article/fichearticle_gen_carticle=18708877.html
5,"[Virginie Efira, Albert Dupontel, Nicolas Marié]",3.7,2020-10-21,Albert Dupontel,Gaumont Distribution,[Comédie],[Français],87.0,[France],3.9,600444.0,150859,Adieu Les Cons,,/video/player_gen_cmedia=19589300&cfilm=274345...


In [14]:
features_of_interest = [
    'actors',
    'critics_score',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'nationality',
    'viewers_score'
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['critics_score', 'length', 'viewers_score']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['actors', 'genre', 'langage', 'nationality']


list_categorical_features = ['actors', 'genre', 'langage', 'nationality']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])

target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
import numpy as np

class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlbs = {}  # Stocke un MultiLabelBinarizer pour chaque colonne
    
    def fit(self, X, y=None):
        for col in X.columns:
            self.mlbs[col] = MultiLabelBinarizer()
            self.mlbs[col].fit(X[col])
        return self
    
    def transform(self, X):
        transformed_list = []
        for col in X.columns:
            transformed = self.mlbs[col].transform(X[col])
            new_columns = [f"{col}_{label}" for label in self.mlbs[col].classes_]
            transformed_list.append(pd.DataFrame(transformed, columns=new_columns, index=X.index))
        
        return pd.concat(transformed_list, axis=1)
    
    def get_feature_names_out(self, input_features=None):
        # Collecter tous les noms de colonnes de sortie
        feature_names = []
        for col in self.mlbs.keys():
            feature_names.extend([f"{col}_{label}" for label in self.mlbs[col].classes_])
        return np.array(feature_names)

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Bourin... à changer mais j'ai même pas vérifier s'il manquait des données...
    ('scaler', StandardScaler())
])

class CustomDateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_names_out = ['year', 'month', 'day', 'dayofweek']
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        result = pd.DataFrame({
            'year': X['date'].dt.year,
            'month': X['date'].dt.month,
            'day': X['date'].dt.day,
            'dayofweek': X['date'].dt.dayofweek
        })
        return result
        
    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_out)

date_transformer = Pipeline(steps=[
    ('date_features', CustomDateTransformer())
])

# Toujours bourin :p
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

list_categorical_transformer = Pipeline(steps=[
    ('multi_label', MultiLabelBinarizerTransformer())
])

# Création du préprocesseur (sans les colonnes déjà transformées)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

In [16]:
class ARIMAFeatureGenerator:
    def __init__(self, order=(1,1,1), seasonal_order=None, date_col='date', target_col='french_first_week_boxoffice'):
        self.order = order
        self.seasonal_order = seasonal_order
        self.date_col = date_col
        self.target_col = target_col
        self.arima_model = None
        self.trend_series = None
        
    def fit(self, X, y):
        # Combinons X et y pour la chronologie
        data = X.copy()
        data[self.target_col] = y
        
        # Triez par date
        data = data.sort_values(by=self.date_col)
        
        # Test de stationnarité
        adf_result = adfuller(data[self.target_col].fillna(0))
        print(f"ADF Statistic: {adf_result[0]}")
        print(f"p-value: {adf_result[1]}")
        
        # Création d'une série temporelle hebdomadaire
        weekly_data = data.set_index(self.date_col).resample('W')[self.target_col].mean()
        weekly_data = weekly_data.fillna(weekly_data.mean())
        
        # Entraînez le modèle ARIMA
        try:
            self.arima_model = ARIMA(
                weekly_data, 
                order=self.order,
                seasonal_order=self.seasonal_order
            ).fit()
            
            print("Modèle ARIMA entraîné avec succès")
            
            # Extraire la tendance
            self.trend_series = pd.Series(
                self.arima_model.predict(
                    start=0, 
                    end=len(weekly_data)-1
                ),
                index=weekly_data.index
            )
            
        except Exception as e:
            print(f"Erreur lors de l'entraînement du modèle ARIMA: {e}")
            # Fallback: utiliser une moyenne mobile simple
            self.trend_series = weekly_data.rolling(window=4).mean().fillna(weekly_data.mean())
            
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        
        # Créez une colonne pour la tendance ARIMA
        trend_values = []
        
        for date in X_copy[self.date_col]:
            # Trouvez la semaine correspondante
            week_start = pd.Timestamp(date).to_period('W').start_time
            
            if week_start in self.trend_series.index:
                trend_value = self.trend_series[week_start]
            else:
                # Pour les dates futures, utilisez la dernière prédiction disponible
                # ou une extrapolation si nécessaire
                if week_start > self.trend_series.index[-1]:
                    # Prédiction pour les dates futures (hors échantillon)
                    if self.arima_model:
                        try:
                            steps = (week_start - self.trend_series.index[-1]).days // 7 + 1
                            forecast = self.arima_model.forecast(steps=steps)
                            trend_value = forecast[-1]
                        except:
                            trend_value = self.trend_series.iloc[-1]  # Fallback
                    else:
                        trend_value = self.trend_series.iloc[-1]  # Fallback
                else:
                    # Pour les dates passées mais manquantes, utilisez la valeur la plus proche
                    closest_date = self.trend_series.index[np.abs(self.trend_series.index - week_start).argmin()]
                    trend_value = self.trend_series[closest_date]
            
            trend_values.append(trend_value)
        
        # Ajoutez la tendance comme feature
        X_copy['arima_trend'] = trend_values
        
        # Ajoutez la saisonnalité si disponible (mois, jour de la semaine)
        X_copy['release_month'] = X_copy[self.date_col].dt.month
        X_copy['release_dayofweek'] = X_copy[self.date_col].dt.dayofweek
        
        return X_copy

In [17]:
def build_hybrid_model(X_train, y_train, X_test, y_test):
    # Initialisation du générateur de features ARIMA
    arima_generator = ARIMAFeatureGenerator(
        order=(1, 1, 1),  # Paramètres ARIMA à ajuster selon vos données
        date_col='date',
        target_col='french_first_week_boxoffice'
    )
    
    # Génération des features ARIMA
    arima_generator.fit(X_train, y_train)
    X_train_arima = arima_generator.transform(X_train)
    X_test_arima = arima_generator.transform(X_test)
    
    # Intégrer ces nouvelles features dans votre pipeline existant
    # Ajoutez 'arima_trend' à vos features numériques
    numerical_features_extended = numerical_features + ['arima_trend', 'release_month', 'release_dayofweek']
    
    # Reconstruire le preprocessor avec ces nouvelles colonnes
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Attention: supprimez 'date' des colonnes car nous l'avons déjà utilisée pour ARIMA
    preprocessor_extended = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features_extended),
            ('cat', categorical_transformer, categorical_features),
            ('list', list_categorical_transformer, list_categorical_features)
        ],
        remainder='drop'  # Ignorer les autres colonnes, y compris 'date'
    )
    
    # Pipeline complet
    hybrid_model = Pipeline([
        ('preprocessor', preprocessor_extended),
        ('xgboost', xgb.XGBRegressor(
        ))
    ])
    
    # Entraînement et évaluation
    hybrid_model.fit(X_train_arima, y_train)
    y_pred = hybrid_model.predict(X_test_arima)
    
    # Métriques
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) 
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))  # RMSE manuel
    r2 = r2_score(y_test, y_pred)
    
    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape:.2f}%")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²: {r2:.4f}")
    
    return hybrid_model, arima_generator, y_pred

In [18]:
build_hybrid_model(X_train, y_train, X_test, y_test)

ADF Statistic: -23.790081840614913
p-value: 0.0
Modèle ARIMA entraîné avec succès
MAE: 84170.58
MAPE: 38.65%
RMSE: 235751.68
R²: 0.4202




(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='median')),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['critics_score', 'length',
                                                    'viewers_score',
                                                    'arima_trend',
                                                    'release_month',
                                                    'release_dayofweek']),
                                                  ('cat',
                                                   Pipeline(steps=[('imputer',
                                                               