In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.preprocessing import PolynomialFeatures

In [15]:
df = pd.read_parquet("allocine_spider_clean.parquet")
df.head(5)

Unnamed: 0,actors,critics_score,date,directors,editor,genre,langage,length,nationality,viewers_score,us_first_week_boxoffice,french_first_week_boxoffice,french_visa,title,vo_title,url
0,"[Arieh Worthalter, Arthur Harari, Stéphan Guér...",4.4,2023-09-27,[Cédric Kahn],Ad Vitam,"[Policier, Drame, Historique, Judiciaire]",[Français],116.0,[France],3.9,,110011.0,157303,Le Procès Goldman,,/article/fichearticle_gen_carticle=1000093547....
1,"[Monica Bellucci, Vincent Cassel, Albert Dupon...",3.5,2020-08-26,[Gaspar Noé],Carlotta Films,"[Drame, Thriller]","[Anglais, Français, Italien, Espagnol]",90.0,[France],,,2905.0,153336,Irréversible - Inversion Intégrale,,/article/fichearticle_gen_carticle=18692477.html
2,[Thom Hoffman],2.8,2024-08-07,"[Richard Claus, Karsten Kiilerich]",Le Pacte,"[Aventure, Animation, Comédie, Famille]",[Néerlandais],84.0,"[Danemark, France, Allemagne, Pays-Bas]",3.0,,39119.0,160622,Petit Panda en Afrique,Panda Bear in Africa,/article/fichearticle_gen_carticle=1000096064....
3,"[Lou de Laâge, Raphaël Personnaz, Isabelle Car...",3.3,2022-12-21,[Olivier Treiner],SND,[Drame],[Français],120.0,[France],3.8,,64039.0,152607,Le Tourbillon de la vie,,/article/fichearticle_gen_carticle=1000006465....
4,"[Michael B. Jordan, Jamie Foxx, Brie Larson, R...",3.0,2020-01-29,[Destin Daniel Cretton],Warner Bros. France,"[Biopic, Drame]",[Anglais],137.0,[U.S.A.],4.1,9713228.0,113153.0,152118,La Voie de la justice,Just Mercy,/video/player_gen_cmedia=19586793&cfilm=239735...


In [5]:
df.isnull().sum()

actors                          151
critics_score                    22
date                              0
director                         15
editor                            0
genre                             0
langage                           0
length                           10
nationality                       6
viewers_score                   482
french_first_week_boxoffice       0
french_visa                       0
title                             0
vo_title                       4770
url                               0
dtype: int64

In [16]:
features_of_interest = [
    'actors',
    'date',
    'directors',
    'editor',
    'genre',
    'langage',
    'length',
    'nationality'
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['length']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['actors', 'genre', 'langage', 'nationality']


list_categorical_features = ['actors', 'genre', 'langage', 'nationality']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])

target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

In [17]:
class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlbs = {}  # Stocke un MultiLabelBinarizer pour chaque colonne
    
    def fit(self, X, y=None):
        for col in X.columns:
            self.mlbs[col] = MultiLabelBinarizer()
            self.mlbs[col].fit(X[col])
        return self
    
    def transform(self, X):
        transformed_list = []
        for col in X.columns:
            transformed = self.mlbs[col].transform(X[col])
            new_columns = [f"{col}_{label}" for label in self.mlbs[col].classes_]
            transformed_list.append(pd.DataFrame(transformed, columns=new_columns, index=X.index))
        
        return pd.concat(transformed_list, axis=1)
    
    def get_feature_names_out(self, input_features=None):
        # Collecter tous les noms de colonnes de sortie
        feature_names = []
        for col in self.mlbs.keys():
            feature_names.extend([f"{col}_{label}" for label in self.mlbs[col].classes_])
        return np.array(feature_names)

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Bourin... à changer mais j'ai même pas vérifier s'il manquait des données...
    ('scaler', StandardScaler())
])

class CustomDateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_names_out = ['year', 'month', 'day', 'dayofweek']
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        result = pd.DataFrame({
            'year': X['date'].dt.year,
            'month': X['date'].dt.month,
            'day': X['date'].dt.day,
            'dayofweek': X['date'].dt.dayofweek
        })
        return result
        
    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_out)

date_transformer = Pipeline(steps=[
    ('date_features', CustomDateTransformer())
])

# Toujours bourin :p
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

list_categorical_transformer = Pipeline(steps=[
    ('multi_label', MultiLabelBinarizerTransformer())
])

# Création du préprocesseur (sans les colonnes déjà transformées)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

In [38]:
linear_regression_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('linearregressoin', LinearRegression())
])

linear_regression_model.fit(X_train,y_train)
y_pred = linear_regression_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape} %")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

MAE: 563175.7918972133
MAPE: 461.3643001980008 %
RMSE: 945694.8043167448
R²: 0.05787069546565793




In [9]:
list_categorical_features = ['genre', 'langage', 'nationality', 'actors']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])
    
df['french_prod']= df['nationality'].apply(lambda x: 1 if "France" in x else 0)
df['usa_prod']= df['nationality'].apply(lambda x: 1 if "U.S.A." in x else 0)


list_categorical_features = ['genre', 'langage']
features_of_interest = [
    'french_prod',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'usa_prod',
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['length']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['genre', 'langage']


target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

linear_regression_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('linearregressoin', LinearRegression())
])

linear_regression_model.fit(X_train,y_train)
y_pred = linear_regression_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape} %")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

MAE: 264972.39873315766
MAPE: 263.1591880094103 %
RMSE: 591416.6320177614
R²: 0.6315355659804622




In [10]:
#Lasso

list_categorical_features = ['genre', 'langage', 'nationality', 'actors']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])
    
df['french_prod']= df['nationality'].apply(lambda x: 1 if "France" in x else 0)
df['usa_prod']= df['nationality'].apply(lambda x: 1 if "U.S.A." in x else 0)


list_categorical_features = ['genre', 'langage']
features_of_interest = [
    'french_prod',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'usa_prod',
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['length']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['genre', 'langage']


target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

lasso_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('lassomodel', Lasso(alpha=10, random_state=42))
])

lasso_model.fit(X_train,y_train)
y_pred = lasso_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape} %")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

MAE: 174586.29661733532
MAPE: 116.66422190675844 %
RMSE: 540647.6839939922
R²: 0.6920804946973367




In [28]:
# Ridge

list_categorical_features = ['genre', 'langage', 'nationality', 'actors']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])
    
df['french_prod']= df['nationality'].apply(lambda x: 1 if "France" in x else 0)
df['usa_prod']= df['nationality'].apply(lambda x: 1 if "U.S.A." in x else 0)


list_categorical_features = ['genre', 'langage']
features_of_interest = [
    'french_prod',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'usa_prod',
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['length']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['genre', 'langage']


target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

ridge_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('rigdemodel', Ridge())
])

ridge_model.fit(X_train,y_train)
y_pred = ridge_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape} %")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

MAE: 173849.23317083772
MAPE: 111.79013991793681 %
RMSE: 698292.670654377
R²: 0.48633086151555704




In [29]:
# Elasticnet

list_categorical_features = ['genre', 'langage', 'nationality', 'actors']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])
    
df['french_prod']= df['nationality'].apply(lambda x: 1 if "France" in x else 0)
df['usa_prod']= df['nationality'].apply(lambda x: 1 if "U.S.A." in x else 0)


list_categorical_features = ['genre', 'langage']
features_of_interest = [
    'french_prod',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'usa_prod',
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['length']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['genre', 'langage']


target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

elastic_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('elasticnet', ElasticNet(alpha=1))
])

elastic_model.fit(X_train,y_train)
y_pred = elastic_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape} %")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

MAE: 186919.11723180313
MAPE: 127.04269509226741 %
RMSE: 956465.8093908563
R²: 0.036287689940789125




In [31]:
# Grille des valeurs de alpha à tester
param_grid = {
    'lassomodel__alpha': np.logspace(-3, 2, 20)  # de 0.001 à 100
}

# Pipeline complet (repris de ton code)
lasso_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('lassomodel', Lasso(random_state=42))
])

# GridSearch
grid_search = GridSearchCV(
    lasso_model,
    param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# Fit
grid_search.fit(X_train, y_train)

# Meilleur modèle
best_lasso = grid_search.best_estimator_
print(f"Meilleur alpha : {grid_search.best_params_['lassomodel__alpha']}")
print(f"Score MAE (cross-val) : {-grid_search.best_score_}")

# Évaluation sur le test set
y_pred = best_lasso.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- Évaluation finale sur le test set ---")
print(f"MAE: {mae}")
print(f"MAPE: {mape * 100:.2f} %")
print(f"RMSE: {rmse}") 
print(f"R²: {r2}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Meilleur alpha : 0.001
Score MAE (cross-val) : nan

--- Évaluation finale sur le test set ---
MAE: 268239.9141353349
MAPE: 24906.97 %
RMSE: 593319.1631020171
R²: 0.6291611228196736


  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:

# Pipeline
ridge_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('ridge_model', Ridge(random_state=42))
])

# Grille de recherche (valide pour Ridge avec sparse)
param_grid = {
    'ridge_model__alpha': [1, 10, 100, 290, 500],
    'ridge_model__solver': ['lsqr', 'sparse_cg', 'sag', 'saga'],  # on retire 'svd' ici pour éviter les soucis
    'ridge_model__fit_intercept': [True, False],
}

# GridSearchCV
grid_search = GridSearchCV(
    ridge_model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=1
)

# Fit
grid_search.fit(X_train, y_train)

# Meilleur modèle
best_ridge = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f"✅ Meilleur alpha : {best_params['ridge_model__alpha']}")
print(f"✅ Meilleur solver : {best_params['ridge_model__solver']}")
print(f"✅ Meilleur fit_intercept : {best_params['ridge_model__fit_intercept']}")
print(f"Score MAE (cross-val) : {-grid_search.best_score_:.2f}")

# Évaluation sur test set
y_pred = best_ridge.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n--- 📊 Évaluation sur le test set ---")
print(f"MAE: {mae:.2f}")
print(f"MAPE: {mape * 100:.2f} %")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")


Fitting 5 folds for each of 40 candidates, totalling 200 fits


60 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/home/utilisateur/Documents/cinema/new_is_always_better/modelisation/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/utilisateur/Documents/cinema/new_is_always_better/modelisation/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/utilisateur/Documents/cinema/new_is_always_better/modelisation/.venv/lib/python3.12/site

✅ Meilleur alpha : 1
✅ Meilleur solver : lsqr
✅ Meilleur fit_intercept : True
Score MAE (cross-val) : nan

--- 📊 Évaluation sur le test set ---
MAE: 174012.37
MAPE: 11031.66 %
RMSE: 707366.62
R²: 0.4729




In [37]:
# Pipeline avec ElasticNet
elastic_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('elasticnet', ElasticNet(random_state=42, max_iter=10000))
])

# Grille des hyperparamètres à tester
param_grid = {
    'elasticnet__alpha': [0.01, 0.1, 1, 10, 100],
    'elasticnet__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9, 1],
    'elasticnet__fit_intercept': [True, False]
}

# GridSearch
grid_search = GridSearchCV(
    elastic_model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',  # ou 'r2', 'neg_root_mean_squared_error' etc.
    cv=5,
    n_jobs=-1,
    verbose=1
)

# Fit du GridSearch
grid_search.fit(X_train, y_train)

# Meilleur modèle
best_elastic = grid_search.best_estimator_
best_params = grid_search.best_params_

print(f"\n✅ Meilleurs paramètres :")
print(f"Alpha : {best_params['elasticnet__alpha']}")
print(f"L1_ratio : {best_params['elasticnet__l1_ratio']}")
print(f"Fit_intercept : {best_params['elasticnet__fit_intercept']}")
print(f"Score MAE (cross-val) : {-grid_search.best_score_:.2f}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
60 fits failed out of a total of 300.
The score on these train-test partitions for these parameters 


✅ Meilleurs paramètres :
Alpha : 0.01
L1_ratio : 0.1
Fit_intercept : True
Score MAE (cross-val) : nan


Lasso seems to be the best model of all linear model.

In [17]:
df.isnull().sum()

actors                            0
critics_score                    22
date                              0
director                         15
editor                            0
genre                             0
langage                           0
length                           10
nationality                       0
viewers_score                   482
french_first_week_boxoffice       0
french_visa                       0
title                             0
vo_title                       4770
url                               0
french_prod                       0
usa_prod                          0
dtype: int64

In [18]:
#Lasso

list_categorical_features = ['genre', 'langage', 'nationality', 'actors']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])
    
df['french_prod']= df['nationality'].apply(lambda x: 1 if "France" in x else 0)
df['usa_prod']= df['nationality'].apply(lambda x: 1 if "U.S.A." in x else 0)


list_categorical_features = ['genre', 'langage']
features_of_interest = [
    'french_prod',
    'date',
    'directors',
    'editor',
    'genre',
    'langage',
    'length',
    'usa_prod',
]

info_film = ['french_visa', 'title', 'vo_title', 'url']

numerical_features = ['length']
date_feature = ['date']
categorical_features = ['directors', 'editor']
list_categorical_features = ['genre', 'langage']


target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough'
)

lasso_model = Pipeline([
    ('preprocessor', preprocessor),  
    ('lassomodel', Lasso(alpha=10, random_state=42))
])

lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MAPE: {mape} %")
print(f"RMSE: {rmse}")
print(f"R²: {r2}")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Test avec les acteurs