In [2]:
import pandas as pd
# from azure.ai.ml import MLClient
# from azure.identity import DefaultAzureCredential
from sklearn.model_selection import train_test_split

# ml_client = MLClient.from_config(credential=DefaultAzureCredential())
# data_asset = ml_client.data.get("allo_cine", version="1.1")

df = pd.read_parquet("allocine_spider_clean.parquet")
df.head()

Unnamed: 0,actors,critics_score,date,director,editor,genre,langage,length,nationality,viewers_score,french_first_week_boxoffice,french_visa,title,vo_title,url
0,"[Amruta Subhash, Archit Deodhar, Parth Bhalerao]",3.5,2015-10-07,Avinash Arun,Les Films du Préau,"[Aventure, Drame]",[Tamoul],107,[Inde],3.3,735,142936,La Forteresse,Killa,/article/fichearticle_gen_carticle=18646392.html
1,"[Yvan Attal, Sophie Quinton, Nicole Garcia]",3.5,2012-03-14,Lucas Belvaux,Diaphana Distribution,[Drame],[Français],104,[France],2.6,104634,127961,38 témoins,,/diaporamas/cinema/diaporama-18712695/
2,"[Thure Lindhardt, Zachary Booth, Julianne Nich...",3.5,2012-08-22,Ira Sachs,KMBO,[Drame],"[Danois, Anglais]",101,[U.S.A.],3.4,5908,133557,Keep the Lights On,,/article/fichearticle_gen_carticle=18616043.html
3,"[Rita Blanco, Joaquim de Almeida, Roland Giraud]",3.5,2013-04-24,Ruben Alves,Pathé Films,[Comédie],"[Anglais, Français]",91,"[Portugal, France]",3.9,288750,124489,La Cage Dorée,,/video/player_gen_cmedia=19498177&cfilm=109860...
4,"[Jessica Chastain, Jason Clarke, Joel Edgerton]",4.1,2013-01-23,Kathryn Bigelow,Universal Pictures International France,"[Action, Thriller]",[Anglais],149,[U.S.A.],3.9,237214,135377,Zero Dark Thirty,,/video/player_gen_cmedia=19433296&cfilm=193444...


In [3]:
features_of_interest = [
    'actors',
    'critics_score',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'nationality',
    'viewers_score'
]

info_film = ['french_visa', 'title', 'vo_title', 'url']


list_categorical_features = ['actors', 'genre', 'langage', 'nationality']
for col in list_categorical_features:
    df[col] = df[col].mask(df[col].isna(), ['no value'])

target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
# df.drop(target, axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.1, random_state=42)

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

numerical_features = ['critics_score', 'length', 'viewers_score']
date_feature = ['date']
categorical_features = ['director', 'editor']
list_categorical_features = ['actors', 'genre', 'langage', 'nationality']

class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.mlbs = {}  # Stocke un MultiLabelBinarizer pour chaque colonne
    
    def fit(self, X, y=None):
        for col in X.columns:
            self.mlbs[col] = MultiLabelBinarizer()
            self.mlbs[col].fit(X[col])
        return self
    
    def transform(self, X):
        transformed_list = []
        for col in X.columns:
            transformed = self.mlbs[col].transform(X[col])
            new_columns = [f"{col}_{label}" for label in self.mlbs[col].classes_]
            transformed_list.append(pd.DataFrame(transformed, columns=new_columns, index=X.index))
        
        return pd.concat(transformed_list, axis=1)

# class ListImputer(BaseEstimator, TransformerMixin):
#     def __init__(self, fill_value=['no actor']):
#         self.fill_value = fill_value
        
#     def fit(self, X, y=None):
#         return self
        
#     def transform(self, X):
#         X_copy = X.copy()
#         for i in range(len(X_copy)):
#             if pd.isna(X_copy[i]):
#                 X_copy[i] = self.fill_value
#         return X_copy

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Bourin... à changer mais j'ai même pas vérifier s'il manquait des données...
    ('scaler', StandardScaler())
])

# Pour les dates je recréer plusieurs colonnes, ptetre rajouter vacances et tout... mais dans l'idée j'aimerais bien utiliser à terme un time model
date_transformer = Pipeline(steps=[
    ('date_features', FunctionTransformer(lambda x: pd.DataFrame({
        'year': x['date'].dt.year,
        'month': x['date'].dt.month,
        'day': x['date'].dt.day,
        'dayofweek': x['date'].dt.dayofweek
    })))
])

# Toujours bourin :p
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

list_categorical_transformer = Pipeline(steps=[
    ('multi_label', MultiLabelBinarizerTransformer())
])

# Fonction pour appliquer MultiLabelBinarizer sur une colonne contenant des listes
# def multilabel_binarizer_transform(df, column):
#     """Applique MultiLabelBinarizer sur une colonne et renvoie le dataframe transformé"""
#     mlb = MultiLabelBinarizer()
#     transformed = mlb.fit_transform(df[column])
#     new_columns = [f"{column}_{label}" for label in mlb.classes_]
#     return pd.DataFrame(transformed, columns=new_columns, index=df.index)

# # Appliquer MultiLabelBinarizer sur les colonnes contenant des listes
# for col in list_categorical_features:
#     X_train[col] = X_train[col].mask(X_train[col].isna(), ['no actor'])
#     transformed_df = multilabel_binarizer_transform(X_train, col)
#     X_train = pd.concat([X_train, transformed_df], axis=1)
#     X_train.drop(columns=[col], inplace=True)  # Supprime la colonne originale

# Création du préprocesseur (sans les colonnes déjà transformées)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('date', date_transformer, date_feature),
        ('cat', categorical_transformer, categorical_features),
        ('list', list_categorical_transformer, list_categorical_features)
    ],
    remainder='passthrough',
    sparse_threshold=0 # J'ai pas compris revenir la dessus
)

#        ('list', list_categorical_transformer, list_categorical_features)

In [6]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import root_mean_squared_error
dummy_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', DummyRegressor(strategy='mean'))
])
dummy_model.fit(X_train,y_train)
y_pred = dummy_model.predict(X_test)
root_mean_squared_error(y_test, y_pred)



310045.1071285374

In [7]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor())
])

gb_model.fit(X_train,y_train)
y_pred = gb_model.predict(X_test)
root_mean_squared_error(y_test, y_pred)



209858.24683131315