In [None]:
import pandas as pd
# from azure.ai.ml import MLClient
# from azure.identity import DefaultAzureCredential
from sklearn.model_selection import train_test_split

# ml_client = MLClient.from_config(credential=DefaultAzureCredential())
# data_asset = ml_client.data.get("allo_cine", version="1.1")

# Read the cleaned parquet export from the working directory and preview
# the first rows as the cell output.
DATA_PATH = "allocine_spider_clean.parquet"
df = pd.read_parquet(DATA_PATH)
df.head()

Unnamed: 0,actors,critics_score,date,director,editor,genre,langage,length,nationality,viewers_score,french_first_week_boxoffice,french_visa,title,vo_title,url
0,"[Ami Tôma, Mana Ashida, Yûki Kaji]",3.5,2023-09-06,Keiichi Hara,Eurozoom,"[Animation, Drame, Fantastique]",[Japonais],116,[Japon],3.9,20761,160405,Le Château Solitaire dans le Miroir,Kagami no Kojou,/video/player_gen_cmedia=19602173&cfilm=311559...
1,"[Bárbara Lennie, Irene Escolar, Itziar Manero]",3.5,2023-11-29,Itsaso Arana,Arizona Distribution,[Comédie dramatique],[Espagnol],85,[Espagne],3.0,2991,160976,Les Filles vont bien,Las chicas están bien,/article/fichearticle_gen_carticle=1000099469....
2,,3.5,2024-07-10,Rashin Kheyrieh,Les Films du Whippet,"[Animation, Famille]",[Silencieux],38,"[Iran, Russie]",2.8,106,2024002054,L'Arbre à Contes,,/article/fichearticle_gen_carticle=1000089882....
3,"[Colman Domingo, Clarence Maclin, Sean San Jose]",3.5,2025-01-29,Greg Kwedar,Metropolitan FilmExport,[Drame],[Anglais],107,[U.S.A.],3.9,24945,163554,Sing Sing,,/article/fichearticle_gen_carticle=1000131867....
4,"[Fanny Ardant, Mathieu Kassovitz, Laetitia Dosch]",3.5,2024-03-13,Thierry Klifa,Apollo Films,"[Comédie dramatique, Policier]",[Français],116,[France],2.9,38177,152501,Les Rois de la Piste,,/article/fichearticle_gen_carticle=1000070726....


In [4]:
# Predictor columns handed to the model.
features_of_interest = [
    'actors',
    'critics_score',
    'date',
    'director',
    'editor',
    'genre',
    'langage',
    'length',
    'nationality',
    'viewers_score'
]

# Columns that identify a film; they are not part of `features_of_interest`.
info_film = ['french_visa', 'title', 'vo_title', 'url']


def _labels_or_placeholder(labels):
    """Return ``labels`` unchanged, or ``['no value']`` when the cell is missing.

    List-like cells (lists, tuples, arrays) are returned as they are; the
    list-like check comes first because ``pd.isna`` on an array is not a
    single boolean. Any other non-missing scalar is also left untouched.
    """
    if pd.api.types.is_list_like(labels):
        return labels
    return ['no value'] if pd.isna(labels) else labels


# Columns whose cells hold a list of labels.
list_categorical_features = ['actors', 'genre', 'langage', 'nationality']
for col in list_categorical_features:
    # `Series.mask(cond, ['no value'])` broadcasts the one-element list and
    # stores the bare string 'no value'; a label binarizer would then split
    # that string into single characters. Replacing cell by cell guarantees
    # that each missing entry becomes its own one-label list.
    df[col] = df[col].apply(_labels_or_placeholder)

target = 'french_first_week_boxoffice'

X, y = (
    df[features_of_interest],
    df[target]
)
# df.drop(target, axis=1)

In [5]:
# Shuffled 90/10 train/test split; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

# Column groups, one per preprocessing strategy.
numerical_features = [
    "critics_score",
    "length",
    "viewers_score",
]
date_feature = ["date"]
categorical_features = [
    "director",
    "editor",
]
# Columns whose cells hold a list of labels.
list_categorical_features = [
    "actors",
    "genre",
    "langage",
    "nationality",
]

class MultiLabelBinarizerTransformer(BaseEstimator, TransformerMixin):
    """Multi-hot encode DataFrame columns whose cells hold collections of labels.

    One ``MultiLabelBinarizer`` is fitted per input column. Every label seen
    while fitting a column becomes an indicator column named
    ``"<column>_<label>"`` in the transformed output.

    Attributes
    ----------
    mlbs : dict
        Maps each fitted column name to its ``MultiLabelBinarizer``.
    """

    def __init__(self):
        self.mlbs = {}  # one MultiLabelBinarizer per fitted column

    def fit(self, X, y=None):
        """Fit a fresh binarizer on every column of the DataFrame ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Rebuild the mapping from scratch so that a refit on a different
        # set of columns never keeps binarizers from a previous fit.
        self.mlbs = {col: MultiLabelBinarizer().fit(X[col]) for col in X.columns}
        return self

    def _output_names(self, col):
        """Names of the indicator columns generated for the fitted column ``col``."""
        return [f"{col}_{label}" for label in self.mlbs[col].classes_]

    def transform(self, X):
        """Return a DataFrame of indicator columns, indexed like ``X``.

        Raises ``KeyError`` if ``X`` has a column that was not seen in ``fit``.
        """
        encoded = [
            pd.DataFrame(
                self.mlbs[col].transform(X[col]),
                columns=self._output_names(col),
                index=X.index,
            )
            for col in X.columns
        ]
        if not encoded:
            # `pd.concat` refuses an empty list; an empty selection simply
            # yields a frame with no columns.
            return pd.DataFrame(index=X.index)
        return pd.concat(encoded, axis=1)

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, in the order ``transform`` emits them.

        ``input_features`` defaults to the fitted columns in fit order.
        """
        columns = list(self.mlbs) if input_features is None else input_features
        names = [name for col in columns for name in self._output_names(col)]
        return pd.Index(names, dtype=object).to_numpy()

# class ListImputer(BaseEstimator, TransformerMixin):
#     def __init__(self, fill_value=['no actor']):
#         self.fill_value = fill_value
        
#     def fit(self, X, y=None):
#         return self
        
#     def transform(self, X):
#         X_copy = X.copy()
#         for i in range(len(X_copy)):
#             if pd.isna(X_copy[i]):
#                 X_copy[i] = self.fill_value
#         return X_copy

numerical_transformer = Pipeline(steps=[
    # Crude choice, to be revisited: whether these columns actually contain
    # missing values has not been checked yet.
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


def extract_date_features(x):
    """Expand the ``date`` column of ``x`` into year, month, day and day-of-week.

    ``x`` must expose a ``date`` column supporting the ``.dt`` accessor.
    Returns a DataFrame with the columns ``year``, ``month``, ``day`` and
    ``dayofweek``, built from the ``date`` series.
    """
    dates = x['date'].dt
    return pd.DataFrame({
        'year': dates.year,
        'month': dates.month,
        'day': dates.day,
        'dayofweek': dates.dayofweek
    })


# The date is expanded into several calendar columns; holidays and similar
# signals could be added later. Longer term, the idea is to use a proper
# time-series model instead.
# A named function is used rather than a lambda because lambdas cannot be
# pickled, which would prevent saving the fitted pipeline.
date_transformer = Pipeline(steps=[
    ('date_features', FunctionTransformer(extract_date_features))
])

# Still a crude choice: impute with the most frequent value, then one-hot.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

list_categorical_transformer = Pipeline(steps=[
    ('multi_label', MultiLabelBinarizerTransformer())
])

# Fonction pour appliquer MultiLabelBinarizer sur une colonne contenant des listes
# def multilabel_binarizer_transform(df, column):
#     """Applique MultiLabelBinarizer sur une colonne et renvoie le dataframe transformé"""
#     mlb = MultiLabelBinarizer()
#     transformed = mlb.fit_transform(df[column])
#     new_columns = [f"{column}_{label}" for label in mlb.classes_]
#     return pd.DataFrame(transformed, columns=new_columns, index=df.index)

# # Appliquer MultiLabelBinarizer sur les colonnes contenant des listes
# for col in list_categorical_features:
#     X_train[col] = X_train[col].mask(X_train[col].isna(), ['no actor'])
#     transformed_df = multilabel_binarizer_transform(X_train, col)
#     X_train = pd.concat([X_train, transformed_df], axis=1)
#     X_train.drop(columns=[col], inplace=True)  # Supprime la colonne originale

# Build the preprocessor: each group of columns is routed to its own
# transformer; any column not listed is passed through unchanged.
column_transformers = [
    ('num', numerical_transformer, numerical_features),
    ('date', date_transformer, date_feature),
    ('cat', categorical_transformer, categorical_features),
    ('list', list_categorical_transformer, list_categorical_features),
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='passthrough',
    # TODO: revisit; this setting is not fully understood yet. Per the
    # scikit-learn docs, 0 makes the stacked output always dense.
    sparse_threshold=0,
)

#        ('list', list_categorical_transformer, list_categorical_features)

In [7]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import root_mean_squared_error as rmse
# Baseline: always predict the mean of the training target, then report the
# RMSE on the held-out set as the cell output.
baseline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', DummyRegressor(strategy='mean')),
]
dummy_model = Pipeline(steps=baseline_steps)
dummy_model.fit(X_train, y_train)

y_pred = dummy_model.predict(X_test)
rmse(y_test, y_pred)



206767.48441467813