In [127]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from datetime import date
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MultiLabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array
import re

In [128]:
# Load the movie feature table that the preprocessing pipeline is fitted on.
df_movie = pd.read_parquet("df_movie.parquet")
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)
df_movie.head(3)

Unnamed: 0,id_movie,original_language,popularity_movie,release_date,title,vote_average,vote_count,genre,gender_director,name_director,popularity_director,birthday_director,deathday_director,place_of_birth_director,actor_1_mean_movie,actor_2_mean_movie,actor_3_mean_movie,actor_4_mean_movie,actor_5_mean_movie,actor_gender_1,actor_gender_2,actor_gender_3,actor_gender_4,actor_gender_5,actor_popularity_1,actor_popularity_2,actor_popularity_3,actor_popularity_4,actor_popularity_5
0,11224,en,79.005,1970,Cendrillon,7.041,6685,"[Family, Fantasy, Animation, Romance]",2,Hamilton Luske,2.479,1900,1960,"Chicago, Illinois, USA",79.0,62.53,62.53,79.0,79.0,1.0,1.0,1.0,1.0,1.0,5.25,7.009,8.036,4.592,2.626
1,389,en,58.94,1970,12 Hommes en colère,8.5,8655,[Drama],2,Sidney Lumet,4.824,1920,2010,"Philadelphia, Pennsylvania, USA",27.72,27.4,44.92,58.94,58.94,2.0,2.0,2.0,2.0,2.0,9.78,7.985,10.268,9.454,4.698
2,6844,en,56.194,1970,Les Dix Commandements,7.756,1603,"[Drama, History]",2,Cecil B. DeMille,6.705,1880,1950,"Ashfield, Massachusetts, USA",29.45,38.47,56.19,56.19,56.19,2.0,2.0,1.0,2.0,1.0,22.823,8.561,9.878,9.671,15.539


In [129]:
# Column groups, one per preprocessing branch.

# Numeric features.
# NOTE(review): 'actor_1_mean_movie' is not listed although actors 2-5 are, and
# 'birthday_director' is also present in date_cols below, so that column is
# selected by two groups -- confirm both are intentional.
numeric_cols = [
    'popularity_movie', 'vote_average', 'vote_count',
    'gender_director', 'popularity_director', 'birthday_director',
    *(f'actor_{rank}_mean_movie' for rank in range(2, 6)),
    *(f'actor_gender_{rank}' for rank in range(1, 6)),
    *(f'actor_popularity_{rank}' for rank in range(1, 6)),
]

# Free-text categorical features.
categorical_cols = [
    'original_language',
    'name_director',
    'place_of_birth_director',
]

# Multi-label feature (each cell holds a collection of genre labels).
binarizer_cols = ['genre']

# Date-like features.
date_cols = [
    'release_date',
    'birthday_director',
    'deathday_director',
]

In [130]:
# Custom transformer for MultiLabelBinarizer
class MultiLabelBinarizerPipelineFriendly(BaseEstimator, TransformerMixin):
    """Wrap ``MultiLabelBinarizer`` behind the ``fit(X, y)`` / ``transform(X)`` API.

    All work is delegated to the wrapped binarizer stored in ``self.mlb``;
    this class only adapts the method signatures.
    """

    def __init__(self):
        # The wrapped binarizer; it holds the fitted state after fit().
        self.mlb = MultiLabelBinarizer()

    def fit(self, X, y=None):
        """Fit the wrapped binarizer on ``X``; ``y`` is accepted but unused.

        Returns:
            self, so calls can be chained.
        """
        self.mlb.fit(X)
        return self

    def transform(self, X):
        """Return the wrapped binarizer's encoding of ``X``."""
        return self.mlb.transform(X)

    def get_feature_names_out(self, input_features=None):
        """Return the wrapped binarizer's ``classes_``; ``input_features`` is unused."""
        return self.mlb.classes_

In [131]:
# Numeric columns: standardised with StandardScaler.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: one-hot encoded; categories unseen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Date-like columns: mapped to ordinal codes.
date_transformer = Pipeline([('date', OrdinalEncoder())])

# Combine every branch in a single ColumnTransformer.
# 'genre' is given as a bare string (not a list of columns), unlike the
# other branches which receive lists of column names.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('genres', MultiLabelBinarizerPipelineFriendly(), 'genre'),
    ('date', date_transformer, date_cols),
])

pipeline = Pipeline([('preprocessor', preprocessor)])

In [132]:
# Display the pipeline's structure.
pipeline

In [133]:
# Fit the preprocessing pipeline on the movie table and transform it.
processed_data = pipeline.fit_transform(df_movie)

# Dense, labelled view of the transformed features (for inspection only).
# toarray() yields a plain ndarray, whereas todense() returns the legacy
# np.matrix type.
transformed_df = pd.DataFrame(processed_data.toarray(), columns=pipeline["preprocessor"].get_feature_names_out())

In [134]:
# (n_films, n_features); read directly from the matrix -- no need to densify
# the whole matrix just to get its shape.
processed_data.shape

(5771, 3863)

In [135]:
# Initialise the KNN model
knn_model = NearestNeighbors(n_neighbors=6, metric='manhattan')  # the 5 nearest neighbours, not counting the reference point itself
knn_model.fit(processed_data)

In [136]:
# Example: recommendations for a single item
def recommend_similar_films(title,
                              data=processed_data,
                              model: NearestNeighbors=knn_model,
                              original_data: pd.DataFrame=df_movie,
                              n_neighbors: int=6) -> list[int]:
    """
    Return the ``id_movie`` of the films closest to the film with this title.

    Args:
        title: Title of the reference film, compared for exact equality with
            ``original_data['title']``. When several rows share the title,
            the first one is used.
        data: Transformed feature matrix queried through ``model``. Row ``k``
            of ``data`` must describe row ``k`` of ``original_data``.
        model: Fitted ``NearestNeighbors`` model.
        original_data: Original movie table; must provide the ``title`` and
            ``id_movie`` columns.
        n_neighbors: Number of neighbours requested from the model, reference
            film included (6 yields 5 recommendations).

    Returns:
        At most ``n_neighbors - 1`` ``id_movie`` values, in the order returned
        by ``model.kneighbors``; the reference film itself is never included.

    Raises:
        IndexError: If no row of ``original_data`` has the given title.
    """
    # Work with row *positions* rather than index labels, so the lookup stays
    # correct even if ``original_data`` does not have a default RangeIndex.
    matches = np.flatnonzero((original_data['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No film titled {title!r} found in original_data")
    position = int(matches[0])

    # A one-row slice keeps the query two-dimensional, as kneighbors expects.
    _, indices = model.kneighbors(data[position:position + 1], n_neighbors=n_neighbors)

    # Drop the reference film explicitly: with distance ties (e.g. duplicated
    # rows) it is not guaranteed to be the first neighbour returned.
    neighbour_positions = [int(i) for i in indices[0] if i != position][:n_neighbors - 1]

    id_column = original_data['id_movie']
    return [int(id_column.iloc[i]) for i in neighbour_positions]



In [137]:
# Example query: id_movie values recommended for the film titled 'Dune'.
recommend_similar_films('Dune')

found 0 physical cores < 1
  File "c:\Users\sulta\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[985, 9426, 709, 2756, 638]

In [138]:
# Load the compiled table that includes actor names (used by the lookup helpers below).
df_final_compile = pd.read_parquet("df_final_compile.parquet")
# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)
df_final_compile.head(3)

Unnamed: 0,id_movie,backdrop_path,original_language,overview,popularity_movie,poster_path,release_date,title,vote_average,vote_count,genre,gender_director,name_director,popularity_director,birthday_director,deathday_director,place_of_birth_director,movie_id,actor_name_1,actor_name_2,actor_name_3,actor_name_4,actor_name_5,actor_gender_1,actor_gender_2,actor_gender_3,actor_gender_4,actor_gender_5,actor_popularity_1,actor_popularity_2,actor_popularity_3,actor_popularity_4,actor_popularity_5,actor_birthday_1,actor_birthday_2,actor_birthday_3,actor_birthday_4,actor_birthday_5
0,11224,/rH0DPF7pB35jxLxKb3JRUgCrrnp.jpg,en,"D’un coup de baguette magique, la fée Marraine...",79.005,/nqSJwBdvG89uHRpDDdaAy5YhogZ.jpg,1950-02-22,Cendrillon,7.041,6685,"[Family, Fantasy, Animation, Romance]",2,Hamilton Luske,2.479,1903-10-16,1968-02-19,"Chicago, Illinois, USA",11224,Ilene Woods,Eleanor Audley,Verna Felton,Claire Du Brey,Rhoda Williams,1.0,1.0,1.0,1.0,1.0,5.25,7.009,8.036,4.592,2.626,1929-05-05,1905-11-19,1890-07-20,1892-08-30,1930-07-03
1,389,/qqHQsStV6exghCM7zbObuYBiYxw.jpg,en,Un jeune homme d'origine modeste est accusé du...,58.94,/bPImGSvZtG2tvsJ9bVLrIECRDnB.jpg,1957-04-10,12 Hommes en colère,8.5,8655,[Drama],2,Sidney Lumet,4.824,1924-06-25,2011-04-09,"Philadelphia, Pennsylvania, USA",389,Martin Balsam,John Fiedler,Lee J. Cobb,E.G. Marshall,Jack Klugman,2.0,2.0,2.0,2.0,2.0,9.78,7.985,10.268,9.454,4.698,1919-11-04,1925-02-03,1911-12-08,1914-06-18,1922-04-27
2,6844,/rI1u4JgtnAZC99wXX4gwro4yaxe.jpg,en,Évocation de la vie de Moïse sauvé à sa naissa...,56.194,/q0KM14O75n0h4324npmThHi56FG.jpg,1956-10-05,Les Dix Commandements,7.756,1603,"[Drama, History]",2,Cecil B. DeMille,6.705,1881-08-12,1959-01-21,"Ashfield, Massachusetts, USA",6844,Charlton Heston,Yul Brynner,Anne Baxter,Edward G. Robinson,Yvonne De Carlo,2.0,2.0,1.0,2.0,1.0,22.823,8.561,9.878,9.671,15.539,1923-10-04,1920-07-11,1923-05-07,1893-12-11,1922-09-01


In [139]:
# Rows where "Martin Balsam" is the value of the actor_name_5 column.
df_final_compile[df_final_compile['actor_name_5'] == "Martin Balsam"]

Unnamed: 0,id_movie,backdrop_path,original_language,overview,popularity_movie,poster_path,release_date,title,vote_average,vote_count,genre,gender_director,name_director,popularity_director,birthday_director,deathday_director,place_of_birth_director,movie_id,actor_name_1,actor_name_2,actor_name_3,actor_name_4,actor_name_5,actor_gender_1,actor_gender_2,actor_gender_3,actor_gender_4,actor_gender_5,actor_popularity_1,actor_popularity_2,actor_popularity_3,actor_popularity_4,actor_popularity_5,actor_birthday_1,actor_birthday_2,actor_birthday_3,actor_birthday_4,actor_birthday_5
86,539,/uif5fUshJrXyyDzfpzp1DLw3N0S.jpg,en,Marion Crane en a assez de ne pouvoir mener sa...,45.647,/jsyBbKrEhntRgIsIJVBBa39ZErh.jpg,1960-06-22,Psychose,8.429,10061,"[Horror, Thriller, Mystery]",2,Alfred Hitchcock,10.377,1899-08-13,1980-04-29,"Leytonstone, London, England, UK",539,Anthony Perkins,Janet Leigh,Vera Miles,John Gavin,Martin Balsam,2.0,1.0,1.0,2.0,2.0,12.232,10.725,13.957,6.335,9.78,1932-04-04,1927-07-06,1929-08-23,1931-04-08,1919-11-04
109,164,/1HMoIkfVHckgXFWrabQS7uXPF3W.jpg,en,Holly Golightly est une délicieuse call-girl t...,23.972,/nauABSXgv994NIlqWlMtuV51eqj.jpg,1961-10-06,Diamants sur canapé,7.7,4146,"[Comedy, Romance, Drama]",2,Blake Edwards,10.317,1922-07-26,2010-12-15,"Tulsa, Oklahoma, USA",164,Audrey Hepburn,George Peppard,Patricia Neal,Buddy Ebsen,Martin Balsam,1.0,2.0,1.0,2.0,2.0,15.603,18.963,8.371,11.754,9.78,1929-05-04,1928-10-01,1926-01-20,1908-04-02,1919-11-04


In [140]:
def id_movie_actor(name_actor: str, df: pd.DataFrame) -> list[int]:
    """Return the ``id_movie`` of every film in which an actor is credited.

    The actor is searched for in the ``actor_name_1`` ... ``actor_name_5``
    columns. Results are concatenated column by column (all matches in
    ``actor_name_1`` first, then ``actor_name_2``, and so on), keeping the
    row order of ``df`` within each column. A film is listed once per column
    in which the actor appears.

    Args:
        name_actor: Actor name, compared for exact equality.
        df: Table providing ``id_movie`` and the five ``actor_name_<i>`` columns.

    Returns:
        The list of matching ``id_movie`` values.

    Raises:
        KeyError: If the actor does not appear in any of the five columns
            (or if one of the required columns is missing from ``df``).
    """
    movie_ids = []
    for i in range(1, 6):
        # Filter only the rows for this actor instead of grouping every
        # actor of the table just to read a single entry.
        credited = df[f"actor_name_{i}"] == name_actor
        movie_ids.extend(df.loc[credited, 'id_movie'].tolist())

    if not movie_ids:
        raise KeyError(name_actor)
    return movie_ids

In [141]:
# Example query: id_movie values of the films crediting "50 Cent".
id_movie_actor("50 Cent", df= df_final_compile)

[10060, 299054, 440471, 13389, 449443, 307081, 139567, 44982]

In [142]:
# For each director name, the list of id_movie values of their rows.
df_final_compile.groupby('name_director')['id_movie'].agg(list)

name_director
Aamir Khan                                 [7508]
Aaron Hann                               [335866]
Aaron Schneider                          [516486]
Aaron Sorkin             [396371, 556984, 517088]
Abbas Kiarostami                          [30020]
                                   ...           
Àlex Pastor                              [674944]
Álex de la Iglesia        [12245, 435126, 179538]
Ángel Gómez Hernández                    [726208]
Ángel Manuel Soto                        [565770]
Éric Warin                               [342473]
Name: id_movie, Length: 2556, dtype: object

In [143]:
def id_movie_director(name_director: str, df: pd.DataFrame) -> list[int]:
    """Return the ``id_movie`` of every film directed by the given director.

    Args:
        name_director: Director name, compared for exact equality with
            ``df['name_director']``.
        df: Table providing the ``name_director`` and ``id_movie`` columns.

    Returns:
        The matching ``id_movie`` values, in the row order of ``df``.

    Raises:
        IndexError: If no row of ``df`` has this director.
    """
    # Filter the director's rows directly rather than grouping the whole
    # table to read a single group.
    movie_ids = df.loc[df['name_director'] == name_director, 'id_movie'].tolist()
    if not movie_ids:
        raise IndexError(f"No film directed by {name_director!r} found in df")
    return movie_ids

In [144]:
# Example query: id_movie values of the films directed by "Quentin Tarantino".
id_movie_director("Quentin Tarantino", df_final_compile)

[680, 500, 184, 5, 16869, 24, 1991, 68718, 466272, 273248, 414419]