In [4]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy.sparse import hstack, csr_matrix
import ast
import gc

In [5]:
# Charger les données
df = pd.read_csv('clean_final2.csv')

In [6]:
df.isna().sum()

title                   0
vote_average            0
vote_count              0
release_date            0
revenue                 0
runtime                 0
budget                  0
imdb_id                 0
overview                0
popularity              0
poster_path             0
genres                  0
production_companies    0
keywords                0
id_actor                0
id_productor            0
actor_name              0
knownForTitles          0
productor_name          0
dtype: int64

In [7]:
# POIDS CONFIGURABLES
weights = {
    'text': 0.30,        # overview + keywords
    'genres': 0.25,     # genres
    'actors': 0.15,      # acteurs
    'directors': 0.20,  # producteurs
    'numeric': 0.1      # budget
}

print("Préparation des données...")

Préparation des données...


In [8]:
# Nettoyer les colonnes texte
df['overview'] = df['overview'].fillna('')
df['keywords'] = df['keywords'].fillna('')
df['actor_name'] = df['actor_name'].fillna('')
df['productor_name'] = df['productor_name'].fillna('')


In [9]:
# Préparer les genres (si c'est une string, la convertir en liste)
def parse_genres(x):
    if pd.isna(x) or x == '':
        return []
    try:
        return ast.literal_eval(x) if isinstance(x, str) else x
    except:
        return [item.strip() for item in str(x).split(',') if item.strip()]

df['genres'] = df['genres'].apply(parse_genres)

In [10]:
# Limiter les acteurs aux 5 premiers (pour économiser la RAM)
df['top_actors'] = df['actor_name'].apply(lambda x: x.split(',')[:5] if x else [])
df['top_actors'] = df['top_actors'].apply(lambda x: [item.strip() for item in x if item.strip()])

# Limiter les réalisateurs aux 5 premiers
df['top_directors'] = df['productor_name'].apply(lambda x: x.split(',')[:5] if x else [])
df['top_directors'] = df['top_directors'].apply(lambda x: [item.strip() for item in x if item.strip()])

In [11]:
features_list = []

In [12]:
# 1. Features textuelles (TF-IDF)
if weights['text'] > 0:
    text_data = df['overview'] + ' ' + df['keywords']
    tfidf = TfidfVectorizer(stop_words='english', max_features=3000, max_df=0.8, min_df=2)
    tfidf_matrix = tfidf.fit_transform(text_data)
    tfidf_matrix = tfidf_matrix * weights['text']  # Appliquer le poids
    features_list.append(tfidf_matrix)
    del text_data
    gc.collect()

In [13]:
# 2. Features des genres
if weights['genres'] > 0:
    mlb_genres = MultiLabelBinarizer()
    genres_encoded = mlb_genres.fit_transform(df['genres'])
    genres_encoded = csr_matrix(genres_encoded * weights['genres'])
    features_list.append(genres_encoded)

In [14]:
# 3. Features des acteurs avec embeddings
if weights['actors'] > 0:
    from sklearn.feature_extraction.text import CountVectorizer
    
    # Convertir les listes d'acteurs en texte
    actors_text = df['top_actors'].apply(lambda x: ' '.join(x) if x else '')
    
    # Vectorisation avec limitation
    actor_vectorizer = CountVectorizer(max_features=200, binary=True)
    actors_encoded = actor_vectorizer.fit_transform(actors_text)
    actors_encoded = csr_matrix(actors_encoded * weights['actors'])
    features_list.append(actors_encoded)

In [15]:
# 4. Features des réalisateurs avec embeddings
if weights['directors'] > 0:
    # Convertir les listes de réalisateurs en texte
    directors_text = df['top_directors'].apply(lambda x: ' '.join(x) if x else '')
    
    # Vectorisation avec limitation
    director_vectorizer = CountVectorizer(max_features=100, binary=True)
    directors_encoded = director_vectorizer.fit_transform(directors_text)
    directors_encoded = csr_matrix(directors_encoded * weights['directors'])
    features_list.append(directors_encoded)

In [16]:
# 5. Features numériques
if weights['numeric'] > 0:
    num_features = df[['budget']].fillna(0)
    scaler = MinMaxScaler()
    num_scaled = scaler.fit_transform(num_features)
    num_scaled = csr_matrix(num_scaled * weights['numeric'])
    features_list.append(num_scaled)

In [17]:
# Combiner toutes les features
print("Combinaison des features...")
combined_features = hstack(features_list)
print(f"Shape finale: {combined_features.shape}")

Combinaison des features...
Shape finale: (29104, 3320)


In [18]:
# Entraîner le modèle
print("Entraînement du modèle...")
nn_model = NearestNeighbors(metric='cosine', algorithm='brute', n_jobs=-1)
nn_model.fit(combined_features)

Entraînement du modèle...


In [19]:
# Score de qualité pour boost
quality_features = df[['vote_average', 'popularity', 'vote_count']].fillna(0)
quality_scaler = RobustScaler()                    # RobustScaler
quality_scaled = quality_scaler.fit_transform(quality_features)
quality_score = quality_scaled.mean(axis=1)
quality_score = (quality_score - quality_score.min()) / (quality_score.max() - quality_score.min())

print("Prêt pour les recommandations!")

Prêt pour les recommandations!


In [20]:
def get_recommendations(title, top_n=10):
    # Trouver l'index du film
    idx = df[df['title'].str.lower() == title.lower()].index
    if len(idx) == 0:
        return f"Le film '{title}' est introuvable."
    idx = idx[0]
    
    # Rechercher les films similaires
    distances, indices = nn_model.kneighbors(combined_features[idx], n_neighbors=top_n*2)           # attention répétition
    
    # Exclure le film lui-même
    similar_indices = indices.flatten()[1:]
    similarities = 1 - distances.flatten()[1:]
    
    # Créer les résultats
    results = pd.DataFrame({
        'title': df.iloc[similar_indices]['title'].values,
        'similarity': similarities,
        'vote_average': df.iloc[similar_indices]['vote_average'].values,
        'popularity' : df.iloc[similar_indices]['popularity'],
        'quality_score': quality_score[similar_indices]
    })
    
    # Score final : similarité (90%) + qualité (10%)
    results['final_score'] = results['similarity'] * 0.9 + results['quality_score'] * 0.1
    results = results.sort_values('final_score', ascending=False)
    
    return results.head(top_n)[['title', 'similarity', 'vote_average', 'popularity', 'final_score']]

In [28]:
# Test
film_input = input("Entrez le nom d'un film : ")
print(f"\n=== Test avec {film_input} ===")
recommendations = get_recommendations(film_input)
print(recommendations)


=== Test avec Skyfall ===
                       title  similarity  vote_average  popularity  \
236                  Spectre    0.839395         6.533      33.598   
232            Casino Royale    0.688995         7.543      43.407   
407        Quantum of Solace    0.658598         6.314      33.985   
2853   Never Say Never Again    0.648082         6.063      22.607   
2102        Live and Let Die    0.640410         6.484      25.922   
1473   From Russia with Love    0.630075         7.072      27.531   
1190                  Dr. No    0.627330         6.995      28.018   
1220         Die Another Day    0.627042         5.963      28.423   
21377  When Eight Bells Toll    0.630859         5.700       8.610   
1865             Thunderball    0.625201         6.644      27.879   

       final_score  
236       0.767326  
232       0.632583  
407       0.602045  
2853      0.586239  
2102      0.580066  
1473      0.571798  
1190      0.569896  
1220      0.569341  
21377     0.5