✅ STEP 1 – Define the Problem <br>
Given a new movie’s metadata, recommend a set of 4–5 actors who are collectively a great fit.

تحميل البيانات

In [2]:
# -------------------------------
# ✅ Imports & Setup
# -------------------------------
import pandas as pd
import numpy as np
import json
import random
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

random.seed(42)

In [3]:

# Replace 'your_dataframe_file.csv' with the actual file path
# df = pd.read_csv('lNH_data.csv')
import gdown

file_id = "1fQ6NfLqpjuji_aEUjv_8lQZ1QffAHztt"  # Replace with your actual file ID
output_file = "downloaded_file.csv"  # You can change the output file name

gdown.download(id=file_id, output=output_file, quiet=False)



Downloading...
From: https://drive.google.com/uc?id=1fQ6NfLqpjuji_aEUjv_8lQZ1QffAHztt
To: /content/downloaded_file.csv
100%|██████████| 46.7M/46.7M [00:00<00:00, 75.6MB/s]


'downloaded_file.csv'

In [4]:



# -------------------------------
# ✅ Load Data
# -------------------------------
movies_df = pd.read_csv(output_file)  # change to your actual path

# -------------------------------
# ✅ Step 1: Extract Actor Lists
# -------------------------------
def extract_actor_list(credits_str):
    # Split the string by hyphens and strip whitespace from each actor name
    return [actor.strip() for actor in credits_str.split('-') if actor.strip()]

# Example usage assuming your dataframe is named movies_df
movies_df['actor_list'] = movies_df['credits'].apply(extract_actor_list)

# To verify, print the first actor list
print(movies_df['actor_list'].iloc[0])


# Get all unique actors
all_actors = list(set([a for actors in movies_df['actor_list'] for a in actors]))

# -------------------------------
# ✅ Step 2: Generate Movie–Actor Pairs
# -------------------------------
positive_rows = []
negative_rows = []

def generate_negative_samples(row, n=5):
    actual = set(row['actor_list'])
    negatives = list(set(random.sample(all_actors, n + len(actual))) - actual)
    return negatives[:n]

for idx, row in movies_df.iterrows():
    for actor in row['actor_list']:
        positive_rows.append({'movie_id': row['id'], 'actor': actor, 'label': 1, **row.to_dict()})
    for neg_actor in generate_negative_samples(row):
        negative_rows.append({'movie_id': row['id'], 'actor': neg_actor, 'label': 0, **row.to_dict()})

pairs_df = pd.DataFrame(positive_rows + negative_rows)

# -------------------------------
# ✅ Step 3: Feature Engineering
# -------------------------------



['Jason Statham', 'Wu Jing', 'Shuya Sophia Cai', 'Sergio Peris', 'Mencheta', 'Skyler Samuels', 'Cliff Curtis', 'Page Kennedy', 'Sienna Guillory', 'Melissanthi Mahut', 'Kiran Sonia Sawar', 'Felix Mayr', 'Whoopie van Raam', 'Guo Tao', 'Robin Hill', 'Dai Lele', 'Sui Fong Ivy Tsui', 'Stewart Alexander', 'Li Xin', 'Billy Clements', 'Ron Smoorenburg', 'Rui Shang', 'Sara Dee', 'Jonny James', 'Bai Narisu', 'Kenneth Won', 'Able Wanamakok']


In [1]:
!pip uninstall -y cudf dask-cudf cuml rmm numba-cuda numba_cuda rapids-dask-dependency


[0mFound existing installation: numba-cuda 0.11.0
Uninstalling numba-cuda-0.11.0:
  Successfully uninstalled numba-cuda-0.11.0
Found existing installation: rapids-dask-dependency 25.6.0
Uninstalling rapids-dask-dependency-25.6.0:
  Successfully uninstalled rapids-dask-dependency-25.6.0


In [8]:
import pandas as pd
import numpy as np
import json
from collections import defaultdict
from numba import njit  # only for fast Jaccard

# -----------------------------
# 1. Genre Parser
# -----------------------------
def parse_genres(genre_str):
    try:
        genres = eval(genre_str)
        return set(g.strip().lower() for g in genres if isinstance(g, str))
    except:
        return set()

movies_df['parsed_genres'] = movies_df['genres_x'].apply(parse_genres)

# -----------------------------
# 2. Build Actor → Genres Map
# -----------------------------
actor_genres_map = {}
for actor in all_actors:
    actor_movies = movies_df[movies_df['actor_list'].apply(lambda x: actor in x)]
    genres = set(g for genre_list in actor_movies['parsed_genres'] for g in genre_list)
    actor_genres_map[actor] = genres

# -----------------------------
# 3. Jaccard Function (CPU-Optimized)
# -----------------------------
@njit
def jaccard(set1_list, set2_list):
    set1 = set(set1_list)
    set2 = set(set2_list)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Pre-convert genres to list (for Numba)
pairs_df['parsed_genres'] = pairs_df['genres'].apply(parse_genres)
pairs_df['parsed_genres_list'] = pairs_df['parsed_genres'].apply(list)

# Apply Jaccard safely
pairs_df['genre_match_score'] = pairs_df.apply(
    lambda row: jaccard(row['parsed_genres_list'], list(actor_genres_map.get(row['actor'], set()))),
    axis=1
)

# -----------------------------
# 4. Actor Average Profit Map
# -----------------------------
actor_profit_map = defaultdict(list)
for idx, row in movies_df.iterrows():
    for actor in row['actor_list']:
        actor_profit_map[actor].append(row['profit'])

actor_profit_avg = {actor: np.median(profits) for actor, profits in actor_profit_map.items()}
pairs_df['actor_avg_profit'] = pairs_df['actor'].map(actor_profit_avg)

# -----------------------------
# 5. Fill Missing Values (Using Median)
# -----------------------------
for col in ['genre_match_score', 'actor_avg_profit', 'actors_avg_rating', 'actors_bayesian_rating']:
    median_val = pairs_df[col].median()
    pairs_df[col] = pairs_df[col].fillna(median_val)


KeyboardInterrupt: 

In [None]:
# -------------------------------
# ✅ Step 4: Train ML Model
# -------------------------------
features = ['budget', 'popularity', 'genre_match_score', 'actors_avg_rating',
            'actors_bayesian_rating', 'actor_avg_profit']

X = pairs_df[features]
y = pairs_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=100, random_state=42,verbose=1)
model.fit(X_train, y_train)

print("\n📊 Evaluation Report:")
print(classification_report(y_test, model.predict(X_test)))



In [None]:

# -------------------------------
# ✅ Step 5: Recommendation Function
# -------------------------------
def recommend_actors_for_movie(movie_row, actor_list, model, top_n=5):
    rows = []
    parsed_genres = parse_genres(movie_row['genres'])

    for actor in actor_list:
        genre_match_score = jaccard(parsed_genres, actor_genres_map.get(actor, set()))
        row = {
            'budget': movie_row['budget'],
            'popularity': movie_row['popularity'],
            'genre_match_score': genre_match_score,
            'actors_avg_rating': movies_df[movies_df['actor_list'].apply(lambda x: actor in x)]['actors_avg_rating'].mean() or 5.5,
            'actors_bayesian_rating': movies_df[movies_df['actor_list'].apply(lambda x: actor in x)]['actors_bayesian_rating'].mean() or 5.0,
            'actor_avg_profit': actor_profit_avg.get(actor, 0),
            'actor': actor
        }
        rows.append(row)

    pred_df = pd.DataFrame(rows)
    pred_df['score'] = model.predict_proba(pred_df[features])[:, 1]
    return pred_df.sort_values('score', ascending=False).head(top_n)[['actor', 'score']]

# -------------------------------
# ✅ Step 6: Try a Real Movie Example
# -------------------------------
# Pick any movie from your dataset
example_movie = movies_df.iloc[0]  # or filter by genre/budget etc

top_actors = recommend_actors_for_movie(example_movie, all_actors, model, top_n=5)

print("\n🎬 Recommended Actors for:")
print(f"🎞️ Title: {example_movie['title']} | Genre: {example_movie['genres']}")
print(top_actors)

# Content-based filtering

In [4]:
movies_df.head()

Unnamed: 0,id,title,genres_x,original_language,overview,popularity,production_companies,release_date,budget,revenue,...,movie_age,rating_category,age_group,normalized_primary_title,has_superstar_actor,movie_oscar,normalized_credits_list,movie_credits_oscar,company_oscars,actor_list
0,615656,Meg 2: The Trench,Action-Science Fiction-Horror,en,An exploratory dive into the deepest depths of...,8763.998,Apelles Entertainment-Warner Bros. Pictures-di...,2023-08-02,129000000.0,352056500.0,...,2.0,Average,0–10 yrs,meg2thetrench,False,0,"['jasonstatham', 'wujing', 'shuyasophiacai', '...",0,0,"[Jason Statham, Wu Jing, Shuya Sophia Cai, Ser..."
1,758323,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,5953.227,Screen Gems-2.0 Entertainment-Jesus & Mary-Wor...,2023-04-05,18000000.0,65675820.0,...,2.0,Average,0–10 yrs,thepopesexorcist,False,0,"['russellcrowe', 'danielzovatto', 'alexessoe',...",2,0,"[Russell Crowe, Daniel Zovatto, Alex Essoe, Fr..."
2,533535,Deadpool & Wolverine,Action-Comedy-Science Fiction,en,A listless Wade Wilson toils away in civilian ...,5410.496,Marvel Studios-Maximum Effort-21 Laps Entertai...,2024-07-24,200000000.0,1326387000.0,...,1.0,Good,0–10 yrs,deadpoolwolverine,True,0,"['ryanreynolds', 'hughjackman', 'emmacorrin', ...",2,0,"[Ryan Reynolds, Hugh Jackman, Emma Corrin, Mat..."
3,667538,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,en,When a new threat capable of destroying the en...,5409.104,Skydance-Paramount-di Bonaventura Pictures-Bay...,2023-06-06,200000000.0,407045500.0,...,2.0,Average,0–10 yrs,transformersriseofthebeasts,False,0,"['anthonyramos', 'dominiquefishback', 'lunalau...",2,0,"[Anthony Ramos, Dominique Fishback, Luna Laure..."
4,693134,Dune: Part Two,Science Fiction-Adventure,en,Follow the mythic journey of Paul Atreides as ...,4742.163,Legendary Pictures,2024-02-27,190000000.0,683813700.0,...,1.0,Excellent,0–10 yrs,duneparttwo,False,2,"['timothéechalamet', 'zendaya', 'rebeccafergus...",2,4,"[Timothée Chalamet, Zendaya, Rebecca Ferguson,..."


In [5]:
# Step 1: Explode actor list
exploded_df = movies_df.explode('actor_list')

# Step 2: Group by actor to build actor profiles
actor_profiles = exploded_df.groupby('actor_list').agg({
    'genres_x': lambda x: x.mode().iloc[0] if not x.mode().empty else x.iloc[0],
    'budget': 'mean',
    'revenue': 'mean',
    'vote_average': 'mean',
    'popularity': 'mean'
}).reset_index()




In [6]:
# Rename columns for clarity
actor_profiles.rename(columns={
    'actors': 'actor_name',
    'genres_x': 'top_genre',
    'budget': 'avg_budget',
    'revenue': 'avg_revenue',
    'vote_average': 'avg_rating',
    'popularity': 'avg_popularity'
}, inplace=True)

In [7]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Genre one-hot
genre_encoder = OneHotEncoder(handle_unknown='ignore')
genre_encoded = genre_encoder.fit_transform(actor_profiles[['top_genre']]).toarray()

# Normalize numeric features
scaler = StandardScaler()
numeric_data = scaler.fit_transform(actor_profiles[['avg_budget', 'avg_revenue', 'avg_rating', 'avg_popularity']])

# Final actor vectors
actor_vectors = np.hstack([genre_encoded, numeric_data])


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Example new movie
new_movie = {
    'genres': 'Action',
    'budget': 120_000_000,
    'vote_average': 7.2,
    'popularity': 85.0
}

# Create feature vector for the new movie
new_movie_genre_encoded = genre_encoder.transform([[new_movie['genres']]]).toarray()
new_movie_numeric = scaler.transform([[new_movie['budget'], 0, new_movie['vote_average'], new_movie['popularity']]])
new_movie_vector = np.hstack([new_movie_genre_encoded, new_movie_numeric])

# Similarity
similarity = cosine_similarity(actor_vectors, new_movie_vector)

# Get top 4 actors
top_indices = similarity.flatten().argsort()[-4:][::-1]
top_actors = actor_profiles.iloc[top_indices]

# Output
print("🔮 Recommended Actors:")
print(top_actors[['actor_list', 'top_genre', 'avg_budget', 'avg_revenue']])


🔮 Recommended Actors:
           actor_list                         top_genre    avg_budget  \
232221           Sing                            Action  5.833333e+07   
263930            jun  Action-Adventure-Science Fiction  2.748000e+08   
195487       Park Seo  Action-Adventure-Science Fiction  2.748000e+08   
221210  Saagar Shaikh  Action-Adventure-Science Fiction  2.748000e+08   

        avg_revenue  
232221    7109654.0  
263930          0.0  
195487          0.0  
221210          0.0  


In [18]:
top_actors

Unnamed: 0,actor_list,top_genre,avg_budget,avg_revenue,avg_rating,avg_popularity
232221,Sing,Action,58333330.0,7109654.0,6.166667,58.105667
102592,Iman Vellani,Action-Adventure-Science Fiction,274800000.0,0.0,8.1,257.235
263930,jun,Action-Adventure-Science Fiction,274800000.0,0.0,8.1,257.235
195487,Park Seo,Action-Adventure-Science Fiction,274800000.0,0.0,8.1,257.235


# Evaluation

In [9]:
def evaluate_recommender(movies_df, actor_profiles, encoder, scaler, top_k=4, sample_size=10):
    from sklearn.metrics.pairwise import cosine_similarity
    import numpy as np
    import ast

    # Ensure actor_list is a proper list
    def safe_parse_actor_list(x):
        try:
            return ast.literal_eval(x) if isinstance(x, str) else x
        except:
            return []

    movies_df['actor_list'] = movies_df['actor_list'].apply(safe_parse_actor_list)

    # Filter valid movies
    valid_movies = movies_df[
        movies_df['genres_x'].notnull() &
        movies_df['budget'].notnull() &
        movies_df['vote_average'].notnull() &
        movies_df['popularity'].notnull() &
        movies_df['actor_list'].apply(lambda x: isinstance(x, list) and len(x) > 0)
    ]

    if len(valid_movies) == 0:
        print("❌ No valid movies found.")
        return

    sampled_movies = valid_movies.sample(n=min(sample_size, len(valid_movies)), random_state=42)

    # Precompute actor vectors
    actor_vectors = np.hstack([
        encoder.transform(actor_profiles[['top_genre']]),
        scaler.transform(actor_profiles[['avg_budget', 'avg_revenue', 'avg_rating', 'avg_popularity']])
    ])

    hits, total_actors = [], []

    for idx, row in sampled_movies.iterrows():
        true_actors = row['actor_list']
        genre = row['genres_x']
        budget = row['budget']
        vote_avg = row['vote_average']
        popularity = row['popularity']

        try:
            genre_vec = encoder.transform([[genre]])
            numeric_vec = scaler.transform([[budget, 0, vote_avg, popularity]])
            movie_vector = np.hstack([genre_vec, numeric_vec])

            sim = cosine_similarity(actor_vectors, movie_vector)
            top_indices = sim.flatten().argsort()[-top_k:][::-1]
            recommended_actors = actor_profiles.iloc[top_indices]['actor_name'].tolist()

            hit_count = len(set(recommended_actors) & set(true_actors))
            hits.append(hit_count)
            total_actors.append(len(true_actors))
        except:
            continue

    if total_actors:
        hit_rate = sum(hits) / sum(total_actors)
        print(f"🎯 Hit Rate@{top_k} on {len(hits)} movies: {hit_rate:.2%}")
    else:
        print("⚠️ Evaluation skipped — no valid actor matches found.")


In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

# Fix for newer sklearn versions: use sparse_output instead of sparse
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
actor_genres_encoded = encoder.fit_transform(actor_profiles[['top_genre']])

scaler = StandardScaler()
numeric_features = ['avg_budget', 'avg_revenue', 'avg_rating', 'avg_popularity']
actor_numeric_scaled = scaler.fit_transform(actor_profiles[numeric_features])

# Actor vectors concatenation (optional for later use)
actor_vectors = np.hstack([actor_genres_encoded, actor_numeric_scaled])

evaluate_recommender(movies_df, actor_profiles, encoder, scaler, top_k=4, sample_size=10)


⚠️ Evaluation skipped — no valid actor matches found.


# Improved way

In [17]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# 1. Define numeric and genre columns
non_genre_cols = ['actor_list', 'top_genre', 'avg_budget', 'avg_revenue', 'avg_rating', 'avg_popularity']
genre_columns = [col for col in actor_profiles.columns if col not in non_genre_cols]

# 2. Scale numeric features
scaler = StandardScaler()
numeric_features = ['avg_budget', 'avg_rating', 'avg_popularity']
scaled_numeric = scaler.fit_transform(actor_profiles[numeric_features])

# 3. Extract genre features (already numeric distribution)
actor_genres = actor_profiles[genre_columns].values

# 4. Combine genre + numeric features into actor vectors
actor_vectors = np.hstack([actor_genres, scaled_numeric])

# 5. Define recommendation function
def recommend_actors_for_movie(new_movie, actor_profiles, actor_vectors, scaler, genre_columns, top_k=4):
    # Build new movie genre distribution vector (multi-hot normalized)
    genre_distribution = np.zeros(len(genre_columns))
    genres = [g.strip() for g in new_movie['genres'].split('-')]
    for g in genres:
        if g in genre_columns:
            genre_distribution[genre_columns.index(g)] = 1 / len(genres)

    # Scale numeric new movie features (without revenue)
    numeric_input = scaler.transform([[new_movie['budget'], new_movie['vote_average'], new_movie['popularity']]])

    # Combine new movie vector
    new_movie_vector = np.hstack([genre_distribution, numeric_input.flatten()])

    # Calculate cosine similarity
    sim = cosine_similarity(actor_vectors, new_movie_vector.reshape(1, -1)).flatten()

    # Get top actor indices
    top_indices = np.argsort(sim)[-top_k:][::-1]

    # Return recommended actors with similarity score
    recommendations = actor_profiles.iloc[top_indices][['actor_list', 'avg_budget', 'avg_rating', 'avg_popularity']].copy()
    recommendations['similarity'] = sim[top_indices]
    return recommendations

# 6. Test with example movie
new_movie = {
    'genres': 'Action-Comedy',
    'budget': 75000000,
    'vote_average': 7.0,
    'popularity': 75.0
}

result = recommend_actors_for_movie(new_movie, actor_profiles, actor_vectors, scaler, genre_columns)
print(result)


              actor_list    avg_budget  avg_rating  avg_popularity  similarity
56510    Daniella Pineda  5.142857e+07    6.339429       73.208714    0.965788
141639         Ken Jeong  4.646250e+07    6.247850       34.398900    0.961170
240474       T.J. Miller  5.205000e+07    6.322850       53.454500    0.959747
154680  Linda Cardellini  5.786136e+07    6.477818       43.997682    0.958676


# New way evaluation


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# --- Assume movies_df and exploded_df are your full datasets ---

# 1. Split your dataset into train and test by year (example)
train_df = movies_df[movies_df['release_year'] <= 2015]
test_df = movies_df[movies_df['release_year'] > 2015]

# 2. Build actor profiles from train only
train_exploded = exploded_df[exploded_df['id'].isin(train_df['id'])]

# Compute actor genre distribution on train
genre_counts = pd.crosstab(train_exploded['actor_list'], train_exploded['genres_x'])
genre_dist = genre_counts.div(genre_counts.sum(axis=1), axis=0).fillna(0)

# Aggregate actor numeric stats from train movies (example columns)
train_actor_stats = train_exploded.groupby('actor_list').agg({
    'budget': 'mean',
    'revenue': 'mean',
    'vote_average': 'mean',
    'popularity': 'mean'
}).rename(columns={
    'budget': 'avg_budget',
    'revenue': 'avg_revenue',
    'vote_average': 'avg_rating',
    'popularity': 'avg_popularity'
})

# Combine genre dist + numeric features for actor profiles
actor_profiles = train_actor_stats.join(genre_dist).fillna(0)

# Scale numeric features
scaler = StandardScaler()
numeric_cols = ['avg_budget', 'avg_rating', 'avg_popularity']
actor_profiles[numeric_cols] = scaler.fit_transform(actor_profiles[numeric_cols])

# Create actor feature vectors (genre + numeric)
genre_columns = genre_dist.columns.tolist()
actor_vectors = np.hstack([actor_profiles[genre_columns].values, actor_profiles[numeric_cols].values])
actor_names = actor_profiles.index.to_list()

# 3. Define recommendation function
def recommend_actors(new_movie, actor_vectors, actor_names, scaler, genre_columns, top_k=10):
    # Movie genre distribution vector (multi-hot normalized)
    genre_vector = np.zeros(len(genre_columns))
    genres = [g.strip() for g in new_movie['genres'].split('-')]
    for g in genres:
        if g in genre_columns:
            genre_vector[genre_columns.index(g)] = 1 / len(genres)

    # Numeric features vector scaled using train scaler
    numeric_input = scaler.transform([[new_movie['budget'], new_movie['vote_average'], new_movie['popularity']]])

    # Combine features
    movie_vector = np.hstack([genre_vector, numeric_input.flatten()])

    # Compute similarity
    sim = cosine_similarity(actor_vectors, movie_vector.reshape(1, -1)).flatten()

    # Top K actors indices
    top_idx = np.argsort(sim)[-top_k:][::-1]

    return [actor_names[i] for i in top_idx]

# 4. Evaluation metrics
def precision_recall_f1(recommended, actual, k):
    recommended_k = recommended[:k]
    actual_set = set(actual)
    relevant = set(recommended_k) & actual_set
    precision = len(relevant) / k
    recall = len(relevant) / len(actual) if actual else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1

# 5. Evaluate on test set
precisions, recalls, f1s = [], [], []

for _, movie in test_df.iterrows():
    # Prepare movie features for recommendation
    movie_features = {
        'genres': movie['genres_x'],  # or your column for genres
        'budget': movie['budget'],
        'vote_average': movie['vote_average'],
        'popularity': movie['popularity']
    }
    recommended_actors = recommend_actors(movie_features, actor_vectors, actor_names, scaler, genre_columns, top_k=10)
    actual_actors = movie['actor_list']  # Make sure this is a list of actor names

    # Evaluate metrics
    p, r, f = precision_recall_f1(recommended_actors, actual_actors, k=10)
    precisions.append(p)
    recalls.append(r)
    f1s.append(f)

print(f'Average Precision@10: {np.mean(precisions):.4f}')
print(f'Average Recall@10: {np.mean(recalls):.4f}')
print(f'Average F1@10: {np.mean(f1s):.4f}')
