In [7]:
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
# Load the IMDb export. The file is read line by line and every non-blank line
# is parsed as its own JSON document, so `imdb_data` ends up as a list with one
# parsed object per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the platform's
#   locale default.
# - blank lines (e.g. a trailing newline at end of file) are skipped instead of
#   raising json.JSONDecodeError.
with open('imdb_movies_2000to2022.prolific.json', 'r', encoding='utf-8') as f:
    imdb_data = [json.loads(line) for line in f if line.strip()]

In [9]:
# Flatten the IMDb entries into one row per (movie, actor) pair.
#
# An entry contributes rows only when it carries a usable average rating:
# entries without a "rating" mapping, without an "avg" value, or whose "avg"
# is the literal placeholder string "\N" are skipped.
# NOTE(review): "\N" is presumably the dataset's missing-value marker - confirm.
imdb_rows = []
for entry in imdb_data:
    rating_info = entry.get("rating")
    if not isinstance(rating_info, dict):
        continue  # missing or null rating block

    avg_raw = rating_info.get("avg")
    if avg_raw is None or avg_raw == "\\N":
        continue  # no usable average rating

    rating = float(avg_raw)

    # The year may be absent or not a plain digit string; store None in that
    # case rather than raising. str() also lets an integer year through.
    year_raw = str(entry.get("year", ""))
    year = int(year_raw) if year_raw.isdigit() else None

    # `or []` covers both a missing "actors" key and an explicit null value.
    for actor in entry.get("actors") or []:
        imdb_rows.append({
            # assumes each actor record is a sequence whose second item is the
            # display name - TODO confirm against the export schema
            "actor_name": actor[1],
            "rating": rating,
            "year": year
        })

# Explicit columns keep the frame's schema stable even when no row qualified,
# so the downstream groupby on 'actor_name' does not fail on an empty frame.
df_imdb = pd.DataFrame(imdb_rows, columns=["actor_name", "rating", "year"])

In [22]:
# Per-actor summary of the IMDb rows: mean rating, number of rated rows,
# earliest and latest year seen, and the span between those two years.
imdb_stats = (
    df_imdb
    .groupby('actor_name')
    .agg(**{
        'avg_rating': ('rating', 'mean'),
        'movie_count': ('rating', 'count'),
        'first_year': ('year', 'min'),
        'last_year': ('year', 'max'),
    })
    .reset_index()
    # years_active is last_year minus first_year, so an actor whose rows all
    # share one year gets 0.
    .assign(years_active=lambda stats: stats['last_year'] - stats['first_year'])
)

In [24]:
# Load the two TMDB CSV files (movie metadata and per-movie credits).
# The directory is named once so both reads stay in sync; the credits path
# previously began with a stray double slash ('//Users/...').
# NOTE(review): this is a machine-specific absolute path - point TMDB_DATA_DIR
# at the local copy of the data before running on another machine.
TMDB_DATA_DIR = '/Users/superjoetendo/Documents/actor-pairing-analysis'

tmdb_movies = pd.read_csv(f'{TMDB_DATA_DIR}/tmdb_5000_movies.csv')
tmdb_credits = pd.read_csv(f'{TMDB_DATA_DIR}/tmdb_5000_credits.csv')

In [25]:
# Decode the JSON-encoded `cast` column in place and flatten it into one row
# per (movie, cast member).
#
# The column is overwritten with the parsed lists, so the decode step only
# touches values that are still strings: re-running this cell is then a no-op
# instead of a TypeError from calling json.loads on an already-parsed list.
tmdb_credits['cast'] = tmdb_credits['cast'].apply(
    lambda cast: json.loads(cast) if isinstance(cast, str) else cast
)

# Pair each movie_id with every member of its cast list. Rows whose cast is not
# a list (e.g. a missing value) contribute no rows rather than raising.
tmdb_cast_rows = [
    {
        'movie_id': movie_id,
        'actor_name': actor['name']
    }
    for movie_id, cast in zip(tmdb_credits['movie_id'], tmdb_credits['cast'])
    if isinstance(cast, list)
    for actor in cast
]

# Explicit columns keep the schema stable even when there are no cast rows,
# so the later merge on 'movie_id' still finds its key.
df_cast = pd.DataFrame(tmdb_cast_rows, columns=['movie_id', 'actor_name'])

In [26]:
# Keep only the movie columns needed downstream, then attach them to every
# cast row. A left join keeps every cast row; rows whose movie_id has no match
# in the movie table get missing values in the movie columns.
tmdb_movies_subset = tmdb_movies.loc[:, ['id', 'popularity', 'revenue']]
df_tmdb = pd.merge(
    df_cast,
    tmdb_movies_subset,
    how='left',
    left_on='movie_id',
    right_on='id',
)

In [27]:
# Per-actor summary of the TMDB rows: mean popularity and summed revenue over
# all of the actor's cast rows.
tmdb_stats = (
    df_tmdb
    .groupby('actor_name')
    .agg(
        avg_popularity=pd.NamedAgg(column='popularity', aggfunc='mean'),
        total_revenue=pd.NamedAgg(column='revenue', aggfunc='sum'),
    )
    .reset_index()
)

In [28]:
# Combine the IMDb and TMDB per-actor summaries. The inner join on actor_name
# keeps only names present in both tables; actors are matched purely by name.
# Actors with fewer than MIN_MOVIE_COUNT rated rows (the `movie_count` column
# of imdb_stats) are then dropped. The threshold is named so it can be tuned
# in one place.
MIN_MOVIE_COUNT = 3

merged = (
    imdb_stats
    .merge(tmdb_stats, on='actor_name', how='inner')
    .loc[lambda stats: stats['movie_count'] >= MIN_MOVIE_COUNT]
    # .copy() so later column assignments on `merged` do not act on a slice.
    .copy()
)