In [2]:
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the IMDb export, read here as JSON Lines (one JSON document per line).
# The encoding is pinned so decoding does not depend on the platform default
# (assumes the export is UTF-8, the usual JSON encoding -- TODO confirm).
# Whitespace-only lines are skipped because json.loads would raise on them.
with open('imdb_movies_2000to2022.prolific.json', 'r', encoding='utf-8') as f:
    imdb_data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Flatten the IMDb records into one row per (rated movie, credited actor).
imdb_rows = []
for entry in imdb_data:
    # An entry is usable only when it carries an average rating; a missing
    # "rating"/"avg" field, a None value, or the "\\N" marker all mean
    # "no rating" and the whole entry is skipped.
    rating_info = entry.get("rating")
    if not isinstance(rating_info, dict):
        continue
    avg_rating = rating_info.get("avg")
    if avg_rating is None or avg_rating == "\\N":
        continue
    rating = float(avg_rating)

    # Go through str() so that a missing key or an already-numeric year does
    # not raise; anything that is not purely digits is recorded as None.
    year_text = str(entry.get("year", ""))
    year = int(year_text) if year_text.isdigit() else None

    # `or []` also covers an explicit null "actors" field.
    for actor in entry.get("actors") or []:
        imdb_rows.append({
            # presumably each actor is an (id, name) pair -- TODO confirm schema
            "actor_name": actor[1],
            "rating": rating,
            "year": year
        })

# Name the columns explicitly so the frame keeps its schema even when no rows
# were collected (a column-less frame would break the later groupby).
df_imdb = pd.DataFrame(imdb_rows, columns=["actor_name", "rating", "year"])

In [None]:
# Per-actor IMDb profile: mean rating, number of rating rows, and the earliest
# and latest year in which the actor appears.
imdb_stats = (
    df_imdb
    .groupby('actor_name')
    .agg(**{
        'avg_rating': pd.NamedAgg(column='rating', aggfunc='mean'),
        'movie_count': pd.NamedAgg(column='rating', aggfunc='count'),
        'first_year': pd.NamedAgg(column='year', aggfunc='min'),
        'last_year': pd.NamedAgg(column='year', aggfunc='max'),
    })
    .reset_index()
)

# Span between first and last appearance (0 when both fall in the same year).
imdb_stats['years_active'] = imdb_stats['last_year'].sub(imdb_stats['first_year'])

In [None]:
# Read the two TMDB CSV exports (movie metadata first, then credits).
# NOTE(review): the absolute /content/ prefix looks like a Colab mount --
# confirm before running in another environment.
tmdb_movies, tmdb_credits = map(
    pd.read_csv,
    (
        '/content/tmdb_5000_movies.csv',
        '/content/tmdb_5000_credits.csv',
    ),
)

In [None]:
# Decode the JSON-encoded `cast` column in place. Only values that are still
# strings are parsed, so re-running this cell after the column has already
# been converted to lists no longer raises inside json.loads.
tmdb_credits['cast'] = tmdb_credits['cast'].apply(
    lambda raw_cast: json.loads(raw_cast) if isinstance(raw_cast, str) else raw_cast
)

# Explode to one row per (movie, cast member). Zipping the two columns avoids
# the per-row Series construction that iterrows() performs.
tmdb_cast_rows = [
    {'movie_id': movie_id, 'actor_name': member['name']}
    for movie_id, cast_members in zip(tmdb_credits['movie_id'], tmdb_credits['cast'])
    for member in cast_members
]

# Explicit columns keep the schema intact even if no cast rows were produced,
# so the downstream merge on `movie_id` still finds its key.
df_cast = pd.DataFrame(tmdb_cast_rows, columns=['movie_id', 'actor_name'])

In [None]:
# Keep only the movie columns needed downstream, then attach them to every
# cast row. A left join keeps cast rows even when no movie record matches.
tmdb_movies_subset = tmdb_movies.loc[:, ['id', 'popularity', 'revenue']]
df_tmdb = pd.merge(
    df_cast,
    tmdb_movies_subset,
    how='left',
    left_on='movie_id',
    right_on='id',
)

In [None]:
# Per-actor TMDB profile: mean popularity and summed revenue over all of the
# actor's cast rows.
tmdb_stats = (
    df_tmdb
    .groupby('actor_name')
    .agg(**{
        'avg_popularity': pd.NamedAgg(column='popularity', aggfunc='mean'),
        'total_revenue': pd.NamedAgg(column='revenue', aggfunc='sum'),
    })
    .reset_index()
)

In [None]:
# Join the IMDb and TMDB per-actor tables on the exact actor name (inner join:
# only actors present in both), then keep actors with at least 3 rated rows.
# The trailing copy() detaches the result so later column assignments do not
# write into a slice of the join output.
merged = (
    pd.merge(imdb_stats, tmdb_stats, how='inner', on='actor_name')
    .loc[lambda joined: joined['movie_count'] >= 3]
    .copy()
)

In [None]:
# Select the four clustering inputs and standardise each column so that no
# single feature dominates purely because of its numeric range.
feature_columns = ['avg_rating', 'movie_count', 'avg_popularity', 'total_revenue']
features = merged.loc[:, feature_columns]

scaler = StandardScaler()
scaler.fit(features)
X_scaled = scaler.transform(features)

In [None]:
# Partition the standardised actor features into 4 clusters.
# n_init is set explicitly: the library default differs between scikit-learn
# releases (10 in older versions, 'auto' in newer ones), so leaving it implicit
# makes the cluster assignments depend on the installed version and emits a
# FutureWarning on some releases. random_state fixes the centroid seeding.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
merged['cluster'] = kmeans.fit_predict(X_scaled)

In [None]:
# Human-readable names for the cluster ids 0..3, in id order.
# NOTE(review): the ids come straight from KMeans and nothing in this cell
# ties a name to a cluster's actual characteristics -- verify the pairing
# against the per-cluster summary whenever the data or model settings change.
cluster_labels = dict(enumerate((
    "Consistent Performers",
    "Up-and-Comers",
    "Veterans",
    "Critically Acclaimed",
)))
merged['cluster_name'] = merged['cluster'].map(cluster_labels)

In [None]:
# One violin plot per feature, showing its distribution within each named
# cluster; quartile lines are drawn inside every violin.
plot_features = ['avg_rating', 'movie_count', 'avg_popularity', 'total_revenue']
for feature in plot_features:
    # "avg_rating" -> "Avg Rating", reused for the title and the y-axis label.
    pretty_name = feature.replace("_", " ").title()

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.violinplot(
        data=merged,
        x='cluster_name',
        y=feature,
        palette='Set2',
        inner='quartile',
        ax=ax,
    )
    ax.set_title(f'{pretty_name} by Cluster')
    ax.set_xlabel('Actor Cluster')
    ax.set_ylabel(pretty_name)
    ax.grid(True, linestyle='--', alpha=0.4)
    fig.tight_layout()
    plt.show()

In [None]:
# Per-cluster overview: the mean of each clustering feature plus the number of
# actors assigned to the cluster, indexed by cluster name.
cluster_summary = merged.groupby('cluster_name').agg(
    avg_rating=('avg_rating', 'mean'),
    movie_count=('movie_count', 'mean'),
    avg_popularity=('avg_popularity', 'mean'),
    total_revenue=('total_revenue', 'mean'),
    actor_count=('actor_name', 'count'),
)

display(cluster_summary)