In [2]:
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from slugify import slugify

In [3]:
# Display matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Load the TMDB credits table; the `cast` and `crew` columns hold JSON strings.
credits = pd.read_csv('./data/tmdb_5000_credits.csv')
# Preview the first rows only instead of rendering the whole frame.
credits.head(10)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
5,559,Spider-Man 3,"[{""cast_id"": 30, ""character"": ""Peter Parker / ...","[{""credit_id"": ""52fe4252c3a36847f80151a5"", ""de..."
6,38757,Tangled,"[{""cast_id"": 34, ""character"": ""Flynn Rider (vo...","[{""credit_id"": ""52fe46db9251416c91062101"", ""de..."
7,99861,Avengers: Age of Ultron,"[{""cast_id"": 76, ""character"": ""Tony Stark / Ir...","[{""credit_id"": ""55d5f7d4c3a3683e7e0016eb"", ""de..."
8,767,Harry Potter and the Half-Blood Prince,"[{""cast_id"": 3, ""character"": ""Harry Potter"", ""...","[{""credit_id"": ""52fe4273c3a36847f801fab1"", ""de..."
9,209112,Batman v Superman: Dawn of Justice,"[{""cast_id"": 18, ""character"": ""Bruce Wayne / B...","[{""credit_id"": ""553bf23692514135c8002886"", ""de..."


In [5]:
# Get movies we have transcripts for (titles kinda off).
# os.listdir returns entries in arbitrary order, so sort them to keep
# re-runs of the notebook reproducible.
transcripts = sorted(os.listdir("./data/scripts"))

def parse_title(title):
    """Turn a transcript file name into a human-readable movie title.

    The file extension is dropped, hyphens become spaces, and a trailing
    ", The" is moved to the front ("Abyss,-The.txt" -> "The Abyss").

    Args:
        title: File name of a transcript, e.g. "Big-Fish.txt".

    Returns:
        The cleaned-up title string.
    """
    # splitext removes whatever extension is present (or none at all) rather
    # than blindly cutting the last four characters.
    stem, _ext = os.path.splitext(title)
    title = stem.replace("-", " ")
    if title.endswith(", The"):
        title = "The " + title[:-len(", The")]
    return title

# Every title in the credits table, and a readable title per transcript file
cred_movies = credits["title"].tolist()
tran_movies = list(map(parse_title, transcripts))
print(len(tran_movies), "transcript movies")
print(len(cred_movies), "kaggle movies")

1122 transcript movies
4803 kaggle movies


In [6]:
# Match movies: a transcript title matches a Kaggle title when its
# normalized form is a prefix of the normalized Kaggle title.
def _normalize_title(title):
    """Lowercase ``title`` and strip every non-word character."""
    return re.sub(r'\W+', '', title).lower()

# Normalize each Kaggle title once instead of once per transcript title.
clean_cred_movies = [
    (cred_movie, _normalize_title(cred_movie)) for cred_movie in cred_movies
]

valid_movies = set()
valid_tran_movies = set()
for tran_movie in tran_movies:
    clean_tran_movie = _normalize_title(tran_movie)
    if not clean_tran_movie:
        # An empty key is a prefix of every title and would match them all.
        continue
    for cred_movie, clean_cred_movie in clean_cred_movies:
        if clean_cred_movie.startswith(clean_tran_movie):
            valid_movies.add(cred_movie)
            valid_tran_movies.add(tran_movie)
print(len(valid_movies))

996


In [7]:
# Print missing movies really inefficiently
# for movie in sorted(tran_movies):
#     if movie not in valid_tran_movies:
#         print(movie)

In [17]:
# Aggregate casts from all movies
actor_names = set()
valid_credits = credits[credits["title"].isin(valid_movies)]
for _, credit in valid_credits.iterrows():
    # The cast cell is a JSON array of objects; read it from the row in hand.
    cast = json.loads(credit["cast"])
    names = [member["name"] for member in cast if "name" in member]
    if not names:
        print("Error getting cast for", credit["title"])
        # Skip this movie; never fall through with the previous movie's
        # names (or with `names` undefined on the first iteration).
        continue
    actor_names.update(names)
# NOTE(review): distinct names may slugify to the same string, so this list
# can presumably contain duplicates -- confirm whether that is intended.
actors = sorted(map(slugify, actor_names))
print(len(actors))

Error getting cast for Gory Gory Hallelujah
19327


In [18]:
# Persist the slugified actor names, one per line
with open("./data/actors.txt", "w+") as out_file:
    out_file.writelines(actor + "\n" for actor in actors)

In [13]:
# Export the title and cast columns of the matched movies
movie_casts = valid_credits[["title", "cast"]]
movie_casts.to_csv("./data/movies.csv", encoding='utf-8')