In [1]:
!curl --silent -C - -o name.basics.tsv.gz https://datasets.imdbws.com/name.basics.tsv.gz
!curl --silent -C - -o title.principals.tsv.gz https://datasets.imdbws.com/title.principals.tsv.gz
!curl --silent -C - -o title.basics.tsv.gz https://datasets.imdbws.com/title.basics.tsv.gz
!ls -la *.tsv.gz

-rw-r--r-- 1 root root 251283363 Jul 25 05:52 name.basics.tsv.gz
-rw-r--r-- 1 root root 175083694 Jul 25 05:52 title.basics.tsv.gz
-rw-r--r-- 1 root root 445513559 Jul 25 05:52 title.principals.tsv.gz


In [2]:
import zlib
import io
import pandas as pd

def read_csv(path, **kwargs):
    """Read a gzip-compressed delimited text file with pandas.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.gz`` file on disk.
    **kwargs
        Forwarded unchanged to ``pandas.read_csv``.

    Returns
    -------
    Whatever ``pandas.read_csv`` returns: a DataFrame, or a lazy reader when
    ``chunksize``/``iterator`` is passed.

    Notes
    -----
    The file is inflated as a stream while pandas parses it, so the compressed
    bytes and the fully inflated bytes are never both held in memory.
    ``gzip`` also reads every member of a multi-member archive.
    """
    import gzip  # local import: only this helper needs it

    stream = gzip.open(path, 'rb')
    try:
        result = pd.read_csv(stream, **kwargs)
    except BaseException:
        stream.close()
        raise
    # A lazy reader (chunksize= / iterator=True) still needs the stream, so
    # only close it when parsing has already finished.
    if isinstance(result, pd.DataFrame):
        stream.close()
    return result

In [3]:
# Load the movies. This needs ~1.4GB RAM, 15s
# `\N` is the file's marker for a missing value.
# quoting=3 is csv.QUOTE_NONE: the dump is plain tab-separated text with no
# quoting convention. Under the pandas default, a title that begins with a
# literal `"` is read as an opening quote and swallows tabs and newlines up to
# the next `"`, silently merging rows.
movies = read_csv('title.basics.tsv.gz', sep='\t', na_values='\\N', quoting=3, dtype={
    'tconst': 'str',
    'titleType': 'str',
    'primaryTitle': 'str',
    'startYear': 'Int64',   # nullable integer, so a missing year stays <NA>
}, usecols=['tconst', 'titleType', 'primaryTitle', 'startYear']).set_index('tconst')

In [4]:
# Only consider movies, not TV series, etc. Shrinks data to ~5%
# Guarded so the cell can be re-run: once 'titleType' has been dropped the
# frame is already filtered, and filtering again would raise KeyError.
if 'titleType' in movies.columns:
    movies = movies.loc[movies['titleType'] == 'movie'].drop(columns='titleType')
movies.head()

Unnamed: 0_level_0,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000009,Miss Jerry,1894
tt0000147,The Corbett-Fitzsimmons Fight,1897
tt0000502,Bohemios,1905
tt0000574,The Story of the Kelly Gang,1906
tt0000591,The Prodigal Son,1907


In [5]:
# Load the cast of each film. 2.0 GB RAM. 30s
# `\N` is the file's marker for a missing value.
# quoting=3 is csv.QUOTE_NONE: the dump is plain tab-separated text, so quote
# handling is switched off. Otherwise a field that begins with a literal `"`
# would be read as a quoted field and could swallow the rows that follow it.
cast = read_csv('title.principals.tsv.gz', sep='\t', na_values='\\N', quoting=3, dtype={
    'tconst': 'str',
    'nconst': 'str',
    'category': 'str',
}, usecols=['tconst', 'nconst', 'category'])

In [6]:
# Keep acting credits only (no directors, composers, ...), which leaves about
# 40% of the rows, and of those only the credits whose title is still in `movies`.
cast = (
    cast
    .loc[lambda credits: credits['category'].isin(['actor', 'actress'])
         & credits['tconst'].isin(movies.index)]
    .reset_index(drop=True)   # renumber rows 0..n-1 after filtering
)
cast.head()

Unnamed: 0,tconst,nconst,category
0,tt0000009,nm0063086,actress
1,tt0000009,nm0183823,actor
2,tt0000009,nm1309758,actor
3,tt0000502,nm0215752,actor
4,tt0000502,nm0252720,actor


In [7]:
# Load 11m names with birth year. 16s
# `\N` is the file's marker for a missing value.
# quoting=3 is csv.QUOTE_NONE: the dump is plain tab-separated text, so quote
# handling is switched off. Otherwise a name that begins with a literal `"`
# would be read as a quoted field and could swallow the rows that follow it.
name = read_csv('name.basics.tsv.gz', sep='\t', na_values='\\N', quoting=3, dtype={
    'nconst': 'str',
    'primaryName': 'str',
    'birthYear': 'Int64'   # nullable integer, so a missing year stays <NA>
}, usecols=['nconst', 'primaryName', 'birthYear']).set_index('nconst')

In [8]:
# Keep only the people who appear in `cast`, i.e. have acted in a movie,
# and add `titles`: the number of rows each nconst has in `cast`.
# The counts are aligned to `name` by its nconst index.
name = (
    name
    .loc[name.index.isin(cast['nconst'])]
    .assign(titles=cast['nconst'].value_counts())
)
name.head()

Unnamed: 0_level_0,primaryName,birthYear,titles
nconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nm0000001,Fred Astaire,1899,35
nm0000002,Lauren Bacall,1924,37
nm0000003,Brigitte Bardot,1934,35
nm0000004,John Belushi,1949,7
nm0000005,Ingmar Bergman,1918,3


In [9]:
import networkx as nx

# One edge per row of `cast`, joining a title id (tconst) to a person id (nconst).
G = nx.from_pandas_edgelist(cast, source='tconst', target='nconst')

In [10]:
# Let's write a function that converts these IDs into names
def names(path):
    """Render a sequence of IMDb ids as one ' - '-separated string.

    Ids that start with 'tt' are looked up in ``movies['primaryTitle']``;
    every other id is looked up in ``name['primaryName']``.
    """
    labels = []
    for node in path:
        if node.startswith('tt'):
            labels.append(movies['primaryTitle'][node])
        else:
            labels.append(name['primaryName'][node])
    return ' - '.join(labels)

# ... and a function that finds the shortest chain of movies linking two actors, given their names
def path(source, target):
    """Describe the shortest chain of movies linking two actors.

    Parameters
    ----------
    source, target : str
        Actor names, matched exactly against ``name['primaryName']``.

    Returns
    -------
    str
        The ids returned by ``nx.shortest_path`` on ``G``, rendered by ``names``.

    Raises
    ------
    IndexError
        If either name is absent from ``name['primaryName']``. The exception
        type is unchanged; the message now says which name was missing.
        Errors raised by ``nx.shortest_path`` propagate unchanged.
    """
    def _actor_id(primary_name):
        # Index label (nconst) of the first row with this exact name. If
        # several rows share the name, the first one in `name` is used.
        matches = name[name['primaryName'] == primary_name].index
        if len(matches) == 0:
            raise IndexError(f'no actor named {primary_name!r}')
        return matches[0]

    source_id = _actor_id(source)
    target_id = _actor_id(target)
    return names(nx.shortest_path(G, source_id, target_id))

In [11]:

# Example: the chain of shared movies between two actors.
path(source='Dudley Clements', target='Amélie Joktane')

'Dudley Clements - Too Many Wives - Gene Lockhart - The House on 92nd Street - Signe Hasso - A Double Life - Shelley Winters - Mimì Bluette... fiore del mio giardino - Hella Petri - One Does Not Bury Sunday - Philippe Mory - Les tam tams se sont tus - Amélie Joktane'