This notebook scrapes TMDB metadata for the movie titles in the Netflix Prize dataset.

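Please prepare your own TMDB API credentials: the notebook reads the API key from `tmdb_api_key.txt` and the API read access token from `tmdb_api_token.txt`.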

In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')

import os
import pandas as pd
from tqdm import tqdm
import os
import requests
import json
import numpy as np
import urllib.parse

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Netflix titles

In [None]:
# The titles file has no header row, so the column names are supplied here.
names = ['item', 'year', 'name']
titles_path = os.path.join("movie_titles.csv")
df_netflix_titles = pd.read_csv(
    titles_path,
    names=names,
    header=None,
)
df_netflix_titles.head()

Unnamed: 0,item,year,name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [None]:
# Total number of titles, followed by the number of rows that remain once every
# row whose key (name, then name + year) occurs more than once is dropped.
print(len(df_netflix_titles))
for key_columns in (['name'], ['name', 'year']):
    print(len(df_netflix_titles.drop_duplicates(subset=key_columns, keep=False)))

17770
16994
17752


In [None]:
# Every row whose (name, year) pair appears more than once, ordered by name.
is_repeated = df_netflix_titles.duplicated(subset=['name', 'year'], keep=False)
df_duplicated = df_netflix_titles[is_repeated].sort_values(by=['name'])
df_duplicated.head()

Unnamed: 0,item,year,name
17398,17399,1959,Elvira's Horror Classics
3830,3831,1959,Elvira's Horror Classics
9826,9827,2004,Frankenstein
5202,5203,2004,Frankenstein
3217,3218,1990,Hamlet


# Common tools

In [None]:
# Read the TMDB credentials from local text files.
# The contents are stripped because text files usually end with a newline,
# which would otherwise become part of the Authorization header value.
with open("tmdb_api_key.txt") as f:
    api_key = f.read().strip()

with open("tmdb_api_token.txt") as f:
    token = f.read().strip()

# Request headers shared by every TMDB call in this notebook.
headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {token}"
}

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the ``difflib.SequenceMatcher`` similarity ratio of ``a`` and ``b``.

    The ratio is a float in [0, 1]; 1.0 means the two sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
class MyException(Exception):
    """Custom exception type for errors raised deliberately by this notebook."""

In [None]:
def get_imdb_id_by_tmdb_id(tmdb_id, timeout=10):
    """Look up the IMDb id that TMDB associates with a TMDB movie id.

    Args:
        tmdb_id: TMDB movie id.
        timeout: Seconds to wait for the HTTP request before giving up, so
            that a stalled connection cannot hang a long scraping run.

    Returns:
        The value of the ``imdb_id`` field of the TMDB response.
        NOTE(review): presumably ``None`` when TMDB has no IMDb id for the
        movie -- confirm against the TMDB API documentation.

    Raises:
        requests.HTTPError: if TMDB answers with a 4xx/5xx status.
        KeyError: if the response payload has no ``imdb_id`` field.
    """
    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/external_ids"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with the HTTP status instead of an opaque KeyError further down.
    response.raise_for_status()
    return response.json()['imdb_id']

def get_tmdb_id_by_imdb_id(imdb_id, timeout=10):
    """Look up the TMDB movie id for an IMDb id via the TMDB ``find`` endpoint.

    Args:
        imdb_id: IMDb identifier to search for.
        timeout: Seconds to wait for the HTTP request before giving up, so
            that a stalled connection cannot hang a long scraping run.

    Returns:
        The ``id`` of the first entry in ``movie_results``, or ``None`` when
        TMDB returns no movie for this IMDb id.

    Raises:
        requests.HTTPError: if TMDB answers with a 4xx/5xx status.
        KeyError: if the response payload has no ``movie_results`` field.
    """
    url = f"https://api.themoviedb.org/3/find/{imdb_id}?external_source=imdb_id"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with the HTTP status instead of an opaque KeyError further down.
    response.raise_for_status()
    movie_results = response.json()['movie_results']
    return movie_results[0]['id'] if movie_results else None

In [None]:
# Registry of every cast member seen so far; one row per TMDB person id.
df_credits_tmdb = pd.DataFrame(columns=['id', 'name', 'original_name'])

def get_credits_by_tmdb_id(tmdb_id):
    """Fetch the cast of a TMDB movie and record unseen cast members.

    Each cast member whose id is not yet present in the module-level
    ``df_credits_tmdb`` frame is appended to it as a new row
    (``id``, ``name``, ``original_name``).

    Args:
        tmdb_id: TMDB movie id.

    Returns:
        List of the cast members' TMDB ids, in the order of the response's
        ``cast`` list (duplicates in the response are kept).

    Raises:
        KeyError: if the response payload has no ``cast`` field.
    """
    url = "https://api.themoviedb.org/3/movie/{}/credits?language=en-US".format(tmdb_id)
    response = requests.get(url, headers=headers)
    response = json.loads(response.text)

    # Membership must be tested against the *values* of the 'id' column:
    # `x in series` looks at the index labels, and the frame has no
    # 'tmdb_cast_id' column at all.
    known_ids = set(df_credits_tmdb['id'])

    cast_ids = []
    for cast in response['cast']:
        cast_ids.append(cast['id'])
        if cast['id'] not in known_ids:
            known_ids.add(cast['id'])
            df_credits_tmdb.loc[len(df_credits_tmdb.index)] = [cast['id'], cast['name'], cast['original_name']]
    return cast_ids

# Setup TMDB

In [None]:
def get_tmdb_id(name, year, max_year=2006, min_score=0.3, timeout=10):
    """Search TMDB for a movie title and return the scored candidate matches.

    Every search result with a usable id, title and release year is scored as

        score = similar(name, title) / 1.2 ** abs(year - release_year)

    (titles are compared case-insensitively). Candidates released after
    ``max_year`` or scoring below ``min_score`` are discarded. The IMDb id
    of each remaining candidate is looked up with ``get_imdb_id_by_tmdb_id``.

    Args:
        name: Movie title to search for.
        year: Release year of the movie being searched for.
        max_year: Latest release year a candidate may have (default 2006,
            the value previously hard-coded).
        min_score: Minimum score a candidate needs to be kept (default 0.3,
            the value previously hard-coded).
        timeout: Seconds to wait for the search request before giving up.

    Returns:
        Tuple ``(ids, titles, years, scores, imdb_ids)`` of parallel
        sequences, ordered by descending score. With zero or one candidate
        these are plain lists; with more candidates they are numpy arrays
        (the result of ``np.take``).
        NOTE(review): the list/array difference is kept on purpose -- the
        CSV parsing cell in this notebook appears to rely on the resulting
        text format; confirm before normalising it.

    Raises:
        requests.HTTPError: if TMDB answers with a 4xx/5xx status.
        KeyError: if the response payload has no ``results`` field.
    """
    ids, titles, years, scores, imdb_ids = [], [], [], [], []

    query = urllib.parse.quote(name)
    url = f"https://api.themoviedb.org/3/search/movie?query={query}&include_adult=false&language=en-US&page=1"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with the HTTP status instead of an opaque KeyError on 'results'.
    response.raise_for_status()
    candidates = response.json()['results']

    for candidate in candidates:
        try:
            tmdb_id = int(candidate['id'])
            title = candidate['title']
            release_year = int(candidate['release_date'].split('-')[0])
        except (KeyError, TypeError, ValueError, AttributeError):
            # Missing field, null value or empty release date: skip this
            # candidate. Only these data errors are swallowed, so that
            # KeyboardInterrupt and real bugs still propagate.
            continue

        if release_year > max_year:
            continue

        year_gap = abs(year - release_year)
        similarity = similar(name.lower(), title.lower())
        # Each year of difference divides the title similarity by 1.2.
        score = similarity / (1.2 ** year_gap)
        if score < min_score:
            continue

        imdb_id = get_imdb_id_by_tmdb_id(tmdb_id)

        ids.append(tmdb_id)
        titles.append(title)
        years.append(release_year)
        scores.append(score)
        imdb_ids.append(imdb_id)

    if len(ids) > 1:
        # Reorder all five sequences by descending score.
        idx = np.flip(np.argsort(scores))

        ids = np.take(ids, idx)
        titles = np.take(titles, idx)
        years = np.take(years, idx)
        scores = np.take(scores, idx)
        imdb_ids = np.take(imdb_ids, idx)

    return (ids, titles, years, scores, imdb_ids)

# Start scraping

In [None]:
# One row per Netflix title; the tmdb_* / imdb_ids columns hold the parallel
# candidate sequences returned by get_tmdb_id.
df_netflix_tmdb = pd.DataFrame(columns=['netflix_id', 'tmdb_ids', 'netflix_title', 'tmdb_titles', 'netflix_year', 'tmdb_years', 'tmdb_scores', 'imdb_ids'])

# Item ids of titles that share both name and year with another title.
# A set of the column *values* is required here: `item in series` tests the
# Series index labels, which made the loop skip the wrong rows.
duplicated_items = set(df_duplicated['item'])

for _, row in tqdm(df_netflix_titles.iterrows(), total=df_netflix_titles.shape[0]):
    item, year, name = row['item'], row['year'], row['name']

    if item in duplicated_items: # skip duplicated movies with same release year
        print('duplicated movies')
        continue

    try:
        ids, titles, years, scores, imdb_ids = get_tmdb_id(name, year)
        df_netflix_tmdb.loc[len(df_netflix_tmdb.index)] = [item, ids, name, titles, year, years, scores, imdb_ids]
    except Exception as error:
        # Best effort: report the failure and carry on with the next title.
        print(error)

  4%|▍         | 694/17770 [03:28<1:26:51,  3.28it/s]

duplicated movies


  8%|▊         | 1505/17770 [09:15<2:01:45,  2.23it/s]

duplicated movies


 18%|█▊        | 3216/17770 [22:30<1:55:15,  2.10it/s]

duplicated movies


 22%|██▏       | 3829/17770 [27:23<1:53:44,  2.04it/s]

duplicated movies


 27%|██▋       | 4870/17770 [35:22<1:26:17,  2.49it/s]

duplicated movies


 29%|██▉       | 5201/17770 [37:56<1:29:49,  2.33it/s]

duplicated movies


 32%|███▏      | 5606/17770 [40:54<1:38:46,  2.05it/s]

duplicated movies


 32%|███▏      | 5710/17770 [41:42<1:40:34,  2.00it/s]

duplicated movies


 34%|███▍      | 6061/17770 [44:24<1:26:58,  2.24it/s]

duplicated movies


 38%|███▊      | 6838/17770 [50:26<1:31:12,  2.00it/s]

duplicated movies


 41%|████▏     | 7349/17770 [54:23<1:19:06,  2.20it/s]

duplicated movies


 54%|█████▍    | 9627/17770 [1:11:41<58:58,  2.30it/s]

duplicated movies


 55%|█████▍    | 9706/17770 [1:12:16<1:16:37,  1.75it/s]

duplicated movies


 55%|█████▌    | 9825/17770 [1:13:15<1:00:12,  2.20it/s]

duplicated movies


 61%|██████    | 10787/17770 [1:20:31<48:05,  2.42it/s]

duplicated movies


 65%|██████▍   | 11528/17770 [1:26:14<44:04,  2.36it/s]

duplicated movies


 66%|██████▌   | 11738/17770 [1:27:53<39:04,  2.57it/s]

duplicated movies


 98%|█████████▊| 17399/17770 [2:12:51<01:30,  4.09it/s]

duplicated movies


100%|██████████| 17770/17770 [2:15:49<00:00,  2.18it/s]


In [None]:
# Display the scraped result table (one row per Netflix title that was processed).
df_netflix_tmdb

Unnamed: 0,netflix_id,tmdb_ids,netflix_title,tmdb_titles,netflix_year,tmdb_years,tmdb_scores,imdb_ids
0,1,[58691],Dinosaur Planet,[Bikini Girls on Dinosaur Planet],2003,[2005],[0.4528985507246377],[tt0447606]
1,2,[],Isle of Man TT 2004 Review,[],2004,[],[],[]
2,3,"[17139, 1151430]",Character,"[Character, CHARACTER]",1997,"[1997, 2002]","[1.0, 0.401877572016461]","[tt0119448, tt3362034]"
3,4,[274766],Paula Abdul's Get Up & Dance,[Paula Abdul's Get Up & Dance],1994,[1994],[1.0],[tt0276981]
4,5,[33209],The Rise and Fall of ECW,[WWE: The Rise + Fall of ECW],2004,[2004],[0.8235294117647058],[tt0440751]
...,...,...,...,...,...,...,...,...
17747,17766,[175484],Where the Wild Things Are and Other Maurice Se...,[Where the Wild Things Are... and other Mauric...,2002,[2001],[0.8123249299719888],[]
17748,17767,[409774],Fidel Castro: American Experience,[Fidel Castro],2004,[2005],[0.4444444444444445],[tt0815934]
17749,17768,[20004],Epoch,[Epoch],2000,[2001],[0.8333333333333334],[tt0233657]
17750,17769,[52036],The Company,[The Company],2003,[2003],[1.0],[tt0335013]


# Export data
Pickle is preferred because it keeps the list-valued columns intact. CSV stores those lists as strings; to read and parse the CSV correctly, use the parsing cell at the end of this notebook.

In [None]:
# save as pickle
# Writes the whole result frame to 'netflix_tmdb.pickle' in the working directory.

df_netflix_tmdb.to_pickle('netflix_tmdb.pickle')

In [None]:
# save as csv
# Writes the result frame (including its index) to 'netflix_tmdb.csv';
# the list-valued cells are written as their string form.

df_netflix_tmdb.to_csv('netflix_tmdb.csv')

In [None]:
# read csv
# Plain read-back: the first CSV column is used as the index and no converters
# are applied. Note that this rebinds df_netflix_tmdb to the frame read from disk.

df_netflix_tmdb = pd.read_csv('netflix_tmdb.csv', index_col=0)
df_netflix_tmdb

In [None]:
# read and parse CSV
#
# The list-valued columns were written as text: either a Python list repr
# ("[58691]") or a numpy array repr ("[17139 1151430]"). numpy separates
# elements with whitespace only, pads numbers for alignment and wraps long
# arrays over several lines, so the parsers below split on any whitespace
# rather than on a single space.

# Internal separator for string lists; a NUL cannot occur in a title.
_SEP = "\x00"


def _parse_int_list(text):
    """Parse the text form of a list/array of non-negative integers."""
    tokens = text.strip("[]").replace(",", " ").split()
    return [int(tok) for tok in tokens if tok.isdigit()]


def _parse_float_list(text):
    """Parse the text form of a list/array of floats."""
    tokens = text.strip("[]").replace(",", " ").split()
    return [float(tok) for tok in tokens]


def _parse_str_list(text):
    """Parse the text form of a list/array of quoted strings.

    Returns an empty list for "[]". Elements are split wherever a closing
    quote is followed by a space (or a line wrap) and an opening quote,
    whichever quote characters are used on either side.
    """
    body = text.strip("[]")
    if not body.strip():
        return []
    for closing in ("'", '"'):
        for gap in (" ", "\n "):
            for opening in ("'", '"'):
                body = body.replace(closing + gap + opening, _SEP)
    return body.strip("'\"").split(_SEP)


converters = {
    "netflix_id": int,
    "tmdb_ids": _parse_int_list,
    "tmdb_titles": _parse_str_list,
    "netflix_year": int,
    "tmdb_years": _parse_int_list,
    "tmdb_scores": _parse_float_list,
    "imdb_ids": _parse_str_list,
}

df_netflix_tmdb = pd.read_csv("netflix_tmdb.csv", index_col=0, converters=converters)

# Rows without any candidate (no scores) get empty id lists.
df_netflix_tmdb["imdb_ids"] = df_netflix_tmdb.apply(lambda x: [] if len(x['tmdb_scores']) == 0 else x["imdb_ids"], axis=1)
df_netflix_tmdb["tmdb_ids"] = df_netflix_tmdb.apply(lambda x: [] if len(x['tmdb_scores']) == 0 else x["tmdb_ids"], axis=1)

df_netflix_tmdb