This notebook scrapes IMDb info for the Netflix Prize dataset.

In [None]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')

import ast
import json
import os
import re
import threading
import urllib.parse

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

# Load Netflix titles

In [None]:
# Read the Netflix title list. The file is read with header=None, so the
# column names are supplied explicitly.
names = ['item', 'year', 'name']
titles_path = os.path.join("movie_titles.csv")
df_netflix_titles = pd.read_csv(titles_path, names=names, header=None)
df_netflix_titles.head()

Unnamed: 0,item,year,name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [None]:
# Print three row counts: all titles, titles whose name occurs only once, and
# titles whose (name, year) pair occurs only once. keep=False removes every
# member of a duplicate group rather than keeping one representative.
print(len(df_netflix_titles))
for key_columns in (['name'], ['name', 'year']):
    print(len(df_netflix_titles.drop_duplicates(subset=key_columns, keep=False)))

17770
16994
17752


In [None]:
# Rows that share both name and year with at least one other row, ordered by
# name so that the members of each duplicate group sit next to each other.
is_repeated = df_netflix_titles.duplicated(subset=['name', 'year'], keep=False)
df_duplicated = df_netflix_titles.loc[is_repeated].sort_values(by='name')
df_duplicated.head()

Unnamed: 0,item,year,name
17398,17399,1959,Elvira's Horror Classics
3830,3831,1959,Elvira's Horror Classics
9826,9827,2004,Frankenstein
5202,5203,2004,Frankenstein
3217,3218,1990,Hamlet


# Common tools

In [None]:
# Load the TMDB credentials from local text files and build the request
# headers shared by every TMDB call in this notebook.
# The values are stripped because text files usually end with a newline; left
# in place, that newline would become part of the Authorization header value.
with open("tmdb_api_key.txt") as f:
    api_key = f.read().strip()

with open("tmdb_api_token.txt") as f:
    token = f.read().strip()

headers = {
    "accept": "application/json",
    "Authorization": f"Bearer {token}"
}

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio between sequences ``a`` and ``b``.

    The value is ``SequenceMatcher.ratio()``: 1.0 for identical inputs and
    0.0 when nothing matches. No junk heuristic function is supplied.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
class MyException(Exception):
    """Notebook-specific exception type; adds nothing on top of ``Exception``."""
    pass

In [None]:
def get_imdb_id_by_tmdb_id(tmdb_id, timeout=30):
    """Return the IMDb id that TMDB lists for the movie ``tmdb_id``.

    Queries the TMDB ``/movie/{id}/external_ids`` endpoint using the shared
    ``headers``.

    Args:
        tmdb_id: TMDB movie id, interpolated into the URL.
        timeout: Seconds to wait for the HTTP call, so that a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        The value of the ``imdb_id`` field of the JSON response.

    Raises:
        requests.HTTPError: if TMDB answers with an error status.
        requests.Timeout: if the request exceeds ``timeout``.
        KeyError: if the response body has no ``imdb_id`` field.
    """
    url = f"https://api.themoviedb.org/3/movie/{tmdb_id}/external_ids"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with the real HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    return response.json()['imdb_id']

def get_tmdb_id_by_imdb_id(imdb_id, timeout=30):
    """Return the TMDB movie id for ``imdb_id``, or None if TMDB has no movie for it.

    Queries the TMDB ``/find/{id}`` endpoint with ``external_source=imdb_id``
    and looks only at the ``movie_results`` list of the response.

    Args:
        imdb_id: IMDb id, interpolated into the URL (callers in this notebook
            pass the ``tt``-prefixed form).
        timeout: Seconds to wait for the HTTP call, so that a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        The ``id`` of the first movie result, or None when the list is empty.

    Raises:
        requests.HTTPError: if TMDB answers with an error status.
        requests.Timeout: if the request exceeds ``timeout``.
        KeyError: if the response body has no ``movie_results`` field.
    """
    url = f"https://api.themoviedb.org/3/find/{imdb_id}?external_source=imdb_id"
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail with the real HTTP error instead of a KeyError on an error payload.
    response.raise_for_status()
    movie_results = response.json()['movie_results']
    return movie_results[0]['id'] if movie_results else None

In [None]:
# Lookup table of every cast member seen so far, one row per TMDB person id.
df_credits_tmdb = pd.DataFrame(columns=['id', 'name', 'original_name'])

def get_credits_by_tmdb_id(tmdb_id, timeout=30):
    """Return the cast ids of a TMDB movie and record new cast members.

    Queries the TMDB ``/movie/{id}/credits`` endpoint. Every cast member whose
    id is not yet present in the module-level ``df_credits_tmdb`` table is
    appended to it as ``[id, name, original_name]``.

    Args:
        tmdb_id: TMDB movie id, interpolated into the URL.
        timeout: Seconds to wait for the HTTP call.

    Returns:
        List of the ``id`` values of all entries in the response's ``cast``
        list, in response order.

    Raises:
        requests.HTTPError: if TMDB answers with an error status.
        requests.Timeout: if the request exceeds ``timeout``.
        KeyError: if the response body has no ``cast`` field.
    """
    url = "https://api.themoviedb.org/3/movie/{}/credits?language=en-US".format(tmdb_id)
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    cast_ids = []
    for cast in response.json()['cast']:
        cast_ids.append(cast['id'])
        # The table's id column is named 'id'. Membership is tested against
        # .values because `in` on a Series checks index labels, not the data.
        if cast['id'] not in df_credits_tmdb['id'].values:
            df_credits_tmdb.loc[len(df_credits_tmdb.index)] = [cast['id'], cast['name'], cast['original_name']]
    return cast_ids

# Setup IMDB

In [None]:
!pip install cinemagoer
from imdb import Cinemagoer

# create an instance of the Cinemagoer class
ia = Cinemagoer()

Collecting cinemagoer
  Downloading cinemagoer-2023.5.1-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/297.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.2/297.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cinemagoer
Successfully installed cinemagoer-2023.5.1


In [None]:
def get_imdb_id(name, year, max_year=2006, min_score=0.3, year_penalty=1.2, max_trials=10):
    '''Search IMDb for ``name`` and return the candidate matches, best first.

    The search is attempted up to ``max_trials`` times; each failure prints
    the trial number and the exception type. If every attempt fails the
    search result is treated as empty.

    A search hit is kept when it exposes an id, a title and a year, the year
    is not None and not later than ``max_year``, and its score is at least
    ``min_score``. The score is::

        similar(name.lower(), title.lower()) / year_penalty ** abs(year - hit_year)

    Each kept hit is also looked up on TMDB via ``get_tmdb_id_by_imdb_id``.

    Args:
        name: Title to search for.
        year: Release year used to penalise hits from other years.
        max_year: Latest hit year accepted.
        min_score: Minimum score a hit needs to be kept.
        year_penalty: Base of the per-year score penalty.
        max_trials: Maximum number of search attempts.

    Returns:
        Tuple ``(ids, titles, years, scores, tmdb_ids)`` of parallel
        sequences ordered by descending score. ``ids`` are ``tt``-prefixed
        IMDb ids. With fewer than two matches the sequences are plain lists;
        otherwise they are numpy arrays produced by ``np.take``.
    '''
    ids, titles, years, scores, tmdb_ids = [], [], [], [], []

    res = []
    for trial in range(max_trials):
        try:
            res = ia.search_movie(name)
            break
        except Exception as error:
            res = []
            print(trial, type(error).__name__)

    for movie in res:
        try:
            movie_id = movie.getID()
            tt = movie['title']
            yr = movie['year']
        except Exception:
            # Hits lacking an id, title or year cannot be scored; skip them.
            # (Exception rather than a bare except, so KeyboardInterrupt and
            # SystemExit are not swallowed.)
            continue

        if yr is None or yr > max_year:
            continue

        diff = abs(year - yr)
        dist = similar(name.lower(), tt.lower())
        # Title similarity, discounted geometrically per year of mismatch.
        score = dist / (year_penalty ** diff)
        if score < min_score:
            continue

        imdb_id = f'tt{movie_id}'
        tmdb_id = get_tmdb_id_by_imdb_id(imdb_id)

        ids.append(imdb_id)
        titles.append(tt)
        years.append(yr)
        scores.append(score)
        tmdb_ids.append(tmdb_id)

    if len(ids) > 1:
        # Descending score order, applied identically to every sequence.
        idx = np.flip(np.argsort(scores))

        ids = np.take(ids, idx)
        titles = np.take(titles, idx)
        years = np.take(years, idx)
        scores = np.take(scores, idx)
        tmdb_ids = np.take(tmdb_ids, idx)

    return (ids, titles, years, scores, tmdb_ids)

In [None]:
# Show the first title row to confirm the item / year / name fields.
df_netflix_titles.iloc[0]

item                  1
year               2003
name    Dinosaur Planet
Name: 0, dtype: object

In [None]:
# Guards appends to df_netflix_imdb. This function is run through thread_map,
# and unsynchronised `.loc[len(...)] = row` appends from several threads can
# collide, failing with "Shape of passed values ..." and dropping the row.
_df_netflix_imdb_lock = threading.Lock()

def match_netflix_imdb(i):
    """Match one Netflix title against IMDb and append the result row.

    Reads row ``i`` of ``df_netflix_titles``, obtains candidate matches from
    ``get_imdb_id`` and appends
    ``[item, ids, name, titles, year, years, scores, tmdb_ids]`` to the
    module-level ``df_netflix_imdb``. Titles listed in ``df_duplicated``
    (same name and release year as another title) are not searched and get
    empty candidate lists.

    Any exception is printed together with the item id and swallowed, so one
    failing title does not abort the whole run; that title then has no row
    in ``df_netflix_imdb``.

    Args:
        i: Positional row index into ``df_netflix_titles`` (used with ``.iloc``).

    Returns:
        The Netflix item id of the row, or None if the row could not be read.
    """
    item = None  # keeps the handler and the return valid if the row lookup fails
    try:
        row = df_netflix_titles.iloc[i]

        item, year, name = row['item'], row['year'], row['name']

        # skip duplicated movies with same release year
        # (.values: `in` on a Series tests index labels, not the stored ids)
        if item in df_duplicated['item'].values:
            ids, titles, years, scores, tmdb_ids = [], [], [], [], []
        else:
            ids, titles, years, scores, tmdb_ids = get_imdb_id(name, year)
        with _df_netflix_imdb_lock:
            df_netflix_imdb.loc[len(df_netflix_imdb.index)] = [item, ids, name, titles, year, years, scores, tmdb_ids]
    except Exception as error:
        print(item, error)
    return item # return something as a signal

# Start scraping

In [None]:
# Empty result table; match_netflix_imdb appends one row per Netflix title.
netflix_imdb_columns = [
    'netflix_id',
    'imdb_ids',
    'netflix_title',
    'imdb_titles',
    'netflix_year',
    'imdb_years',
    'imdb_scores',
    'tmdb_ids',
]
df_netflix_imdb = pd.DataFrame(columns=netflix_imdb_columns)

In [None]:
from tqdm.contrib.concurrent import thread_map

# Run match_netflix_imdb once per row of the title table. The row count is
# taken from the table itself rather than hard-coded, so the cell still
# covers every title if the input file changes.
r = thread_map(match_netflix_imdb, list(range(len(df_netflix_titles))))

  0%|          | 0/17770 [00:00<?, ?it/s]

2023-12-27 15:07:27,850 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Simpatico&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': TimeoutError('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 519, in open
    response = self._open(req, data)
  File "/usr/lib/python3.10/urllib/request.py", line 536, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 1391, in https_open
    return self.do_open(http.client.HTTPSConnectio

0 IMDbDataAccessError


2023-12-27 15:11:04,151 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Happy+End&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 502: 'Bad Gateway'>},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/usr/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/usr/lib/python3.10/urllib/request.py", line 563, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 643, in htt

0 IMDbDataAccessError


2023-12-27 15:28:04,534 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Road+to+Perdition&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 502: 'Bad Gateway'>},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/usr/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/usr/lib/python3.10/urllib/request.py", line 563, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 643

0 IMDbDataAccessError


2023-12-27 15:39:19,172 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Boomerang&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 502: 'Bad Gateway'>},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/usr/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/usr/lib/python3.10/urllib/request.py", line 563, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 643, in htt

0 IMDbDataAccessError
5824 Shape of passed values is (5806, 8), indices imply (5805, 8)


2023-12-27 15:52:53,474 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Five+Easy+Pieces&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': TimeoutError('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 519, in open
    response = self._open(req, data)
  File "/usr/lib/python3.10/urllib/request.py", line 536, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 1391, in https_open
    return self.do_open(http.client.HTTPSCo

0 IMDbDataAccessError


2023-12-27 16:06:34,672 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=O&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/usr/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/usr/lib/python3.10/urllib/request.py", line 563, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 643, in h

0 IMDbDataAccessError


2023-12-27 16:06:45,979 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=O&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 525, in open
    response = meth(req, response)
  File "/usr/lib/python3.10/urllib/request.py", line 634, in http_response
    response = self.parent.error(
  File "/usr/lib/python3.10/urllib/request.py", line 563, in error
    return self._call_chain(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 643, in h

1 IMDbDataAccessError
9514 Shape of passed values is (9486, 8), indices imply (9485, 8)
15554 Shape of passed values is (15488, 8), indices imply (15487, 8)


2023-12-27 17:17:21,861 CRITICAL [imdbpy] /usr/local/lib/python3.10/dist-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=Dark+Blue+World&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': TimeoutError('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/imdb/parser/http/__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "/usr/lib/python3.10/urllib/request.py", line 519, in open
    response = self._open(req, data)
  File "/usr/lib/python3.10/urllib/request.py", line 536, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/usr/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/usr/lib/python3.10/urllib/request.py", line 1391, in https_open
    return self.do_open(http.client.HTTPSCon

0 IMDbDataAccessError
17485 Shape of passed values is (17414, 8), indices imply (17413, 8)


# Fix missing movies during scraping
Repeat several times until there are no missing movies.

In [None]:
# Fix missing movies during scraping (1/2). Repeat several times until there
# are no missing movies.

# A title is missing when its item id has no row in df_netflix_imdb.
is_missing = ~df_netflix_titles['item'].isin(df_netflix_imdb['netflix_id'])
print(sum(is_missing))

missing_list = df_netflix_titles['item'][is_missing]

73

In [None]:
# Fix missing movies during scraping (2/2). Repeat several times until there
# are no missing movies.

from tqdm.contrib.concurrent import thread_map

# Re-run the matcher for the rows of df_netflix_titles that are still missing.
r = thread_map(match_netflix_imdb, list(missing_list.index))

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Display the collected matches.
df_netflix_imdb

Unnamed: 0,netflix_id,imdb_ids,netflix_title,imdb_titles,netflix_year,imdb_years,imdb_scores,tmdb_ids
0,1,['tt0389605' 'tt0447606' 'tt11623870'],Dinosaur Planet,['Dinosaur Planet' 'Bikini Girls on Dinosaur P...,2003,[2003 2005 2003],[1. 0.45289855 0.43478261],[None 58691 None]
1,2,[],Isle of Man TT 2004 Review,[],2004,[],[],[]
2,3,['tt0119448' 'tt0447653'],Character,['Character' 'Kids for Character'],1997,[1997 1996],[1. 0.55555556],[17139 None]
3,4,['tt0276981' 'tt6066714'],Paula Abdul's Get Up & Dance,['Get Up and Dance!' 'Sesame Street: Get Up an...,1994,[1994 1997],[0.57777778 0.33349027],[274766 None]
4,5,['tt0440751' 'tt0864944'],The Rise and Fall of ECW,['The Rise & Fall of ECW' 'Ancient Rome: The R...,2004,[2004 2006],[0.91304348 0.44934641],[33209 None]
...,...,...,...,...,...,...,...,...
17765,16180,[tt12411860],Goosebumps: Deep Trouble,[Goosebumps Audiobook - Deep Trouble],1998,[1996],[0.5414312617702448],[None]
17766,17367,[tt0331525],The Intended,[The Intended],2002,[2002],[1.0],[205481]
17767,17341,[tt0086637],Zelig,[Zelig],1983,[1983],[1.0],[11030]
17768,17485,"[tt0410953, tt0391406]",Prom Queen,"[Prom Queen: The Marc Hall Story, The Prom Queen]",2004,"[2004, 2000]","[0.4878048780487805, 0.40187757201646096]","[40056, 762535]"


# Sort table and reset index after adding missing movies

In [None]:
# reset_index(inplace=True) returns None, so assigning its result would replace
# the table with None. Use the returning form instead; drop=True discards the
# old index rather than adding it as an extra column.
df_netflix_imdb = df_netflix_imdb.reset_index(drop=True)

In [None]:
# Order rows by Netflix id, then renumber the index so that it follows the
# sorted order instead of the order in which rows were scraped.
df_netflix_imdb = df_netflix_imdb.sort_values(by=['netflix_id']).reset_index(drop=True)

# Export data
Pickle is preferred. CSV saves the lists as strings; to read and parse the CSV correctly, use the script appended at the end.

In [None]:
# save as pickle
# Writes the result table to netflix_imdb.pickle in the working directory.

df_netflix_imdb.to_pickle('netflix_imdb.pickle')

In [None]:
# save as csv
# Writes the result table to netflix_imdb.csv in the working directory.
# The list-valued columns are stored as their string form.

df_netflix_imdb.to_csv('netflix_imdb.csv')

In [None]:
# read csv
# Plain read with the first CSV column as index and no converters, so the
# list-valued columns come back as strings.

df_netflix_imdb = pd.read_csv('netflix_imdb.csv', index_col=0)
df_netflix_imdb

Unnamed: 0,netflix_id,imdb_ids,netflix_title,imdb_titles,netflix_year,imdb_years,imdb_scores,tmdb_ids
0,1,['tt0389605' 'tt0447606' 'tt11623870'],Dinosaur Planet,['Dinosaur Planet' 'Bikini Girls on Dinosaur P...,2003,[2003 2005 2003],[1. 0.45289855 0.43478261],[None 58691 None]
1,2,[],Isle of Man TT 2004 Review,[],2004,[],[],[]
2,3,['tt0119448' 'tt0447653'],Character,['Character' 'Kids for Character'],1997,[1997 1996],[1. 0.55555556],[17139 None]
3,4,['tt0276981' 'tt6066714'],Paula Abdul's Get Up & Dance,['Get Up and Dance!' 'Sesame Street: Get Up an...,1994,[1994 1997],[0.57777778 0.33349027],[274766 None]
4,5,['tt0440751' 'tt0864944'],The Rise and Fall of ECW,['The Rise & Fall of ECW' 'Ancient Rome: The R...,2004,[2004 2006],[0.91304348 0.44934641],[33209 None]
...,...,...,...,...,...,...,...,...
17765,17766,[],Where the Wild Things Are and Other Maurice Se...,[],2002,[],[],[]
17766,17767,[],Fidel Castro: American Experience,[],2004,[],[],[]
17767,17768,['tt0233657' 'tt0282466'],Epoch,['Epoch' 'Epoch of Lotus'],2000,[2001 2000],[0.83333333 0.52631579],[ 20004 162490]
17768,17769,['tt0335013' 'tt0385267' 'tt0995011'],The Company,['The Company' 'In Good Company' 'The Electric...,2003,[2003 2004 2006],[1. 0.51282051 0.41069295],[52036 1901 None]


In [None]:
# read and parse CSV
#
# to_csv stored every list/array column as text, for example
# "['tt0119448' 'tt0447653']", "[1997 1996]" or "[None 58691 None]".
# The converters below turn that text back into Python lists.

# One single- or double-quoted Python string literal, escapes included.
_QUOTED_LITERAL = re.compile(r"'(?:[^'\\]|\\.)*'" r'|"(?:[^"\\]|\\.)*"')


def _split_items(cell):
    """Return the items of a bracketed sequence written as text.

    Items may be separated by any run of whitespace (including the padding
    and line breaks numpy inserts) and/or commas, so "[ 20004 162490]" gives
    two items and "[]" gives none.
    """
    return cell.strip().strip("[]").replace(",", " ").split()


def _parse_str_list(cell):
    """Parse a sequence of quoted strings; handles mixed ' and " quoting."""
    return [ast.literal_eval(literal) for literal in _QUOTED_LITERAL.findall(cell)]


def _parse_optional_int_list(cell):
    """Parse a sequence of integers; any non-digit item (e.g. None) becomes None."""
    return [int(item) if item.isdigit() else None for item in _split_items(cell)]


def _parse_float_list(cell):
    """Parse a sequence of floats."""
    return [float(item) for item in _split_items(cell)]


converters = {
    "netflix_id": int,
    "imdb_ids": _parse_str_list,
    "imdb_titles": _parse_str_list,
    "netflix_year": int,
    "imdb_years": _parse_optional_int_list,
    "imdb_scores": _parse_float_list,
    "tmdb_ids": _parse_optional_int_list,
}

df_netflix_imdb = pd.read_csv("netflix_imdb.csv", index_col=0, converters=converters)

# Every parser returns [] for an empty "[]" cell, so rows without matches need
# no further patching and all list columns of a row have the same length.

df_netflix_imdb