# IMDb Scraper

Workflow: 5 

Goal: Start with TMDB ids and get all data from IMDB.

Result: The file ```imdb_movie.csv``` is created.

In [1]:
import os
import time
import pandas as PD

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# can't reload script in a different folder

import sys
sys.path.append('../models')
import imdb_scraper as MB 

## Scrape One Movie

In [4]:
scraper = MB.ImdbScraper()
scraper.open_imdb()

<selenium.webdriver.firefox.webdriver.WebDriver (session="7e208c60-03f1-d345-b0b9-857b3997ef58")>


In [5]:
title = 'Kaaliyan'
year = '2017'
# scraper.search_any(title)

In [6]:
# scraper.goto_movie_result(title, year)

In [7]:
# movie_dx = scraper.get_movie_data()
# movie_dx

In [8]:
movie_dx = scraper.get_movie_by_id('tt3170504')
movie_dx 

{'imdb_id': 'tt3170504',
 'title': 'Last Love Lost',
 'original_title': None,
 'year': '2015',
 'rating': 'Not Rated',
 'companies': None,
 'country': None,
 'language': None,
 'duration': 83,
 'directors': 'Jahmar Hill',
 'writers': 'Jahmar Hill',
 'actors': 'Jahmar Hill, Toni Belafonte, Talli Clemons',
 'genres': 'Drama',
 'synopsis': 'Antonio (Jahmar Hill) is awakened from a coma he was in for a year and a half after being shot by his father while trying to break up a domestic dispute between him and his mother. When ',
 'budget': None,
 'gross_us': None,
 'gross_worldwide': None,
 'score': '7.5',
 'votes': '15'}

In [9]:
scraper.close()

## Scrape Based on TMDB

In [10]:
INDEX_FILE = '../data/moviedb_movie.csv'
index_df = PD.read_csv(INDEX_FILE, dtype={'title': str, 'year': str})
index_df.head(3)
index_df.info()

Unnamed: 0,title,original_title,year,companies,country,language,run_time,crew,cast,poster,genres,collection,synopsis,budget,gross,score,votes,tmdb_id,imdb_id
0,1,1,2013,"Exclusive Media, Flat-Out Films",United States of America,English,112.0,"Paul Crowder, Mark Monroe, Michael Shevloff","Niki Lauda, Michael Schumacher, Lewis Hamilton",/4uIPXX8DjTsCzUAdtMKHTpojYLq.jpg,Documentary,,Set in the golden era of Grand Prix Racing '1'...,,,7.4,59,217316,tt2518788
1,"10,000 BC","10,000 BC",2008,"Centropolis Entertainment, Legendary Entertain...",United States of America,English,109.0,"Roland Emmerich, Sarah Bradshaw, Tom Karnowski","Steven Strait, Camilla Belle, Cliff Curtis",/rnGR3EHkL4ryhQd50XBrtRrV8nq.jpg,"Adventure, Action, Drama, Fantasy",,A prehistoric epic that follows a young mammot...,105000000.0,266000000.0,5.3,1766,7840,tt0443649
2,1000 Rupee Note,Ek Hazarachi Note,2016,,India,Marathi (Marāṭhī),89.0,"Shrihari Sathe, Shrikant Bojewar",Sandeep Pathak,/pNNxwXAReV4kh7TCZGqBrl9I72v.jpg,Drama,,Poor Parobudhi receives a thousand rupee note ...,,,6.9,7,318654,tt2937158


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17668 entries, 0 to 17667
Data columns (total 19 columns):
title             17668 non-null object
original_title    17668 non-null object
year              17668 non-null object
companies         11589 non-null object
country           13571 non-null object
language          17667 non-null object
run_time          16507 non-null float64
crew              16720 non-null object
cast              16002 non-null object
poster            17296 non-null object
genres            16109 non-null object
collection        1189 non-null object
synopsis          17537 non-null object
budget            2863 non-null float64
gross             1908 non-null float64
score             17668 non-null float64
votes             17668 non-null int64
tmdb_id           17668 non-null int64
imdb_id           17456 non-null object
dtypes: float64(4), int64(2), object(13)
memory usage: 2.6+ MB


In [11]:
# create jobs

index_1_df = index_df[0:5349]
index_2_df = index_df[5350:9999]
index_3_df = index_df[10000:13999]
index_4_df = index_df[14000:]

index_1_df.shape
index_2_df.shape
index_3_df.shape
index_4_df.shape

(5349, 19)

(4649, 19)

(3999, 19)

(3668, 19)

In [12]:
# investigate errors

error_titles = ['13 Times Evil', 'There Is a New World Somewhere', 'The Scarlet and the Black', 'The Sleeping Tiger', 
                'The Untold Story', 'Zombiegeddon', 'To Love The Soul Of A Woman', 'Tree Man']
index_df.loc[index_df['title'].isin(error_titles)][['title', 'year', 'imdb_id']]

Unnamed: 0,title,year,imdb_id
56,13 Times Evil,2016,
15415,There Is a New World Somewhere,2015,tt3281502
15524,The Scarlet and the Black,1983,tt0086251
15634,The Sleeping Tiger,1954,tt0047505
15910,The Untold Story,1993,tt0103743
16288,To Love The Soul Of A Woman,2017,
16452,Tree Man,2016,tt3246684
17642,Zombiegeddon,2003,tt0316946


In [13]:
#index_df.tail(20)['title']

In [14]:
scraper = MB.ImdbScraper()
scraper.open_imdb()
movie_ls = []
t0 = time.time()

for idx, row in index_2_df.iterrows():
    title = row['title']
    year = row['year']
    imdb_id = row['imdb_id']
    movie_dx = None

    try:
        movie_dx = scraper.get_movie_by_id(imdb_id)
    except Exception as ex:
        print('')
        print(f'Error 1: {title} {year} {imdb_id}')
        print(ex)

    if not movie_dx:
        try:
            scraper.open_imdb()
            scraper.search_any(title)
            found = scraper.goto_movie_result(title, year)
            if found:
                print(f'found by title: {title} {year} {imdb_id}')
                movie_dx = scraper.get_movie_data()
            else:
                print(f'NOT found by title: {title} {year}')
        except Exception as ex:
            print(f'Error 2: {title} {year} {imdb_id}')
            print(ex)

    if movie_dx:
        movie_ls.append(movie_dx)

scraper.close()
t1 = time.time()
print(f'movies: {len(movie_ls)}')
print(f'time: {(t1-t0)/60/60:.2f} hrs')

<selenium.webdriver.firefox.webdriver.WebDriver (session="25c7f19c-8eba-9e44-9065-67be402151d7")>

Error 1: Food on the Go 2017 tt7321504
get_movie_data(): title error: tt7321504
NOT found by title: Food on the Go 2017

Error 1: Frank Sinatra: A Man and His Music + Ella + Jobim 1967 tt0404055
get_movie_data(): title error: tt0404055
NOT found by title: Frank Sinatra: A Man and His Music + Ella + Jobim 1967

Error 1: Frank Zappa - Freak Jazz, Movie Madness & Another Mothers 2014 nan
Message: Unable to locate element: .title_wrapper

NOT found by title: Frank Zappa - Freak Jazz, Movie Madness & Another Mothers 2014

Error 1: Free Energy - The Race to Zero Point 2008 nan
Message: Unable to locate element: .title_wrapper

NOT found by title: Free Energy - The Race to Zero Point 2008

Error 1: Free Rein: Valentine's Day 2019 tt8382012
get_movie_data(): title error: tt8382012
NOT found by title: Free Rein: Valentine's Day 2019

Error 1: Freezer's Campaign 2016 nan
Message: Unable to locate e

found by title: Kindness Matters 2018 nan

Error 1: King of Scots 2017 nan
Message: Unable to locate element: .title_wrapper

NOT found by title: King of Scots 2017

Error 1: Koursk: Un sous-marin en eaux troubles 2004 nan
Message: Unable to locate element: .title_wrapper

found by title: Koursk: Un sous-marin en eaux troubles 2004 nan

Error 1: Kundanapu Bomma 2016 nan
Message: Unable to locate element: .title_wrapper

found by title: Kundanapu Bomma 2016 nan

Error 1: Kuppivala 2017 nan
Message: Unable to locate element: .title_wrapper

NOT found by title: Kuppivala 2017

Error 1: Leftovers 2017 nan
Message: Unable to locate element: .title_wrapper

found by title: Leftovers 2017 nan

Error 1: LEGO Jurassic World: The Indominus Escape 2016 tt6101862
get_movie_data(): title error: tt6101862
NOT found by title: LEGO Jurassic World: The Indominus Escape 2016

Error 1: Less is More: How to be Happy with Nothing 2013 nan
Message: Unable to locate element: .title_wrapper

Error 2: Less is 

In [15]:
# projected time

print(f'projected time: {(t1-t0)/60/60*18450/index_df.shape[0]:.2f} hrs')

projected time: 8.86 hrs


In [16]:
movie_df = PD.DataFrame(movie_ls)
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4590 entries, 0 to 4589
Data columns (total 19 columns):
imdb_id            4590 non-null object
title              4590 non-null object
original_title     580 non-null object
year               4590 non-null object
rating             3416 non-null object
companies          4227 non-null object
country            4564 non-null object
language           4532 non-null object
duration           4335 non-null float64
directors          4563 non-null object
writers            4160 non-null object
actors             4263 non-null object
genres             4587 non-null object
synopsis           4590 non-null object
budget             1271 non-null object
gross_us           890 non-null object
gross_worldwide    1428 non-null object
score              4562 non-null object
votes              4562 non-null object
dtypes: float64(1), object(18)
memory usage: 681.5+ KB


In [17]:
save_path = r'../data/imdb_mov_2.csv'
movie_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'1.68 mb'

## Combine Job Files

In [18]:
#stop = 0 0
imdb_1_df = PD.read_csv('../data/imdb_mov_1.csv')
imdb_2_df = PD.read_csv('../data/imdb_mov_2.csv')
imdb_3_df = PD.read_csv('../data/imdb_mov_3.csv')
imdb_4_df = PD.read_csv('../data/imdb_mov_4.csv')

imdb_1_df.shape
imdb_2_df.shape
imdb_3_df.shape
imdb_4_df.shape

(5358, 19)

(4590, 19)

(317, 19)

(317, 19)

In [19]:
movie_df = PD.concat([imdb_1_df, imdb_2_df, imdb_3_df, imdb_4_df])
movie_df.head(3)
movie_df.info()

Unnamed: 0,imdb_id,title,original_title,year,rating,companies,country,language,duration,directors,writers,actors,genres,synopsis,budget,gross_us,gross_worldwide,score,votes
0,tt2518788,1,,2013,Not Rated,"Diamond Docs, Exclusive Media Group",USA,English,112.0,Paul Crowder,Mark Monroe,"Michael Fassbender, Niki Lauda, Lewis Hamilton","Documentary, History, Sport",Set in the golden era of Grand Prix Racing '1'...,,,,8.0,3535.0
1,tt0443649,"10,000 BC",,2008,PG-13,"Warner Bros., Legendary Entertainment",USA,English,109.0,Roland Emmerich,"Roland Emmerich, Harald Kloser","Camilla Belle, Steven Strait, Marco Khan","Action, Adventure, Drama","In the prehistoric past, D'Leh is a mammoth hu...",105000000.0,94784201.0,269784201.0,5.1,121431.0
2,tt2937158,Ek Hazarachi Note,,2014,Not Rated,Infinitum Productions,India,Marathi,89.0,Shrihari Sathe,Shrikant Bojewar,"Devendra Gaikwad, Usha Naik, Pooja Nayak",Drama,"In this exploration of money and conscience, a...",,2404.0,2404.0,7.3,309.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10582 entries, 0 to 316
Data columns (total 19 columns):
imdb_id            10582 non-null object
title              10582 non-null object
original_title     1312 non-null object
year               10582 non-null int64
rating             7970 non-null object
companies          9764 non-null object
country            10527 non-null object
language           10446 non-null object
duration           10039 non-null float64
directors          10522 non-null object
writers            9619 non-null object
actors             9868 non-null object
genres             10575 non-null object
synopsis           10526 non-null object
budget             3065 non-null object
gross_us           1950 non-null float64
gross_worldwide    3178 non-null object
score              10514 non-null float64
votes              10514 non-null float64
dtypes: float64(4), int64(1), object(14)
memory usage: 1.6+ MB


In [20]:
# remove duplicates

movie_df = movie_df.drop_duplicates()
movie_df = movie_df.drop_duplicates(subset=['imdb_id'])
movie_df.shape

(9955, 19)

In [21]:
# fix up column types

movie_df['duration'] = movie_df['duration'].astype('Int64')
movie_df['votes'] = movie_df['votes'].astype('Int64')

In [22]:
save_path = r'../data/imdb_movie.csv'
movie_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'3.64 mb'