# TMDB Scraper

Workflow: 4  

Goal: Start with the list of Reelgood movie titles and scrape the data for each movie (the cells below use the Reelgood scraper on reelgood.com rather than TMDB).   

Result: A CSV file with the scraped subsample is created (the save cell at the end writes `reelgood_test.csv`).

In [1]:
import os
import time
import pandas as PD

In [2]:
# Display the value of every bare expression in a cell, not only the last one.
# Cells below rely on this when they end with several expressions in a row
# (e.g. `.head()` followed by `.shape`).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# The scraper module lives in ../models, outside this notebook's folder, so that
# folder has to be on the import path before the module can be imported.

import sys

MODELS_DIR = '../models'
if MODELS_DIR not in sys.path:  # re-running this cell must not stack duplicate entries
    sys.path.append(MODELS_DIR)
import moviedb_scraper as RS

## Scrape One Movie

In [4]:
# Create a scraper and log in before any movie data is requested.
# NOTE(review): judging by the recorded output, log_in starts a Selenium Firefox
# session and opens the Reelgood login URL — confirm in moviedb_scraper.
scraper = RS.ReelgoodScraper()
scraper.log_in()

<selenium.webdriver.firefox.webdriver.WebDriver (session="0930a45e-a474-2440-8afd-60c535d3e6c2")>
https://reelgood.com/?login=true


In [5]:
# Start the timer, then scrape one movie; the argument has the same
# title-year slug format as the `reelgood_id` column used later in the notebook.
t0 = time.time()
movie_dx = scraper.get_movie_data('star-wars-the-last-jedi-2017')
movie_dx  # bare expression: display the scraped record

{'title': 'Star Wars: The Last Jedi',
 'year': '2017',
 'duration': 152,
 'rating': '13+ (PG-13)',
 'country': 'America',
 'poster': 'https://img.reelgood.com/content/movie/be6f57cc-b68a-4fb0-aa3e-077cd1c6e51c/poster-780.jpg',
 'genres': 'Action & Adventure, Fantasy, Science-Fiction',
 'tags': 'Military, War, Space',
 'imdb_score': '7.1',
 'rt_score': '91%',
 'synopsis': 'Rey develops her newly discovered abilities with the guidance of Luke Skywalker, who is unsettled by the strength of her powers. Meanwhile, the Resistance prepares to do battle with the First Order.',
 'services': '{"reelgood": "star-wars-the-last-jedi-2017", "netflix": "80192018"}'}

In [6]:
# Stop the timer started in the previous cell and report the elapsed seconds.
t1 = time.time()
elapsed_secs = t1 - t0
print(f'time: {elapsed_secs:.2f} secs')

time: 6.13 secs


In [7]:
# The single-movie check is finished; close the scraper before the next section
# creates a fresh one.
scraper.close()

## Scrape Movie Subsample 

In [8]:
# Load the list of movies to scrape (title, year and reelgood_id per row).
URL_FILE = '../data/reelgood_url.csv'
url_df = PD.read_csv(URL_FILE)
url_df.head()  # preview the first rows
url_df.info()  # column dtypes and non-null counts

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18496 entries, 0 to 18495
Data columns (total 3 columns):
title          18496 non-null object
year           18496 non-null int64
reelgood_id    18496 non-null object
dtypes: int64(1), object(2)
memory usage: 433.6+ KB


In [9]:
# Build a small subsample of the url list by keeping every SUBSAMPLE_STEP-th row.

SUBSAMPLE_STEP = 3000
subsample_df = url_df.iloc[::SUBSAMPLE_STEP]
subsample_df.head()
subsample_df.shape

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
3000,Caged No More,2016,caged-no-more-2016
6000,"George A. Romero Presents: Deadtime Stories, V...",2010,george-a-romero-presents-deadtime-stories-vol-...
9000,Love Games,2016,love-games-2016
12000,Roller Dreams,2017,roller-dreams-2017


(7, 3)

In [10]:
# The earlier scraper was closed, so create a new one and log in again
# before scraping the subsample.
scraper = RS.ReelgoodScraper()
scraper.log_in()

<selenium.webdriver.firefox.webdriver.WebDriver (session="ae7558d4-a4dc-594e-80fe-1d9e734d4328")>
https://reelgood.com/?login=true


In [11]:
# Scrape every movie in the subsample and time the whole run.
t0 = time.time()
movie_ls = []

try:
    # only the reelgood_id column is needed, so iterate it directly
    for rg_id in subsample_df['reelgood_id']:
        movie_ls.append(scraper.get_movie_data(rg_id))
finally:
    # always close the scraper, even when one of the scrapes raises,
    # so a failed run does not leave it open
    scraper.close()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

time: 0.58 mins


In [12]:
# Turn the list of scraped per-movie dicts into a DataFrame, one row per movie.
movie_df = PD.DataFrame(movie_ls)
movie_df.head()  # preview the scraped records
movie_df.info()  # non-null counts show which fields were missing for some movies

Unnamed: 0,title,year,duration,rating,country,poster,genres,tags,imdb_score,rt_score,synopsis,services
0,1,2013,112.0,13+ (PG-13),America,https://img.reelgood.com/content/movie/9b61df6...,"Documentary, Mystery, Thriller, Science-Fictio...","Car, Racing",8.0,,Set in the golden era of Grand Prix Racing '1'...,"{""reelgood"": ""1-2013"", ""amazon"": ""B0751Q3J8P""}"
1,Caged No More,2016,90.0,13+ (PG-13),Greece,https://img.reelgood.com/content/movie/1ebd7e3...,"Action & Adventure, Drama, Thriller, Mystery","Greece, Religion",5.4,,"Aggie, A 67 year-old Black Cajun, has just ste...","{""reelgood"": ""caged-no-more-2016"", ""amazon"": ""..."
2,"George A. Romero Presents: Deadtime Stories, V...",2010,,,,https://img.reelgood.com/content/movie/c83ef36...,,,,,"Horror master George A. Romero, writer-directo...","{""reelgood"": ""george-a-romero-presents-deadtim..."
3,Love Games,2016,114.0,,India,https://img.reelgood.com/content/movie/6935055...,"Romance, Mystery, Thriller",Mature,4.3,,A pair of nymphomaniacs compete in seducing co...,"{""reelgood"": ""love-games-2016"", ""amazon"": ""B06..."
4,Roller Dreams,2017,82.0,,,https://img.reelgood.com/content/movie/e34f31f...,Documentary,,7.8,86%,"It’s 1984 and Venice Beach, CA, is at the epic...","{""reelgood"": ""roller-dreams-2017"", ""hulu"": ""ro..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 12 columns):
title         7 non-null object
year          7 non-null object
duration      6 non-null float64
rating        2 non-null object
country       4 non-null object
poster        7 non-null object
genres        5 non-null object
tags          4 non-null object
imdb_score    6 non-null object
rt_score      1 non-null object
synopsis      7 non-null object
services      7 non-null object
dtypes: float64(1), object(11)
memory usage: 800.0+ bytes


In [13]:
# Extrapolate the subsample's timing to the full url list, in hours.

secs_per_movie = (t1 - t0) / subsample_df.shape[0]
projected_hrs = secs_per_movie * url_df.shape[0] / 60 / 60
print(f'time: {projected_hrs:.2f} hrs')

time: 25.68 hrs


In [21]:
# Write the scraped subsample to a CSV beside this notebook. A relative path is
# used so the cell does not depend on one machine's directory layout; it resolves
# against the notebook's working directory.
save_path = 'reelgood_test.csv'
movie_df.to_csv(save_path, index=False)
# report the size of the written file in megabytes (a tiny subsample rounds to 0.0)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.0 mb'

In [18]:
# Shell escape: print the working directory the notebook kernel is running in.
! pwd

/Users/Phil/Documents/Websites/Movies_Proj/Filmophile/backend/movies/notebooks


In [20]:
# Pure-Python way to get the same directory: dirname('') is '', and abspath('')
# resolves against the current working directory.
os.path.abspath(os.path.dirname(''))

'/Users/Phil/Documents/Websites/Movies_Proj/Filmophile/backend/movies/notebooks'