# Reelgood Scraper

Workflow: 2  

Goal: Starting from the list of Reelgood movie URLs, scrape a subsample of the movies to create a small sample dataset.  

Result: The file `../data/reelgood_subsample.csv` is created.

In [1]:
import os
import time
import pandas as PD

In [2]:
# Display the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to show e.g. both `.head()` and `.shape`.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Append the parent directory to the import path so that `reelgood_scraper`
# can be imported from there (presumably where reelgood_scraper.py lives).
# NOTE (original author): can't reload script in a different folder.

import sys
sys.path.append('../')
import reelgood_scraper as RS 

## Scrape One Movie

In [4]:
# Create a scraper and log in. The recorded output below shows a Selenium
# Firefox WebDriver session and the Reelgood login URL.
scraper = RS.ReelgoodScraper()
scraper.log_in()

<selenium.webdriver.firefox.webdriver.WebDriver (session="78582487-6ad3-f447-a869-a66c47b29b1a")>
https://reelgood.com/?login=true


In [5]:
# Start a timer; the elapsed time is reported by the next cell.
t0 = time.time()
# Scrape a single movie by its Reelgood id and display the resulting record.
movie_dx = scraper.get_movie_data('star-wars-the-last-jedi-2017')
movie_dx

{'title': 'Star Wars: The Last Jedi',
 'year': '2017',
 'duration': 152,
 'rating': '13+ (PG-13)',
 'country': 'America',
 'poster': 'https://img.reelgood.com/content/movie/be6f57cc-b68a-4fb0-aa3e-077cd1c6e51c/poster-780.jpg',
 'genres': 'Action & Adventure, Fantasy, Science-Fiction',
 'tags': 'Military, War, Space',
 'imdb_score': '7.1',
 'rt_score': '91%',
 'synopsis': 'Rey develops her newly discovered abilities with the guidance of Luke Skywalker, who is unsettled by the strength of her powers. Meanwhile, the Resistance prepares to do battle with the First Order.',
 'services': '{"reelgood": "star-wars-the-last-jedi-2017", "netflix": "80192018"}'}

In [6]:
# Report how long the single-movie scrape took. `t0` was set at the top of the
# previous cell, so any pause between running the two cells is included.
t1 = time.time()
elapsed_secs = t1 - t0
print(f'time: {elapsed_secs:.2f} secs')

time: 5.62 secs


In [7]:
# Done with the single-movie demo: close the scraper before the batch run
# below creates a fresh one.
scraper.close()

## Scrape Movie Subsample 

In [8]:
# Load the table of Reelgood movies to scrape; per the recorded output it has
# the columns title, year and reelgood_id.
URL_FILE = '../data/reelgood_url.csv'
url_df = PD.read_csv(URL_FILE)
# Preview the first rows, then print the column/dtype summary.
url_df.head()
url_df.info()

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18496 entries, 0 to 18495
Data columns (total 3 columns):
title          18496 non-null object
year           18496 non-null int64
reelgood_id    18496 non-null object
dtypes: int64(1), object(2)
memory usage: 433.6+ KB


In [9]:
# Build the subsample by taking every SUBSAMPLE_STEP-th row of the URL table.

SUBSAMPLE_STEP = 57
subsample_df = url_df.iloc[::SUBSAMPLE_STEP]
subsample_df.head()
subsample_df.shape

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
57,13 Times Evil,2016,13-times-evil-2016
114,21 Days,2014,21-days-2014
171,36 Hour Layover,2016,36-hour-layover-2016
228,666: Devilish Charm,2014,666-devilish-charm-2014


(325, 3)

In [10]:
# Start a fresh scraper session and log in for the subsample scrape.
scraper = RS.ReelgoodScraper()
scraper.log_in()

<selenium.webdriver.firefox.webdriver.WebDriver (session="962fc565-4907-0d43-a1ce-5ee31bbbb7bf")>
https://reelgood.com/?login=true


In [11]:
# Scrape every movie in the subsample and collect the records in `movie_ls`.
# The recorded run took about 25 minutes, so a failure part-way through is a
# realistic scenario.
t0 = time.time()
movie_ls = []

try:
    # Only the Reelgood id is needed, so iterate over that column directly
    # instead of materialising every row with iterrows().
    for rg_id in subsample_df['reelgood_id']:
        movie_ls.append(scraper.get_movie_data(rg_id))
finally:
    # Always close the scraper, even if a scrape raises, so the session
    # opened in the previous cell is not left behind.
    scraper.close()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

time: 25.36 mins


In [12]:
# Collect the scraped records into a DataFrame (one row per movie).
movie_df = PD.DataFrame(movie_ls)
movie_df.head()
# info() shows how sparsely each field is populated; in the recorded run,
# rt_score is present for only 97 of the 325 rows.
movie_df.info()

Unnamed: 0,title,year,duration,rating,country,poster,genres,tags,imdb_score,rt_score,synopsis,services
0,1,2013,112.0,13+ (PG-13),America,https://img.reelgood.com/content/movie/9b61df6...,"Documentary, Mystery, Thriller, Science-Fictio...","Car, Racing",8.0,,Set in the golden era of Grand Prix Racing '1'...,"{""reelgood"": ""1-2013"", ""amazon"": ""B0751Q3J8P""}"
1,13 Times Evil,2016,90.0,16+,,https://img.reelgood.com/content/movie/9c932c9...,Documentary,,5.3,,History is replete with psychotic killers with...,"{""reelgood"": ""13-times-evil-2016"", ""amazon"": ""..."
2,21 Days,2014,89.0,18+ (R),,https://img.reelgood.com/content/movie/96cf3fe...,"Horror, Thriller, Mystery",,4.5,,Three filmmakers embark on a paranormal challe...,"{""reelgood"": ""21-days-2014"", ""amazon"": ""B01N9Z..."
3,36 Hour Layover,2016,88.0,,,https://img.reelgood.com/content/movie/2674b38...,"Comedy, Romance",,5.9,,This film is a romantic comedy about a steward...,"{""reelgood"": ""36-hour-layover-2016"", ""amazon"":..."
4,666: Devilish Charm,2014,75.0,,,https://img.reelgood.com/content/movie/d525d8c...,"Thriller, Horror, Mystery",,3.4,,When a cursed devil's charm' bracelet shows up...,"{""reelgood"": ""666-devilish-charm-2014"", ""amazo..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 12 columns):
title         325 non-null object
year          325 non-null object
duration      319 non-null float64
rating        138 non-null object
country       121 non-null object
poster        325 non-null object
genres        318 non-null object
tags          141 non-null object
imdb_score    317 non-null object
rt_score      97 non-null object
synopsis      325 non-null object
services      325 non-null object
dtypes: float64(1), object(11)
memory usage: 30.6+ KB


In [13]:
# Extrapolate the subsample timing to the full URL list:
# seconds per scraped movie, times the total number of movies, in hours.

secs_per_movie = (t1 - t0) / len(subsample_df)
projected_hrs = secs_per_movie * len(url_df) / 60 / 60
print(f'time: {projected_hrs:.2f} hrs')

time: 24.05 hrs


In [14]:
# Write the subsample to CSV (without the index column) and display the
# resulting file size in megabytes (bytes / 1e6).
save_path = r'../data/reelgood_subsample.csv'
movie_df.to_csv(save_path, index=False)
size_mb = os.stat(save_path).st_size / 1e6
f'{round(size_mb, 2)} mb'

'0.18 mb'