# Reelgood Scraper

Workflow: 1  

Goal: Get reelgood urls for every movie in my services.  

Result: The file ```reelgood_url.csv``` is created.

In [1]:
import os
import time
import pandas as PD

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# can't reload script in a different folder

import sys
sys.path.append('../models')
import reelgood_scraper as RS 

## Scrape One URL Page

In [4]:
scraper = RS.ReelgoodScraper()

<selenium.webdriver.firefox.webdriver.WebDriver (session="b1de1a86-c4f6-4e4e-aa0a-fd8d7f50eec3")>


In [5]:
scraper.log_in()

https://reelgood.com/?login=true


In [6]:
t0 = time.time()
scraper.to_next_page()

True

In [7]:
movie_url_ls = scraper.get_movie_urls()
movie_url_ls[:3]

[{'title': '1', 'year': '2013', 'reelgood_id': '1-2013'},
 {'title': '10,000 BC', 'year': '2008', 'reelgood_id': '10000-bc-2008'},
 {'title': '1000 Rupee Note',
  'year': '2016',
  'reelgood_id': '1000-rupee-note-2016'}]

In [8]:
t1 = time.time()
print(f'time: {(t1-t0):.2f} secs')

time: 8.24 secs


In [9]:
urls_df = PD.DataFrame(movie_url_ls)
urls_df.head()
urls_df.info()

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
title          50 non-null object
year           50 non-null object
reelgood_id    50 non-null object
dtypes: object(3)
memory usage: 1.3+ KB


In [10]:
# last offset is 18600, for 373 pages

scraper.current_offset = 18600
scraper.to_next_page()
scraper.to_next_page()

True

False

In [11]:
scraper.close()

In [12]:
# projected time is time_for_1_page * number_of_pages

print(f'projected time: {(t1-t0)*370/60:.2f} mins')

projected time: 50.81 mins


## Scrape All URL Pages

In [13]:
# batch 1

t0 = time.time()
scraper = RS.ReelgoodScraper()
scraper.log_in()

scraper.current_offset = 0
url_1_ls = scraper.get_all_urls(4500)
scraper.close()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

<selenium.webdriver.firefox.webdriver.WebDriver (session="c1fadfb4-061f-ae43-b851-6adc1603ffc1")>
https://reelgood.com/?login=true
offset: 500
offset: 1000
offset: 1500
offset: 2000
offset: 2500
offset: 3000
offset: 3500
offset: 4000
offset: 4500
time: 38.71 mins


In [14]:
url_1_df = PD.DataFrame(url_1_ls)
url_1_df.head()
url_1_df.shape

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


(4500, 3)

In [15]:
save_path = r'../data/reelgood_url_1.csv'
url_1_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.2 mb'

In [16]:
# batch 2

t0 = time.time()
scraper = RS.ReelgoodScraper()
scraper.log_in()

scraper.current_offset = 4550
url_2_ls = scraper.get_all_urls(9000)
scraper.close()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

<selenium.webdriver.firefox.webdriver.WebDriver (session="d93abc04-fb66-d149-9c00-8bbed1ec3272")>
https://reelgood.com/?login=true
offset: 5000
offset: 5500
offset: 6000
offset: 6500
offset: 7000
offset: 7500
offset: 8000
offset: 8500
offset: 9000
time: 34.44 mins


In [17]:
url_2_df = PD.DataFrame(url_2_ls)
url_2_df.head()
url_2_df.shape

Unnamed: 0,title,year,reelgood_id
0,Dirty Gertie from Harlem U.S.A.,1946,dirty-gertie-from-harlem-usa-1946
1,Dirty Ho,1979,dirty-ho-1979
2,Dirty Lies,2016,dirty-lies-2016
3,Dirty Love,2005,dirty-love-2005
4,Dirtymoney,2013,dirty-money-2012


(4450, 3)

In [18]:
save_path = r'../data/reelgood_url_2.csv'
url_2_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.2 mb'

In [19]:
# batch 3

t0 = time.time()
scraper = RS.ReelgoodScraper()
scraper.log_in()

scraper.current_offset = 9050
url_3_ls = scraper.get_all_urls(13500)
scraper.close()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

<selenium.webdriver.firefox.webdriver.WebDriver (session="ccc5ae60-0cfa-664b-913c-6d10bb0aab9b")>
https://reelgood.com/?login=true
offset: 9500
offset: 10000
offset: 10500
offset: 11000
offset: 11500
offset: 12000
offset: 12500
offset: 13000
offset: 13500
time: 45.71 mins


In [20]:
url_3_df = PD.DataFrame(url_3_ls)
url_3_df.head()
url_3_df.shape

Unnamed: 0,title,year,reelgood_id
0,Lost Treasure,2003,lost-treasure-2003
1,Lost Treasures of the Silk Road,2013,lost-treasures-of-the-silk-road-2013
2,Lost & Turnt Out,2015,lost-turnt-out-2015
3,Lost Voyage,2001,lost-voyage-2001
4,Lot Lizard,2016,lot-lizard-2016


(4450, 3)

In [21]:
save_path = r'../data/reelgood_url_3.csv'
url_3_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.2 mb'

In [22]:
# batch 4

t0 = time.time()
scraper = RS.ReelgoodScraper()
scraper.log_in()

scraper.current_offset = 13550
url_4_ls = scraper.get_all_urls()
scraper.close()

t1 = time.time()
print(f'time: {(t1-t0)/60:.2f} mins')

<selenium.webdriver.firefox.webdriver.WebDriver (session="78bf5033-49f1-ea4f-871d-194a511a12d6")>
https://reelgood.com/?login=true
offset: 14000
offset: 14500
offset: 15000
offset: 15500
offset: 16000
offset: 16500
offset: 17000
offset: 17500
offset: 18000
offset: 18500
time: 57.35 mins


In [23]:
url_4_df = PD.DataFrame(url_4_ls)
url_4_df.head()
url_4_df.shape

Unnamed: 0,title,year,reelgood_id
0,Struggle: The Life and Lost Art of Szukalski,2018,struggle-the-life-and-lost-art-of-szukalski-2018
1,Struggle Through Death,1981,struggle-through-death-1981
2,Stuck Between Stations,2011,stuck-between-stations-2011
3,Stuck in Love,2012,stuck-in-love-2012
4,Stuck on You!,1982,stuck-on-you-1982


(5096, 3)

In [24]:
save_path = r'../data/reelgood_url_4.csv'
url_4_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.25 mb'

In [25]:
# combine batches

url_1_df = PD.read_csv('../data/reelgood_url_1.csv')
url_2_df = PD.read_csv('../data/reelgood_url_2.csv')
url_3_df = PD.read_csv('../data/reelgood_url_3.csv')
url_4_df = PD.read_csv('../data/reelgood_url_4.csv')

In [26]:
url_df = PD.concat([url_1_df, url_2_df, url_3_df, url_4_df])
url_df.head()
url_df.shape

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


(18496, 3)

In [27]:
url_df[45:55]

Unnamed: 0,title,year,reelgood_id
45,1313: Haunted Frat,2011,1313-haunted-frat-2011
46,1313: Nightmare Mansion,2011,1313-nightmare-mansion-2011
47,1313: Night of the Widow,2012,1313-night-of-the-widow-2012
48,1313: UFO Invasion,2012,1313-ufo-invasion-2012
49,13 Cameras,2015,13-cameras-2015
50,13 Demons,2016,13-demons-2016
51,13 Eerie,2013,13-eerie-2013
52,13 Hours in a Warehouse,2008,13-hours-in-a-warehouse-2008
53,13 Sins,2014,13-sins-2014
54,13th,2016,13th-2016


In [28]:
url_df[url_df.duplicated()].shape

(0, 3)

In [29]:
save_path = r'../data/reelgood_url.csv'
url_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'0.85 mb'