# Reelgood Scraper

Workflow: 3  

Goal: Start with the list of reelgood movie urls and get the data for each movie. Split the urls into jobs. 

Result: The file ```reelgood_movie.csv``` is created.

In [1]:
import os
import time
import pandas as PD

In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# can't reload script in a different folder

import sys
sys.path.append('../models')
import reelgood_scraper as RS 

## Create Scraping Jobs 

In [4]:
URL_FILE = '../data/reelgood_url.csv'
url_df = PD.read_csv(URL_FILE)
url_df.head()
url_df.info()

Unnamed: 0,title,year,reelgood_id
0,1,2013,1-2013
1,"10,000 BC",2008,10000-bc-2008
2,1000 Rupee Note,2016,1000-rupee-note-2016
3,1000 To 1,2014,1000-to-1-2014
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,100-and-a-tshirt-a-documentary-about-zines-in-...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18496 entries, 0 to 18495
Data columns (total 3 columns):
title          18496 non-null object
year           18496 non-null int64
reelgood_id    18496 non-null object
dtypes: int64(1), object(2)
memory usage: 433.6+ KB


In [5]:
url_1_df = url_df[0:3054]
url_2_df = url_df[3055:8999]
url_3_df = url_df[9000:12323]
url_4_df = url_df[12324:]

url_1_df.shape
url_2_df.shape
url_3_df.shape
url_4_df.shape

(3054, 3)

(5944, 3)

(3323, 3)

(6172, 3)

## Scrape Rotation

In [6]:
scraper = RS.ReelgoodScraper()
scraper.log_in()

<selenium.webdriver.firefox.webdriver.WebDriver (session="fcabd134-f658-7e40-8e54-aa42bf5442fd")>
https://reelgood.com/login


In [7]:
t0 = time.time()
movie_ls = []

for idx, row in url_4_df.iterrows():
    if idx % 500 == 0:
        print(f'row: {idx}')
        
    rg_id = row['reelgood_id']
    movie_dx = scraper.get_movie_data(rg_id)
    movie_ls.append(movie_dx)

scraper.close()
t1 = time.time()
print(f'time: {(t1-t0)/60/60:.2f} hrs')

row: 12500
row: 13000
row: 13500
row: 14000
row: 14500
row: 15000
row: 15500
row: 16000
row: 16500
row: 17000
row: 17500
row: 18000
time: 4.76 hrs


In [8]:
movie_df = PD.DataFrame(movie_ls)
movie_df.head()
movie_df.info()

Unnamed: 0,title,year,duration,rating,country,poster,genres,tags,imdb_score,rt_score,synopsis,services
0,Say You Will,2017,95.0,13+ (PG-13),America,https://img.reelgood.com/content/movie/2c0cb45...,Drama,High School,6.7,,A recent high school graduate cares for his mo...,"{""reelgood"": ""say-you-will-2018"", ""amazon"": ""B..."
1,Scaffolding,2018,88.0,,,https://img.reelgood.com/content/movie/25d095a...,Drama,,6.8,82%,17-year-old Asher is split between his charism...,"{""reelgood"": ""scaffolding-2018"", ""amazon"": ""B0..."
2,Scalpel,1977,95.0,18+ (R),America,https://img.reelgood.com/content/movie/c1f2206...,"Drama, Thriller, Mystery","Doctor, Medical",6.3,,A psychopathic plastic surgeon transforms a yo...,"{""reelgood"": ""scalpel-1977"", ""amazon"": ""B07KCT..."
3,Scandal Makers,2008,108.0,,,https://img.reelgood.com/content/movie/91e9ec0...,"Comedy, Drama",,7.2,,Former teen idol Nam Hyeon-soo is now in his t...,"{""reelgood"": ""speed-scandal-2008"", ""amazon"": ""..."
4,Scare Campaign,2016,76.0,,,https://img.reelgood.com/content/movie/5fb3339...,Horror,,5.8,,"Popular prank TV show, Scare Campaign, has bee...","{""reelgood"": ""scare-campaign-2016"", ""amazon"": ..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6172 entries, 0 to 6171
Data columns (total 12 columns):
title         6172 non-null object
year          6172 non-null object
duration      6048 non-null float64
rating        2690 non-null object
country       2560 non-null object
poster        6172 non-null object
genres        5972 non-null object
tags          2875 non-null object
imdb_score    5994 non-null object
rt_score      2002 non-null object
synopsis      6172 non-null object
services      6172 non-null object
dtypes: float64(1), object(11)
memory usage: 578.8+ KB


In [9]:
save_path = r'../data/reelgood_mov_4.csv'
movie_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'3.51 mb'

## Merge Job Files

In [10]:
movie_1_df = PD.read_csv('../data/reelgood_mov_1.csv')
movie_2_df = PD.read_csv('../data/reelgood_mov_2.csv')
movie_3_df = PD.read_csv('../data/reelgood_mov_3.csv')
movie_4_df = PD.read_csv('../data/reelgood_mov_4.csv')

movie_1_df.shape
movie_2_df.shape
movie_3_df.shape
movie_4_df.shape

(3056, 12)

(5944, 12)

(3326, 12)

(6172, 12)

In [11]:
# remove duplicates



In [12]:
movie_df = PD.concat([movie_1_df, movie_2_df, movie_3_df, movie_4_df])
movie_df.head()
movie_df.info()

Unnamed: 0,title,year,duration,rating,country,poster,genres,tags,imdb_score,rt_score,synopsis,services
0,1,2013,112.0,13+ (PG-13),America,https://img.reelgood.com/content/movie/9b61df6...,"Documentary, Mystery, Thriller, Science-Fictio...","Car, Racing",8.0,,Set in the golden era of Grand Prix Racing '1'...,"{""reelgood"": ""1-2013"", ""amazon"": ""B0751Q3J8P""}"
1,"10,000 BC",2008,109.0,13+ (PG-13),South Africa,https://img.reelgood.com/content/movie/84ff615...,"Action & Adventure, Drama, Fantasy, History","Egypt, Animal, Fighting, Hunting",5.1,8%,A prehistoric epic that follows a young mammot...,"{""reelgood"": ""10000-bc-2008"", ""netflix"": ""7006..."
2,1000 Rupee Note,2016,89.0,,,https://img.reelgood.com/content/movie/08248c1...,Drama,,7.3,80%,Poor Parobudhi receives a thousand rupee note ...,"{""reelgood"": ""1000-rupee-note-2016"", ""netflix""..."
3,1000 To 1,2014,99.0,,,https://img.reelgood.com/content/movie/dcc9187...,"Drama, Biography",,6.9,,Cory Weissman is a college basketball player w...,"{""reelgood"": ""1000-to-1-2014"", ""amazon"": ""B07D..."
4,$100 and a T-Shirt: A Documentary About Zines ...,2004,51.0,,,https://img.reelgood.com/content/movie/d2a26a5...,Documentary,,6.3,,A cultural analysis of what causes zine maker...,"{""reelgood"": ""100-and-a-tshirt-a-documentary-a..."


<class 'pandas.core.frame.DataFrame'>
Int64Index: 18498 entries, 0 to 6171
Data columns (total 12 columns):
title         18498 non-null object
year          18498 non-null int64
duration      18074 non-null float64
rating        7998 non-null object
country       7158 non-null object
poster        18498 non-null object
genres        17826 non-null object
tags          8091 non-null object
imdb_score    17895 non-null float64
rt_score      5512 non-null object
synopsis      18498 non-null object
services      18498 non-null object
dtypes: float64(2), int64(1), object(9)
memory usage: 1.8+ MB


In [13]:
save_path = r'../data/reelgood_movie.csv'
movie_df.to_csv(save_path, index=False)
f'{round(os.path.getsize(save_path) /1e6, 2)} mb'

'10.43 mb'