# Scraping Rotten Tomatoes

### Scraping list of movies

In [2]:
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup as bs
from bs4.element import NavigableString, Tag

# This loop loops through pages of DVD & Streaming movies on Rotten Tomatoes, appending the movies in the page to a dataframe
i = 1
while True:
    # Visit page containing movies
    url = 'https://www.rottentomatoes.com/api/private/v2.0/browse?maxTomato=100&services=amazon%3Bhbo_go%3Bitunes%3Bnetflix_iw%3Bvudu%3Bamazon_prime%3Bfandango_now&certified&sortBy=release&type=dvd-streaming-all&page=' + str(i)
    response = requests.get(url)
    response_json = response.json()
    
    # If there are no results, we have reached the last page, so break out of the loop before appending nothing to the dataframe
    if response_json['results'] == []: break
    
    # If we are on the first iteration, create a new dataframe; otherwise append to it
    if i == 1:
        df = pd.DataFrame(response_json['results'])
    else:
        df = df.append(pd.DataFrame(response_json['results']))
        
    # Iterate counter to go to the next page in the next loop
    i += 1

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


### Loop to iterate through movies in dataframe (TBD)

In [22]:
ratings = []
release_dates = []
number_of_ratings = []
synopses = []
genres = []
directors = []
studios = []
for i in df['url'].head():
    url = 'https://www.rottentomatoes.com' + i
    response = requests.get(url)
    html = response.text
    soup = bs(html,'lxml')
    
    ratings.append(scrape_ratings(soup))
    release_dates.append(scrape_release_date(soup))
    number_of_ratings.append(scrape_ratings_count(soup))
    synopses.append(scrape_synopsis(soup))
    genres.append(scrape_genres(soup))
    directors.append(scrape_directors(soup))
    studios.append(scrape_studios(soup))

print(ratings)
print(release_dates)
print(number_of_ratings)
print(synopses)
print(genres)
print(directors)
print(studios)

[('6', '18'), ('81', '70'), ('66', '89'), ('36', '66'), ('74', '65')]
['Sep 2, 2005', 'Sep 16, 2005', 'Sep 16, 2005', 'Sep 23, 2005', 'Jan 21, 2005']
[('99', '43,542'), ('62', '5,280'), ('119', '59,913'), ('64', '16,550'), ('47', '13,322')]
['A seemingly insignificant act may cause the fabric of history to unravel in this sci-fi adventure. Charles Hatton (Ben Kingsley) owns and operates a successful firm known as Time Safari. Thanks to time travel technology developed by Hatton\'s employee Sonia Rand (Catherine McCormack), Time Safari allows big game hunters to journey back to prehistoric days and shoot living, breathing dinosaurs. Rand picks out the dinosaur in question, who is soon to die, and creates a floating walkway for the hunters, so the impact of their presence will not be felt by the land around them. But on one expedition, things go horribly wrong when a nervous hunter steps off the walkway and crushes a butterfly, a tiny act that proves to have massive consequences over the

### Scraping individual movie ratings

In [3]:
def scrape_ratings(soup):
    # Get ratings nodes
    r_nodes = soup.find_all('span', attrs = {'class': 'mop-ratings-wrap__percentage'})

    # Save ratings
    tomatometer = r_nodes[0].text.split()[0][:-1]
    audience = r_nodes[1].text.split()[0][:-1]

    return tomatometer, audience

### Scraping movie release date

In [15]:
def scrape_release_date(soup):
    t_nodes = soup.find_all('time')
    release_date = t_nodes[0].text
    return release_date

### Scraping number of ratings

In [21]:
def scrape_ratings_count(soup):
    tc_nodes = soup.find_all('small', attrs = {'class': 'mop-ratings-wrap__text--small'})
    total_count = tc_nodes[0].text.split()[0]
    
    ac_nodes = soup.find_all('strong', attrs = {'class': 'mop-ratings-wrap__text--small'})
    audience_count = ac_nodes[1].text.split()[2]
    
    return total_count, audience_count

### Scraping movie synopsis

In [6]:
def scrape_synopsis(soup):
    ms_nodes = soup.find_all('div', attrs = {'id': 'movieSynopsis'})
    movie_synopsis = ms_nodes[0].text.split('\n')[1].lstrip()
    return movie_synopsis

### Scraping genres

In [7]:
def scrape_genres(soup):
    g_nodes = soup.find_all('div', attrs = {'class': 'meta-value'})

    genres = []
    for item in g_nodes:
        for child in item.children:
            if isinstance(child, NavigableString):
                continue
            if isinstance(child, Tag):
                try:
                    if 'genres' in child['href']:
                        genres.append(child.text)
                except:
                    continue

    return genres

### Scraping directors

In [8]:
def scrape_directors(soup):
    d_nodes = soup.find_all('li', attrs = {'class': 'meta-row clearfix'})

    directors = []
    next_one = False
    for item in d_nodes:
        if next_one == True: break
        for child in item.children:
            if isinstance(child, NavigableString):
                continue
            if isinstance(child, Tag):
                if next_one == True:
                    for lchild in child.children:
                        if isinstance(lchild, NavigableString):
                            continue
                        if isinstance(lchild, Tag):
                            try:
                                directors.append(lchild.text)
                            except:
                                continue
                if 'Directed By:' in child.text:
                    next_one = True

    return directors

### Scraping studios

In [9]:
def scrape_studios(soup):
    s_nodes = soup.find_all('a', attrs = {'target': 'movie-studio'})
    movie_studio = s_nodes[0].text
    return movie_studio