# Scraping Rotten Tomatoes

### Scraping list of movies

In [None]:
# Imports
import requests
import pandas as pd
import time
from bs4 import BeautifulSoup as bs
from bs4.element import NavigableString, Tag
from datetime import datetime
import scraping_class

# Loop through pages of DVD & Streaming movies on Rotten Tomatoes, collecting
# the movies on each page and checkpointing the combined dataframe to disk.
logfile = 'sds-19-exam.csv'
# NOTE(review): the connector is created once instead of once per page; this
# assumes a Connector instance can serve several requests -- confirm.
connector = scraping_class.Connector(logfile)
base_url = 'https://www.rottentomatoes.com/api/private/v2.0/browse?maxTomato=100&services=amazon%3Bhbo_go%3Bitunes%3Bnetflix_iw%3Bvudu%3Bamazon_prime%3Bfandango_now&certified&sortBy=release&type=dvd-streaming-all&page='

page_frames = []
page = 1
while True:
    # Visit page containing movies
    url = base_url + str(page)
    response, my_id = connector.get(url, 'rt-scrape-list')
    results = response.json()['results']

    # No results means we are past the last page, so stop before adding an
    # empty frame
    if not results:
        break

    # DataFrame.append was removed in pandas 2.0; combine the pages collected
    # so far with pd.concat instead (row labels are kept as-is, exactly like
    # the former append call)
    page_frames.append(pd.DataFrame(results))
    df = pd.concat(page_frames)

    # Store dataframe so far, so an interrupted run keeps the pages already
    # fetched
    df.to_pickle('rt')

    # Report progress, pause between requests, and move on to the next page
    print(page)
    time.sleep(1)
    page += 1

### Scraping functions

In [None]:
# Function for scraping ratings
def scrape_ratings(soup):
    """Return the ``(tomatometer, audience)`` ratings found in ``soup``.

    Looks up every ``span`` with class ``mop-ratings-wrap__percentage``. The
    first match is used for the tomatometer and the second for the audience
    rating. Each value is the first whitespace-separated token of the node's
    text with its last character removed (presumably a trailing ``%``).

    A value is None when its node is missing or contains no text.
    """
    # Get ratings nodes
    r_nodes = soup.find_all('span', attrs={'class': 'mop-ratings-wrap__percentage'})

    def rating_at(position):
        # Only a missing node or an empty text is expected here; anything
        # else (including KeyboardInterrupt) is allowed to propagate.
        try:
            return r_nodes[position].text.split()[0][:-1]
        except IndexError:
            return None

    return rating_at(0), rating_at(1)


# Function for scraping release dates
def scrape_release_date(soup):
    """Return the text of the first ``time`` element in ``soup``.

    Returns None when the page contains no ``time`` element.
    """
    t_nodes = soup.find_all('time')

    # An explicit emptiness check replaces the former catch-all ``except``,
    # which would also have hidden unrelated errors.
    if not t_nodes:
        return None

    return t_nodes[0].text


# Function for scraping numbers of ratings
def scrape_ratings_count(soup):
    """Return ``(total_count, audience_count)`` scraped from ``soup``.

    ``total_count`` is the first whitespace-separated token of the first
    ``small`` element with class ``mop-ratings-wrap__text--small``.
    ``audience_count`` is the third token of the second ``strong`` element
    with that same class.

    Either value is None when the node is missing or its text has too few
    tokens.
    """
    small_text_class = {'class': 'mop-ratings-wrap__text--small'}

    tc_nodes = soup.find_all('small', attrs=small_text_class)
    # IndexError covers both "no such node" and "not enough tokens".
    try:
        total_count = tc_nodes[0].text.split()[0]
    except IndexError:
        total_count = None

    ac_nodes = soup.find_all('strong', attrs=small_text_class)
    try:
        audience_count = ac_nodes[1].text.split()[2]
    except IndexError:
        audience_count = None

    return total_count, audience_count


# Function for scraping synopses
def scrape_synopsis(soup):
    """Return the synopsis text from the ``div`` with id ``movieSynopsis``.

    The synopsis is taken to be the second line of the first matching
    element's text, with leading whitespace removed. Returns None when there
    is no such element or its text has no second line.
    """
    ms_nodes = soup.find_all('div', attrs={'id': 'movieSynopsis'})

    # IndexError covers both a missing node and a text without a second line.
    try:
        return ms_nodes[0].text.split('\n')[1].lstrip()
    except IndexError:
        return None


# Function for scraping genres
def scrape_genres(soup):
    """Return the texts of the genre links found in ``soup``.

    Walks the direct children of every ``div`` with class ``meta-value`` and
    collects the text of each child tag whose ``href`` attribute contains
    ``'genres'``. Returns an empty list when nothing matches.
    """
    genres = []
    for item in soup.find_all('div', attrs={'class': 'meta-value'}):
        for child in item.children:
            # Text nodes carry no attributes; only tags can be genre links.
            if not isinstance(child, Tag):
                continue
            # Tags without an href are skipped explicitly rather than by
            # catching the lookup failure.
            href = child.get('href')
            if href is not None and 'genres' in href:
                genres.append(child.text)

    return genres


# Function for scraping directors
def scrape_directors(soup):
    """Return the director names found in ``soup``.

    Scans the ``li`` elements with class ``meta-row clearfix`` in document
    order. Within a row, once a child tag whose text contains
    ``'Directed By:'`` has been seen, the text of every tag nested directly
    inside each *following* child tag of that row is collected. Scanning
    stops after the first row that contains the label.

    Returns an empty list when no row contains the label.
    """
    directors = []
    for row in soup.find_all('li', attrs={'class': 'meta-row clearfix'}):
        label_seen = False
        for child in row.children:
            # Text nodes are neither the label element nor a value container.
            if not isinstance(child, Tag):
                continue
            if label_seen:
                directors.extend(
                    nested.text
                    for nested in child.children
                    if isinstance(nested, Tag)
                )
            elif 'Directed By:' in child.text:
                label_seen = True
        # Only the first row carrying the label is of interest.
        if label_seen:
            break

    return directors


# Function for scraping studios
def scrape_studios(soup):
    """Return the text of the first ``a`` element targeting ``movie-studio``.

    Returns None when ``soup`` contains no such link.
    """
    s_nodes = soup.find_all('a', attrs={'target': 'movie-studio'})

    # An explicit emptiness check replaces the former catch-all ``except``.
    if not s_nodes:
        return None

    return s_nodes[0].text

### Loop to iterate through movies in dataframe

In [None]:
# Make lists to store info (one entry per movie, in dataframe row order)
ratings = []
release_dates = []
number_of_ratings = []
synopses = []
genres = []
directors = []
studios = []


def _scrape_or_none(scraper, soup):
    """Return ``scraper(soup)``, or None if the scraper raises.

    Deliberately best-effort so every result list gets exactly one entry per
    movie and stays aligned with the dataframe rows. Only ``Exception`` is
    caught, so the run can still be interrupted with Ctrl-C.
    """
    try:
        return scraper(soup)
    except Exception:
        return None


# NOTE(review): this log file name differs from the 'sds-19-exam.csv' used
# when scraping the movie list -- confirm that is intentional.
logfile = 'sds19exam.csv'
# NOTE(review): the connector is created once instead of once per movie; this
# assumes a Connector instance can serve several requests -- confirm.
connector = scraping_class.Connector(logfile)

# Loop through all movies
for iteration, movie_path in enumerate(df['url'], start=1):
    # Get HTML
    url = 'https://www.rottentomatoes.com' + movie_path
    response, my_id = connector.get(url, 'rt-scrape-movies')
    soup = bs(response.text, 'lxml')

    # Collect each piece of information, storing None where scraping fails
    ratings.append(_scrape_or_none(scrape_ratings, soup))
    release_dates.append(_scrape_or_none(scrape_release_date, soup))
    number_of_ratings.append(_scrape_or_none(scrape_ratings_count, soup))
    synopses.append(_scrape_or_none(scrape_synopsis, soup))
    genres.append(_scrape_or_none(scrape_genres, soup))
    directors.append(_scrape_or_none(scrape_directors, soup))
    studios.append(_scrape_or_none(scrape_studios, soup))

    # Report progress and pause between requests
    print(iteration)
    time.sleep(0.5)

# Write lists of collected data to dataframe
df['ratings'] = ratings
df['releaseDate'] = release_dates
df['numberOfRatings'] = number_of_ratings
df['synopsis'] = synopses
df['genres'] = genres
df['directors'] = directors
df['studio'] = studios

### Structuring data

In [None]:
# Function for turning date into datetime format
def get_datetime(string):
    """Parse a date such as ``'Jan 5, 2019'`` into a ``datetime``.

    The expected format is ``'%b %d, %Y'``. Returns None when ``string`` is
    not a string (for example None) or does not match that format.
    """
    # strptime raises TypeError for non-string input and ValueError for text
    # that does not match the format; nothing else is expected here.
    try:
        return datetime.strptime(string, '%b %d, %Y')
    except (TypeError, ValueError):
        return None

# Function for getting the n'th element of a list
def get_nth_ele(lst, n):
    """Return ``lst[n]``, or None when that element cannot be looked up.

    None is returned when ``n`` is out of range, when ``n`` is a missing key,
    or when ``lst`` does not support indexing at all (for example None).
    """
    try:
        return lst[n]
    except (IndexError, KeyError, TypeError):
        return None

# Reshape the scraped dataframe in one pass:
#   1. drop the columns that are not needed
#   2. parse the release date into datetime format
#   3. split the rating and rating-count pairs into separate columns
#   4. drop the pair columns now that they have been split
df = (
    df.drop(
        columns=[
            'id',
            'theaterReleaseDate',
            'dvdReleaseDate',
            'mainTrailer',
            'posters',
            'synopsisType',
            'tomatoScore',
            'url',
            'runtime',
        ]
    )
    .assign(
        releaseDate=lambda frame: frame['releaseDate'].apply(get_datetime),
        tomatoMeter=lambda frame: frame['ratings'].apply(get_nth_ele, n=0),
        audienceScore=lambda frame: frame['ratings'].apply(get_nth_ele, n=1),
        tomatoCount=lambda frame: frame['numberOfRatings'].apply(get_nth_ele, n=0),
        audienceCount=lambda frame: frame['numberOfRatings'].apply(get_nth_ele, n=1),
    )
    .drop(columns=['ratings', 'numberOfRatings'])
)

# Save dataframe
df.to_pickle('rt_clean.pkl')