## Set up

In [1]:
# Import Dependencies
import pandas as pd
import requests
import urllib.parse as UP
import yaml
import pymongo
import bs4
import re
import numpy as np
from splinter import Browser
from bs4 import BeautifulSoup as bs

In [2]:
# Load the chromedriver location from config.yml (stored under "config-key")
with open("config.yml", "r") as ymlpath:
    config = yaml.safe_load(ymlpath)

# Keyword-argument dict that the scraper functions unpack into splinter's Browser(...)
executable_path = dict(executable_path=config["config-key"])

## FOR TESTING PURPOSES: On Windows, uncomment the cell below:

In [3]:
# executable_path = {"executable_path": "chromedriver"}

## Scraping Box Office Mojo 

In [4]:
def movie_scraper():
    """Scrape www.boxofficemojo.com for the top ten movies of 2018.

    The yearly chart page is requested with ``view2=worldwide``, the cells of
    the fourth ``<table>`` on the page are flattened into a list, cleaned, and
    reshaped into rows.

    Returns:
        list[dict]: one dictionary per movie with the keys "rank" (int),
        "title", "studio" and "year" (the string "2018").

    Raises:
        IndexError: if the page contains fewer than four ``<table>`` elements.
        ValueError: if the scraped cells cannot be reshaped into rows of
            ``len(columns)`` values, or a rank is not an integer.
    """
    year = str(2018)

    # Column layout of one chart row; only the first three columns are kept.
    columns = ["rank", "title", "studio", "worldwide-gross", "domestic-gross",
               "domestic-pct", "overseas-gross", "overseas-pct"]

    # Get webpage data using requests and parse html by creating a beautiful soup object
    response = requests.get("https://www.boxofficemojo.com/yearly/chart/?view2=worldwide&yr=%s&p=.htm" % year)
    soup = bs(response.text, "html.parser")

    # Find location of necessary data in soup object
    cells = soup.find_all("table")[3].find_all("td")

    # For each td element, take the first child of the first <a>, <font> or <b>
    # tag found (in that order of preference); cells with none of them are skipped.
    movie_data = []
    for cell in cells:
        for tag_name in ("a", "font", "b"):
            tag = cell.find(tag_name)
            if tag is not None:
                movie_data.append(tag.contents[0])
                break

    ### Clean Data:

    # Unwrap one more level where the extracted child is itself a tag
    movie_data = [a if isinstance(a, bs4.element.NavigableString) else a.contents[0] for a in movie_data]

    # Strip special characters
    movie_data = [re.sub("[^A-Za-z0-9-. ]+", "", a) for a in movie_data]

    # Fill NaNs
    movie_data = [np.nan if a == "na" else a for a in movie_data]

    # Drop the first 6 entries (header cells, per the original author's note)
    # so that only the data rows remain
    to_df = movie_data[6:]

    # Convert to dataframe, one row per movie
    nrow = len(to_df) // len(columns)
    dirty_movies_df = pd.DataFrame(np.array(to_df).reshape(nrow, len(columns)), columns=columns)

    # Keep rank/title/studio; copy so the rank conversion does not write into a slice
    movies_df = dirty_movies_df.iloc[:, 0:3].copy()
    movies_df["rank"] = movies_df["rank"].apply(int)

    # Keep the top ten and tag each record with the chart year
    movies_df = movies_df.loc[movies_df["rank"] <= 10, :].assign(year=year)

    # Convert dataframe to list of dictionaries
    movie_dicts = movies_df.to_dict(orient="records")

    print("Movies Scraped from BoxOfficeMojo.")

    return movie_dicts

## Scraping Billboard Music

In [5]:
def process_chart(data, year):
    """Parse the HTML of a Billboard year-end chart page into album records.

    Args:
        data: HTML of the chart page, passed straight to BeautifulSoup.
        year: Chart year; stored unchanged on every record.

    Returns:
        list[dict]: one dictionary per ``<article>`` element, in page order,
        with the keys "rank" (int), "title", "artist" and "year".

    Raises:
        AttributeError: if an ``<article>`` lacks one of the expected
            ``ye-chart-item__*`` elements.
    """
    # Create soup object to parse the html
    soup = bs(data, "html5lib")

    list_albums = []

    # Each chart entry is an <article>; pull rank, title and artist from its
    # ye-chart-item__* children.
    for item in soup.select("article"):
        rank = int(item.select_one(".ye-chart-item__rank").string.strip())
        title = item.select_one(".ye-chart-item__title").string.strip()
        # Strip surrounding whitespace as well, consistent with rank and title
        artist = item.select_one(".ye-chart-item__artist").text.replace("\n", "").strip()
        # The key is "year" (the original had a stray leading space: " year")
        list_albums.append({"rank": rank, "title": title, "artist": artist, "year": year})

    return list_albums

In [6]:
def album_scraper():
    """Fetch Billboard's 2018 year-end Billboard 200 albums chart and keep the top ten.

    The page is downloaded with requests and parsed by process_chart.

    Returns a list of the dictionaries produced by process_chart whose rank is below 11.
    """
    chart_year = str(2018)

    # Download the year-end chart page and turn it into album records
    page = requests.get("https://www.billboard.com/charts/year-end/" + chart_year + "/top-billboard-200-albums")
    ranked_albums = process_chart(page.content, chart_year)

    # Keep only ranks 1 through 10
    album_dicts = [entry for entry in ranked_albums if entry["rank"] < 11]

    print("Albums Scraped from Billboard.")

    return album_dicts

## Scraping Metacritic

In [7]:
def metacritic_movie_scraper(url):
    """Scrape the given metacritic.com movie url for review information.

    Args:
        url: Address of the Metacritic movie page to visit.

    Returns:
        dict: keys "user_rev_count", "user_rev_avg", "critic_rev_count" and
        "critic_rev_score". All four values are None when the page does not
        contain the expected elements or any of them is not numeric.
    """
    # Use splinter and beautiful soup to parse given url
    with Browser("chrome", **executable_path, headless=True) as browser:
        browser.visit(url)
        soup = bs(browser.html, "html.parser")

    try:
        # Find number of reviews from users and critics: the third
        # space-separated word of each "based_on" span (critics first, users second)
        rev_count_strings = soup.find_all("span", class_="based_on")
        user_rev_count = int(rev_count_strings[1].text.split(" ")[2])
        critic_rev_count = int(rev_count_strings[0].text.split(" ")[2])

        # Find review average from users and rating score from critics
        review_soup = soup.find_all("a", class_="metascore_anchor")
        user_rev_avg = float(review_soup[1].text)
        critic_rev_score = int(review_soup[0].text)

    # If the page does not have usable review information, populate it with None values.
    # ValueError covers score/count text that is present but not numeric.
    except (IndexError, AttributeError, ValueError):
        user_rev_count = None
        critic_rev_count = None
        user_rev_avg = None
        critic_rev_score = None

    # Return dictionary of movie review information
    movie_dict = {"user_rev_count": user_rev_count, "user_rev_avg": user_rev_avg, "critic_rev_count": critic_rev_count, "critic_rev_score": critic_rev_score}

    return movie_dict

In [8]:
def metacritic_album_scraper(url):
    """Scrape the given metacritic.com album url for review information.

    Args:
        url: Address of the Metacritic album page to visit.

    Returns:
        dict: keys "user_rev_count", "user_rev_avg", "critic_rev_count" and
        "critic_rev_score". All four values are None when the page does not
        contain the expected elements or any of them is not numeric.
    """
    # Use splinter and beautiful soup to parse given url
    with Browser("chrome", **executable_path, headless=True) as browser:
        browser.visit(url)
        soup = bs(browser.html, "html.parser")

    try:
        # Find review average from users and rating score from critics
        review_soup = soup.find_all("a", class_="metascore_anchor")
        user_rev_avg = float(review_soup[1].text)
        critic_rev_score = int(review_soup[0].text)

        # Find number of user reviews
        count_soup = soup.find("div", class_="module reviews_module user_reviews_module")
        user_rev_count_string = count_soup.find("span", class_="count")
        user_rev_count = int(user_rev_count_string.text)

        # NOTE(review): the critic count is read from the same user-reviews
        # module as the user count above, so both values are always equal.
        # This looks like a copy-paste slip; the selector for the critic
        # module needs to be confirmed against the page before changing it.
        critic_rev_count_string = count_soup.find("span", class_="count")
        critic_rev_count = int(critic_rev_count_string.text)

    # If the page does not have usable review information, populate it with None values.
    # ValueError covers score/count text that is present but not numeric.
    except (IndexError, AttributeError, ValueError):
        user_rev_count = None
        critic_rev_count = None
        user_rev_avg = None
        critic_rev_score = None

    # Return dictionary of album review information
    album_dict = {"user_rev_count": user_rev_count, "user_rev_avg": user_rev_avg, "critic_rev_count": critic_rev_count, "critic_rev_score": critic_rev_score}

    return album_dict

In [9]:
def make_url_string(string):
    """Turn a title or artist name into a lowercase, hyphen-separated url fragment.

    The characters ``(``, ``)``, ``÷``, ``&`` and ``-`` are removed, the
    remaining text is split on whitespace, and the words are joined with
    single hyphens and lower-cased. Runs of whitespace of any length (for
    example those left behind by removed characters, or padding around the
    text) never produce doubled, leading or trailing hyphens.

    Args:
        string: The text to convert.

    Returns:
        str: The url fragment; empty when nothing but removed characters and
        whitespace was given.
    """
    # Drop the characters that do not appear in the url form
    kept = string.translate(str.maketrans("", "", "()÷&-"))

    # split() with no argument discards empty pieces, so any amount of
    # whitespace between or around words collapses to a single separator
    return "-".join(kept.split()).lower()

## Create list of dictionaries for top movies and music

In [10]:
# Scrape BoxOfficeMojo and Billboard for lists of dictionaries describing the top 10 movies and top 10 albums of 2018
movie_BOM_dicts = movie_scraper()
album_Bill_dicts = album_scraper()

Movies Scraped from BoxOfficeMojo.
Albums Scraped from Billboard.


In [11]:
# Add review information from Metacritic to new list of dictionaries for top movies.
# Each record is the scraped chart entry merged with the Metacritic review fields.
movie_dicts = []
for movie in movie_BOM_dicts:
    
    # Build the Metacritic details-page url from the movie title
    movie_query = make_url_string(movie["title"])
    movie_url = f"https://www.metacritic.com/movie/{movie_query}/details"

    # Merge the review fields into a copy of the chart entry (one browser session per movie)
    movie_dicts.append({**movie, **metacritic_movie_scraper(movie_url)})
    print(f"{movie['title']} scraped")
    
# Add review information from Metacritic to new list of dictionaries for top music albums
album_dicts = []
for album in album_Bill_dicts:
    # Build the Metacritic album url from the album title and the artist name
    title_query = make_url_string(album["title"])
    artist_query = make_url_string(album["artist"])
    album_url = f"https://www.metacritic.com/music/{title_query}/{artist_query}"
    
    # Merge the review fields into a copy of the chart entry (one browser session per album)
    album_dicts.append({**album, **metacritic_album_scraper(album_url)})
    print(f"{album['title']} scraped")

Avengers Infinity War scraped
Black Panther scraped
Jurassic World Fallen Kingdom scraped
Incredibles 2 scraped
Aquaman scraped
Bohemian Rhapsody scraped
Venom 2018 scraped
Mission Impossible - Fallout scraped
Deadpool 2 scraped
Fantastic Beasts The Crimes of Grindelwald scraped
reputation scraped
Scorpion scraped
beerbongs & bentleys scraped
The Greatest Showman scraped
÷ (Divide) scraped
Invasion Of Privacy scraped
ASTROWORLD scraped
Stoney scraped
? scraped
Culture II scraped


## Populate mongo database

In [12]:
# Open a pymongo client against the local MongoDB server
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)

# Handle to the Top 10 database
db = client["top_10_db"]

# Handles to the movies and albums collections
movies = db["movies"]
albums = db["albums"]

# Insert the scraped top 10 movies and albums
# TODO: decide whether these inserts should be upserts (open question from the original author)
movies.insert_many(movie_dicts)
albums.insert_many(album_dicts)

<pymongo.results.InsertManyResult at 0x11b2f39c8>

## Testing