In [1]:
import json
import math
import re
import time, os
from random import randint
from time import sleep

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

## Scrape the IMDb awards pages

In [2]:
# Request headers sent with the awards-page requests; `ua.random` supplies the
# User-Agent string (fake_useragent picks one at random).
ua = UserAgent()
user_agent = {'User-agent': ua.random}
# Values appended to the event URL by get_title_ids: 1980 through 2020 inclusive.
pages = np.arange(1980,2021)
#title_ids=[]

# NOTE(review): `d` is not referenced anywhere else in the visible notebook - confirm it is still needed.
d = dict()
def get_title_ids(pages):
    '''Collect the ids nominated for an Oscar on each IMDb event page.

    For every entry of ``pages`` the page
    ``https://www.imdb.com/event/ev0000003/<page>`` is downloaded with the
    module-level ``user_agent`` headers, the JSON object pushed to
    ``IMDbReactWidgets.NomineesWidget`` is extracted, and for every
    nomination in every category of the award named ``'Oscar'`` the first
    secondary nominee (when there is one) and the first primary nominee
    (when there is one) are recorded, in that order.

    Parameters
    ----------
    pages : iterable
        Values appended to the event URL; one request is made per value,
        followed by a random 2-10 second pause.

    Returns
    -------
    tuple (imdb_film_ids, noms_list)
        ``imdb_film_ids`` is the list of unique collected ids that start
        with ``'t'`` (the collected ids mix people and films), in
        first-seen order.
        ``noms_list`` holds one dict per page, mapping category name to the
        list of ids collected for that category.

    Raises
    ------
    IndexError
        If a page does not contain the nominees widget payload.
    '''
    widget_pattern = r'IMDbReactWidgets\.NomineesWidget\.push.*?(\{.*\})'
    noms_list = []
    imdb_ids = []

    for page in pages:
        url = 'https://www.imdb.com/event/ev0000003/' + str(page)
        response = requests.get(url, headers=user_agent).text
        payloads = re.findall(widget_pattern, response)
        if not payloads:
            # Same exception type the bare `[0]` lookup used to raise, but
            # now it says which page was the problem.
            raise IndexError('no nominees widget data found at ' + url)
        data = json.loads(payloads[0])

        award_dict = {}
        for award in data['nomineesWidgetModel']['eventEditionSummary']['awards']:
            if award['awardName'] != 'Oscar':
                continue
            for category in award['categories']:
                category_ids = []
                award_dict[category['categoryName']] = category_ids
                for nom in category['nominations']:
                    # Each nominee group is handled on its own, so a
                    # nomination without secondary nominees no longer
                    # re-uses the id left over from the previous nomination.
                    for group in ('secondaryNominees', 'primaryNominees'):
                        nominees = nom.get(group) or []
                        if nominees:
                            nominee_id = nominees[0]['const']
                            category_ids.append(nominee_id)
                            imdb_ids.append(nominee_id)
        noms_list.append(award_dict)
        sleep(randint(2, 10))

    # De-duplicate while keeping first-seen order, then keep only the ids
    # that start with 't' (films); the remaining ids belong to people.
    imdb_film_ids = [
        nominee_id for nominee_id in dict.fromkeys(imdb_ids)
        if nominee_id.startswith('t')
    ]
    return imdb_film_ids, noms_list
    

## Scrape the IMDb title (film) pages

In [3]:
def get_movie_crew_and_genre(soup,field_name):
    '''Return the link texts listed next to a labelled ``<h4>`` heading.

    Finds the first ``<h4>`` whose text matches ``field_name`` (used as a
    regular expression) and collects the stripped text of every ``<a>``
    inside that heading's parent element. The notebook uses it for the
    director, writer, stars, genres, country and language fields.

    Parameters
    ----------
    soup : parsed page exposing ``find``.
    field_name : str
        Pattern searched for in the heading text, e.g. ``'Director'``.

    Returns
    -------
    list of str
        One entry per link; an empty list when the page has no such
        heading (previously this raised ``AttributeError`` on ``None``).
    '''
    heading = soup.find('h4', text=re.compile(field_name))
    if heading is None:
        return []
    return [link.text.strip() for link in heading.parent.find_all('a')]

In [4]:
def get_movie_value(soup, field_name):
    '''Return the text of the element that follows a matching text node.

    ``field_name`` is compiled as a regular expression and handed to
    ``soup.find(text=...)``. When a text node matches, the text of the
    element returned by its ``findNext()`` is given back. ``None`` is
    returned when nothing matches or when there is no following element.
    '''
    label = soup.find(text=re.compile(field_name))

    # Works for most values: the wanted string sits in the element right
    # after the matched label.
    following = label.findNext() if label else None

    return following.text if following else None

In [5]:
def get_distribution_company(soup):
    '''Return the name of the first company entry that mentions 'USA'.

    Looks up the ``<h4 id="distributors">`` heading, walks the ``<li>``
    items found in that heading's parent element and takes the first one
    whose text contains ``'USA'``. Everything from the first ``'('`` on is
    cut off, newlines are removed and the rest is stripped.

    Returns
    -------
    str
        The cleaned company name, or ``''`` when no item mentions 'USA'.
        A page without the distributors heading also yields ``''`` now
        (it used to raise ``AttributeError`` on ``None``), so both "nothing
        found" cases behave the same.
    '''
    heading = soup.find('h4', {'id':'distributors'})
    if heading is None:
        return ''
    for entry in heading.parent.find_all('li'):
        if 'USA' in entry.text:
            return entry.text.split('(')[0].replace('\n','').strip()
    return ''

In [6]:
def get_movie_dict(titleId):
    '''
    Scrape one IMDb title page into a flat dictionary of film attributes.

    Downloads ``https://www.imdb.com/title/<titleId>``, parses it with
    BeautifulSoup/lxml, reads the fields listed below and sleeps a random
    2-10 seconds before returning.

    Parameters
    ----------
    titleId : title id appended to the URL and stored under 'IMDBId'.

    Returns
    -------
    dict with the keys
        'IMDBId', 'movie title', 'director(s)', 'writer(s)', 'casts'
        (first three entries only), 'genre(s)', 'language', 'country',
        'runtime (mins)', 'mpaarating', 'release date',
        'metacritic score', 'budget'.
        'runtime (mins)' holds the page's runtime text with surrounding
        whitespace removed, not a number. 'release date',
        'metacritic score' and 'budget' are ``math.nan`` when they cannot
        be read from the page.

    Raises
    ------
    AttributeError, IndexError
        When a required element is missing (title wrapper, title, runtime,
        subtext, country or language). The scraping loop catches these and
        records the id as not working.
    '''
    url = 'https://www.imdb.com/title/' + str(titleId)
    soup = BeautifulSoup(requests.get(url).text, "lxml")

    title_div = soup.find('div', class_='title_wrapper')

    # Title: keep only the part before the first '('.
    title = get_movie_value(soup, 'title').split('(')[0].strip()

    directors = get_movie_crew_and_genre(soup, 'Director')
    writers = get_movie_crew_and_genre(soup, 'Writer')
    # Only the first three links of the 'Stars' block are kept.
    casts = get_movie_crew_and_genre(soup, 'Stars')[0:3]
    genres = get_movie_crew_and_genre(soup, 'Genres')
    # Only the first listed country / language is kept.
    country = get_movie_crew_and_genre(soup, 'Country')[0]
    language = get_movie_crew_and_genre(soup, 'Language')[0]

    # Runtime: the <time> text spans several lines; strip each and glue them.
    runtime = "".join(part.strip() for part in title_div.find('time').text.split("\n"))

    # The subtext line is '|'-separated; it is fetched once and reused for
    # both the rating (first field) and the release date (fourth field).
    subtext_fields = title_div.find('div', {'class': 'subtext'}).text.split('|')
    rating = subtext_fields[0].strip()

    try:
        # Drop the parenthesised part and the space left in front of it.
        release = subtext_fields[3].strip().split('(')[0].strip()
    except IndexError:
        release = math.nan

    # Budget: second line of the block, digits after the '$' sign.
    # Anything else (no block, no '$', non-numeric text) becomes NaN.
    try:
        budget_text = soup.find('h4', text=re.compile('Budget')).parent.text
        budget = int(budget_text.split('\n')[1].split('$')[1].replace(',', ''))
    except Exception:
        budget = math.nan

    # Metacritic score: NaN when the review bar is missing or not an integer.
    try:
        metacritic_score = int(soup.find('div', {'class': 'titleReviewBar'}).span.text)
    except Exception:
        metacritic_score = math.nan

    movie_dict = {
        'IMDBId': titleId,
        'movie title': title,
        'director(s)': directors,
        'writer(s)': writers,
        'casts': casts,
        'genre(s)': genres,
        'language': language,
        'country': country,
        'runtime (mins)': runtime,
        'mpaarating': rating,
        'release date': release,
        'metacritic score': metacritic_score,
        'budget': budget,
    }

    # Pause between requests.
    sleep(randint(2, 10))
    return movie_dict

In [7]:
def get_distribution_dict(titleId):
    '''
    Fetch a title's company-credits page and pair the title id with the
    distributor name that get_distribution_company extracts from it.

    Returns a dict with the keys 'IMDBId' and 'distributionCompany'.
    A random 2-10 second pause is taken before returning.
    '''
    credits_url = 'https://www.imdb.com/title/' + str(titleId) + '/companycredits'
    credits_soup = BeautifulSoup(requests.get(credits_url).text, "lxml")

    record = {
        'IMDBId': titleId,
        'distributionCompany': get_distribution_company(credits_soup),
    }

    # Pause between requests.
    sleep(randint(2,10))
    return record

In [8]:
# Run the awards scrape: `links` receives the film ids and `noms_list` the
# per-page dicts of category name -> nominated ids.
links,noms_list = get_title_ids(pages)

## Scrape the pages
* Write the scraped pages to a CSV file

In [None]:
# Scrape the title page and the company-credits page of every film id in
# `links`, and append the merged rows to a CSV in batches so that an
# interrupted run keeps what it has already collected.
distribution_list = []
oscar_movie_list = []
movie_link_not_working = []     # ids whose title page could not be scraped
distro_link_not_working = []    # ids whose company-credits page could not be scraped
count=0
count_to_10 = 0
batch_size = 10
output_csv = 'oscar_movies_2.csv'
for link in links:
    print(count)
    count+=1
    count_to_10+=1
    # Each page is requested exactly once; calling the scraper a second
    # time inside append() doubled the requests and the sleeps.
    # `except Exception` keeps the best-effort behaviour but lets Ctrl-C
    # stop the loop.
    try:
        oscar_movie_list.append(get_movie_dict(link))
    except Exception:
        movie_link_not_working.append(link)
    try:
        distribution_list.append(get_distribution_dict(link))
    except Exception:
        distro_link_not_working.append(link)

    # Flush on a full batch and also on the last link, so the final
    # partial batch is not lost.
    if count_to_10 >= batch_size or count == len(links):
        if oscar_movie_list:
            movies_info_df = pd.DataFrame(oscar_movie_list)
            movies_info_df.set_index('IMDBId', inplace=True)
            # Explicit columns keep set_index working when every
            # distribution request of the batch failed.
            distribution_info_df = pd.DataFrame(distribution_list,
                                                columns=['IMDBId', 'distributionCompany'])
            distribution_info_df.set_index('IMDBId', inplace=True)
            movies_info_df2 = pd.merge(movies_info_df, distribution_info_df, on='IMDBId', how='left')
            # Write the column names only when starting a new file, so the
            # CSV can later be read back with an 'IMDBId' column.
            write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
            movies_info_df2.to_csv(output_csv,mode='a',header=write_header,encoding='utf-8-sig')
        count_to_10 = 0
        oscar_movie_list = []
        distribution_list = []

In [None]:
def get_award(noms_list, movies_df=None):
    """
    Build a table of (film id, award category) pairs for the scraped films.

    Every id listed in a category of ``noms_list`` that also occurs among
    the film ids of ``movies_df`` yields one row; duplicate pairs are
    dropped and the rows are sorted.

    Parameters
    ----------
    noms_list : list of dict
        One dict per awards page, mapping category name -> list of ids.
    movies_df : pandas.DataFrame, optional
        Frame holding the film ids either in an 'IMDBId' column or in an
        index named 'IMDBId'. Defaults to the module-level
        ``movies_info_df``.

    Returns
    -------
    pandas.DataFrame
        Columns 'IMDBid - Awards' ('<id>-<category>'), 'IMDBId' and
        'Awards'; empty (with the same columns) when nothing matches.

    Raises
    ------
    KeyError
        If ``movies_df`` has neither an 'IMDBId' column nor an index of
        that name.
    """
    if movies_df is None:
        movies_df = movies_info_df

    # The scraping loop leaves IMDBId as the index, while a frame read
    # back from the CSV has it as a column; accept both.
    if 'IMDBId' in movies_df.columns:
        film_ids = set(movies_df['IMDBId'])
    elif movies_df.index.name == 'IMDBId':
        film_ids = set(movies_df.index)
    else:
        raise KeyError('IMDBId')

    pairs = set()
    for noms_dict in noms_list:
        for category, nominee_ids in noms_dict.items():
            for nominee_id in nominee_ids:
                if nominee_id in film_ids:
                    pairs.add((nominee_id, category))

    rows = [(film_id + '-' + category, film_id, category)
            for film_id, category in sorted(pairs)]
    return pd.DataFrame(rows, columns=['IMDBid - Awards', 'IMDBId', 'Awards'])

In [None]:
# Build the film / award-category table. The helper is defined as
# `get_award` (singular); calling `get_awards` raised NameError.
awards_df = get_award(noms_list)

In [None]:
# Reload the rows that the scraping loop appended to the CSV.
# NOTE(review): read_csv treats the first line of the file as the header row - confirm the file starts with column names.
# NOTE(review): get_award reads movies_info_df['IMDBId'] but is called in the cell above this one - confirm the intended cell order.
movies_info_df = pd.read_csv('oscar_movies_2.csv')

In [None]:
# Left-join the award categories onto the film rows: every film is kept, and a
# film matched by several rows of awards_df yields one output row per match.
movies_info_df = pd.merge(movies_info_df, awards_df, on='IMDBId', how='left')

In [None]:
# Save the combined table. No `index` argument is passed, so the frame's row
# index is written as an extra first column.
movies_info_df.to_csv('oscar_info_movies.csv',encoding='utf-8-sig')