In [1]:
import requests
from bs4 import BeautifulSoup
def get_IMDb_urls_year(url_, year):
    """
    Returns the IMDb url of one movie, if its Wikipedia page says it was released on `year`
    url_: string, url of the movie's Wikipedia page
    year: integer

    Returns one of:
      - the shortest IMDb link found among the external links of the page (string)
      - "not this year", when the first category link of the page starts with another year
      - an empty list, when no usable IMDb link was found
    Only the first one contains "imdb", which is what `get_IMDb_urls_year_parallel()` filters on.
    """
    website_url = requests.get(url_).text
    soup = BeautifulSoup(website_url, "lxml")
    mm = soup.body.find_all("a", {"rel": "nofollow", "class": "external text"})
    try:
        # First four characters of the first category link (presumably something like "2018 films")
        category_year = int(soup.body.find("div", {"id": "mw-normal-catlinks"}).li.a.string[:4])
    except (AttributeError, TypeError, ValueError):
        # No category box, no link in it, or the link does not start with a year
        category_year = None

    if category_year is None:
        # Fall back on the page url itself, e.g. ".../wiki/Some_Title_(2018_film)".
        # The original tested `"2018" in url` (the result list), which could never be true;
        # NOTE(review): `url_` and `str(year)` look like what was intended - confirm
        if str(year) not in url_:
            return []
        candidates = _imdb_hrefs(mm, (), "---wikipedia error 3---")
    elif category_year == year:
        # Skip IMDb links that do not point to a title page
        candidates = _imdb_hrefs(mm, ("news", "name", "event", "calendar"), "---wikipedia error 2---")
    else:
        return "not this year"

    if not candidates:
        return []
    # The shortest link is taken as the movie's main IMDb page
    return min(candidates, key=len)


def _imdb_hrefs(anchors, banned, error_tag):
    """Returns the hrefs that contain "imdb" and none of the `banned` substrings; anchors without href are printed with `error_tag`"""
    hrefs = []
    for anchor in anchors:
        try:
            href = anchor["href"]
        except KeyError:
            print(anchor, error_tag)
            continue
        if "imdb" in href and not any(word in href for word in banned):
            hrefs.append(href)
    return hrefs

from multiprocessing import Pool
from functools import partial
def get_IMDb_urls_year_parallel(year, njobs=5):
    """
    Runs `get_IMDb_urls_year()` in parallel on every movie linked from the
    "<year>_in_film" Wikipedia page
    Returns a list of IMDb title urls, all ending with "/"
    year: integer
    njobs: integer, number of url requests at the same time
    """
    import re  # only needed here, to pull the "tt..." id out of non-standard links

    website_url = requests.get("https://en.wikipedia.org/wiki/" + str(year) + "_in_film").text
    soup = BeautifulSoup(website_url, "lxml")
    tabless = soup.body.find_all("table", {"class": "wikitable sortable"})
    # Keep the tables whose first header cell is "Opening"; tables without any header are skipped
    tables = [table for table in tabless if table.th is not None and table.th.string == 'Opening\n']
    urls = []
    for table in tables:
        for movie in table.find_all("i"):
            try:
                urls.append("https://en.wikipedia.org" + movie.a["href"])
            except (TypeError, KeyError):
                # Title in italics without a link (or a link without href): no Wikipedia page to visit
                print(movie, "---wikipedia error---")

    p = Pool(njobs)
    try:
        movies_urls_parallel = p.map(partial(get_IMDb_urls_year, year=year), urls)
    finally:
        # Release the workers even if one of the requests raised
        p.terminate()
        p.join()

    movies_urls = []
    for link in movies_urls_parallel:
        # Non-matches come back as "not this year" or an empty list
        if "imdb" not in link:
            continue
        if "www" not in link:
            # Rebuild the canonical url from the title id; fall back on the last 10 characters
            # (the previous behaviour) if no "tt<digits>" id can be found
            found = re.search(r"tt\d+", link)
            link = "https://www.imdb.com/title/" + (found.group(0) if found else link[-10:])
        if link[-1] != "/":
            link += "/"
        movies_urls.append(link)
    return movies_urls

This is going to need some filtering... it turns out that BigML [only selected](https://blog.bigml.com/2017/02/23/predicting-the-2017-oscar-winners/) the top 50 most popular movies each year. This doesn't really fit with the *oscars_EDA.ipynb* findings (second to last cell). Let's select the top 70 for 2018, and also the top 70 for 2017 (we only have 30 movies for 2017).

In [2]:
def multipro_get_top_70_links(year, njobs, top_n=70):
    """
    Parallel running of `get_top_70_links()`
    Returns the IMDb urls of the `top_n` most popular movies of a particular year
    year: integer
    njobs: integer, number of url requests to run at the same time
    top_n: integer, number of urls to return (70 by default)
    """
    # NOTE: the Wikipedia step runs with its own default number of jobs, not `njobs`
    links = get_IMDb_urls_year_parallel(year)
    # Runs 'get_top_70_links()' in parallel
    start = time.time()
    p = Pool(njobs)
    try:
        keys_values = p.map(get_top_70_links, links)
    finally:
        # Release the workers even if one of the requests raised
        p.terminate()
        p.join()
    # url -> popularity rank (a url listed twice is kept once)
    popu_dict = {link: rank for link, rank in keys_values}
    # Sorts by popularity (lower rank = more popular) and gets the top `top_n`
    top_links_popu = sorted(popu_dict.items(), key=operator.itemgetter(1))[:top_n]
    # Returns only the urls
    top_links = [link for link, _ in top_links_popu]
    print(time.time() - start)
    # Spoken notification that the run is over (`say` is a macOS command)
    os.system('say "the function has run"')
    return top_links
    
import os
import operator
def get_top_70_links(movie_url):
    """
    Returns the IMDb url of a movie together with its popularity rank
    movie_url: string, IMDb url of one movie
    Returns [movie_url, rank]; rank is 100000 when the page cannot be read or shows no popularity
    NOTE: Written to be used by multipro_get_top_70_links, if not, a loop needs to be made
    """
    not_ranked = 100000  # sorts after any real rank
    try:
        website_url = requests.get(movie_url).text
        soup = BeautifulSoup(website_url, "lxml")
        infoo = soup.body.find("div", {"id": "wrapper"}).find("div", {"id": "root", "class": "redesign"})
        info = infoo.find("div", {"class": "plot_summary_wrapper"}).find("div", {"class": "titleReviewBar"})
        metascoree = info.find("div", {"class": "titleReviewBarItem"}).span.string
    except Exception:
        # Best effort: a page that cannot be fetched or parsed is reported and ranked last
        print(movie_url, "---there was an error with the title---")
        return [movie_url, not_ranked]
    # The popularity item is one position further when the first item holds a metascore
    index = 0 if metascoree is None else 1
    try:
        popu = info.find_all("div", {"class": "titleReviewBarItem"})[index+1].find("div", {"class": "titleReviewBarSubItem"})
        # The rank is the second line of the span's markup, with "," as thousands separator
        m = str(popu.span)[str(popu.span).find("\n")+1:]
        l = m[:m.find("\n")].strip(" ")
        value = int(l.replace(",", ""))
    except Exception:
        # No popularity shown for this title
        value = not_ranked
    return [movie_url, value]

In [3]:
from imdb import IMDb
# Notebook-level IMDb client.
# NOTE(review): `get_data()` creates its own client, so this one looks unused in the cells shown here - confirm before removing
ia = IMDb()
import pandas as pd
# Award bodies tracked by `get_data()`, in the order their (won, nominated) pairs are returned
_AWARD_BODIES = (
    "Golden_Globes", "BAFTA", "Screen_Actors_Guild", "Critics_Choice", "Directors_Guild",
    "Producers_Guild", "Art_Directors_Guild", "Writers_Guild", "Costume_Designers_Guild",
    "Online_Film_Television_Association", "Online_Film_Critics_Society", "People_Choice",
    "London_Critics_Circle_Film", "American_Cinema_Editors", "Hollywood_Film",
    "Austin_Film_Critics_Association", "Denver_Film_Critics_Society",
    "Boston_Society_of_Film_Critics", "New_York_Film_Critics_Circle",
    "Los_Angeles_Film_Critics_Association",
)
# Column names, in the order `get_data()` returns the values
_LEADING_COLUMNS = ["year", "movie", "movie_id", "certificate", "duration", "genre", "rate", "metascore",
                    "synopsis", "votes", "gross", "release_date", "user_reviews", "critics_reviews",
                    "popularity"]
_AWARD_COLUMNS = [body + suffix for body in _AWARD_BODIES for suffix in ("_won", "_nominated")]
_RELEASE_DATE_COLUMNS = ["release_date.year", "release_date.month", "release_date.day-of-month",
                         "release_date.day-of-week"]


def get_data_parallel(inputs, njobs, years=None, title=False):
    """
    Gets the data to re-train our models and to predict this year's Oscars
    Runs `get_data()` in parallel and gathers the results in a pandas DataFrame
    inputs: list of urls/titles to get the data for
    njobs: integer, number of movies to process at the same time
    years: list with release year of each movie (only needed for title=True)
    title: =False, use urls as input
    title: =True, use titles as input

    Returns a DataFrame indexed by "year", with one row per input (same order) and the columns:
    the movie features, "awards_wins", "awards_nominations", one "_won"/"_nominated" pair
    per award body and the "release_date.*" columns
    Raises ValueError if title=True and `years` is missing or does not match `inputs` in length
    """
    # Runs `get_data()` in parallel
    if title:
        if years is None or len(years) != len(inputs):
            raise ValueError("title=True needs `years` with one release year per title")
        worker = partial(get_data, title=True)
        jobs = [[inputs[i], years[i]] for i in range(len(inputs))]
    else:
        worker = get_data
        jobs = inputs
    p = Pool(njobs)
    try:
        results = p.map(worker, jobs)
    finally:
        # Release the workers even if one of the movies raised
        p.terminate()
        p.join()

    # Turns the data into a pandas DataFrame: one column per position of the tuples returned by `get_data()`
    all_columns = _LEADING_COLUMNS + _AWARD_COLUMNS + _RELEASE_DATE_COLUMNS
    movies = pd.DataFrame({name: [row[position] for row in results]
                           for position, name in enumerate(all_columns)},
                          columns=all_columns)
    movies = movies.set_index("year")

    # Totals across all the award bodies
    nominated_columns = [name for name in _AWARD_COLUMNS if name.endswith("_nominated")]
    won_columns = [name for name in _AWARD_COLUMNS if name.endswith("_won")]
    movies["awards_nominations"] = movies[nominated_columns].sum(axis=1)
    movies["awards_wins"] = movies[won_columns].sum(axis=1)

    # Final order: movie features, totals, per-award counts, release date parts
    ordered = _LEADING_COLUMNS[1:] + ["awards_wins", "awards_nominations"] + _AWARD_COLUMNS + _RELEASE_DATE_COLUMNS
    return movies[ordered]
    
def get_gross(c):
    """
    Returns the first dollar amount found in `c`, as an integer
    c: anything whose `str()` contains an amount such as "$700,059,566" (e.g. a BeautifulSoup tag)
    Raises ValueError if there is no dollar amount
    """
    import re  # only needed here

    # "$", optional spaces, then digits with "," as thousands separator; whatever follows
    # the amount (comma, spaces, newline, tag) no longer matters
    found = re.search(r"\$\s*(\d[\d,]*)", str(c))
    if found is None:
        raise ValueError("no dollar amount found")
    return int(found.group(1).replace(",", ""))
def won_nom_count(awards_dict, award_title):
    """
    Returns the (wins, nominations) pair stored for one award
    awards_dict: dictionary, award title -> {"win": ..., "nom": ...}
    award_title: string, key to look up in `awards_dict`
    """
    entry = awards_dict[award_title]
    return entry["win"], entry["nom"]

# Award titles as they appear in the markup of IMDb's awards page, in the order
# their (won, nominated) pairs are placed in the tuple returned by `get_data()`
_IMDB_AWARD_TITLES = (
    "Golden Globes, USA",
    "BAFTA Awards",
    "Screen Actors Guild Awards",
    "Broadcast Film Critics Association Awards",  # Critics Choice
    "Directors Guild of America, USA",
    "PGA Awards",  # Producers Guild
    "Art Directors Guild",
    "Writers Guild of America, USA",
    "Costume Designers Guild Awards",
    "Online Film &amp; Television Association",  # "&amp;": the title is read from the raw markup
    "Online Film Critics Society Awards",
    "People's Choice Awards, USA",
    "London Critics Circle Film Awards",
    "American Cinema Editors, USA",
    "Hollywood Film Awards",
    "Austin Film Critics Association",
    "Denver Film Critics Society",
    "Boston Society of Film Critics Awards",
    "New York Film Critics Circle Awards",
    "Los Angeles Film Critics Association Awards",
)
_MONTH_NUMBERS = {"Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
                  "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12}


def _value_or_unknown(getter):
    """Returns `getter()`, or "?" if it raises"""
    try:
        return getter()
    except Exception:
        return "?"


def _second_line(markup):
    """Returns the text between the first and the second line break of `markup`, without surrounding spaces"""
    rest = markup[markup.find("\n")+1:]
    return rest[:rest.find("\n")].strip(" ")


def get_data(input_, years=None, title=False):
    """
    Gets the data of one movie, using the IMDb Python API and scraping its IMDb pages
    input_: IMDb url of the movie ending with "/" (title=False), or a [title, year] pair (title=True)
    years: not used, kept so that existing calls keep working
    title: =False, `input_` is a url
    title: =True, `input_` is a [title, year] pair

    Returns a tuple of 59 values, in the column order used by `get_data_parallel()`:
    15 movie features, a (won, nominated) pair for each of the 20 award bodies and
    4 release date parts. Features that cannot be read are "?", award counts default to 0,
    and synopsis / release_date / release year / day fields are always left blank.
    Raises LookupError if title=True and the IMDb search returns nothing. If the main IMDb
    page does not have the expected layout, the resulting error is not caught.
    """
    # Using the Python API
    ia = IMDb()
    if title:
        wanted_title, wanted_year = input_[0], input_[1]
        movie_info = None
        for result in ia.search_movie(wanted_title):
            movie_info = ia.get_movie(result.movieID)
            try:
                if movie_info["year"] == wanted_year:
                    break
            except Exception:
                # This search result could not be compared (e.g. it has no "year")
                print(wanted_title, wanted_year, "---imdb error---")
        # NOTE(review): when no result matches the year, the last search result is used - confirm this is wanted
        if movie_info is None:
            raise LookupError("no IMDb search result for %r" % (wanted_title,))
    else:
        # Digits between "/tt" and the final "/" of the url
        movie_info = ia.get_movie(input_[input_.find("/tt")+3:-1])

    year = _value_or_unknown(lambda: movie_info["year"])
    movie = _value_or_unknown(lambda: movie_info["title"])
    movie_id = _value_or_unknown(lambda: "tt" + str(movie_info.movieID))
    # Last certificate without its first 14 characters (presumably a country prefix - TODO confirm)
    certificate = _value_or_unknown(lambda: movie_info["certificates"][-1][14:])
    duration = _value_or_unknown(lambda: int(movie_info["runtimes"][0]))
    # When a movie has more than one genre (the usual) they are joined with "|"
    genre = _value_or_unknown(lambda: "|".join(movie_info["genre"]))
    rate = _value_or_unknown(lambda: movie_info["rating"])
    synopsis = ""
    votes = _value_or_unknown(lambda: movie_info["votes"])
    release_date = ""
    release_date_year = ""
    # Second space-separated token of the air date is read as the short month name;
    # anything that is not a month gives "?" (it used to be silently counted as October)
    release_date_month = _value_or_unknown(
        lambda: _MONTH_NUMBERS.get(movie_info["original air date"].split(" ")[1], "?"))
    release_date_day_of_month = ""
    release_date_day_of_week = ""

    # Web-Scraping
    movie_url = "https://www.imdb.com/title/" + movie_id
    website_url = requests.get(movie_url).text
    soup = BeautifulSoup(website_url, "lxml")
    infoo = soup.body.find("div", {"id": "wrapper"}).find("div", {"id": "root", "class": "redesign"})
    info = infoo.find("div", {"class": "plot_summary_wrapper"}).find("div", {"class": "titleReviewBar"})

    # Metascore
    try:
        metascoree = info.find("div", {"class": "titleReviewBarItem"}).span.string
    except Exception:
        metascoree = None
    if metascoree is None:
        metascore = "?"
        index = 0
    else:
        metascore = int(metascoree)
        index = 1  # the popularity item is one position further when there is a metascore

    # Gross
    infoo_2 = infoo.find("div", {"id": "pagecontent", "class": "pagecontent"})
    infoo_2 = infoo_2.find("div", {"id": "content-2-wide", "class": "flatland"})
    infoo_2 = infoo_2.find("div", {"id": "main_bottom", "class": "main"})
    grosss = infoo_2.find("div", {"class": "article", "id": "titleDetails"})
    gross = "?"  # stays "?" when the page has no readable "Gross USA:" entry
    for block in grosss.find_all("div", {"class": "txt-block"}):
        try:
            # By inspection, the data we have seems to be for the gross in USA
            if block.h4.string == "Gross USA:":
                gross = int(get_gross(block))
        except Exception:
            # Blocks without a heading, or with an unreadable amount, are skipped
            continue

    # Reviews
    try:
        reviewss = info.find("div", {"class": "titleReviewBarItem titleReviewbarItemBorder"}).find_all("div")[1].span
        review_links = reviewss.find_all("a")
        # The link texts end with a 5 / 6 character suffix after the number (presumably " user" / " critic")
        user_reviews = int(review_links[0].string[:-5].replace(",", ""))
        critics_reviews = int(review_links[1].string[:-6].replace(",", ""))
    except Exception:
        user_reviews = "?"
        critics_reviews = "?"

    # Popularity
    try:
        popu = info.find_all("div", {"class": "titleReviewBarItem"})[index+1].find("div", {"class": "titleReviewBarSubItem"})
        popularity = int(_second_line(str(popu.span)).replace(",", ""))
    except Exception:
        popularity = "?"

    # Awards info
    awards = infoo_2.find("div", {"id": "titleAwardsRanks", "class": "article highlighted"})
    awards_dict = {}  # award title -> (wins, nominations)
    try:
        awards = awards.find("span", {"class": "see-more inline"})
        website_url = requests.get("https://www.imdb.com" + awards.a["href"]).text
        soup_awards = BeautifulSoup(website_url, "lxml")
        listing = soup_awards.body.find("div", {"class": "article listo"})
        awards_titles = listing.find_all("h3")[1:]
        awards_tables = listing.find_all("table")
        for heading, table in zip(awards_titles, awards_tables):
            award_title = _second_line(str(heading))
            # A title that shows up again adds to its own counts
            win, nom = awards_dict.get(award_title, (0, 0))
            for outcome in table.find_all("td", {"class": "title_award_outcome"}):
                is_winner = outcome.b.string == "Winner"
                rows = int(outcome["rowspan"])
                nom += rows  # a win also counts as a nomination
                if is_winner:
                    win += rows
            awards_dict[award_title] = (win, nom)
    except Exception:
        # Best effort: movies without an awards box (or with an unexpected awards page)
        # keep whatever was counted so far, 0 for the rest
        pass

    award_counts = []
    for award_title in _IMDB_AWARD_TITLES:
        award_counts.extend(awards_dict.get(award_title, (0, 0)))

    return ((year, movie, movie_id, certificate, duration, genre, rate, metascore, synopsis, votes, gross,
             release_date, user_reviews, critics_reviews, popularity)
            + tuple(award_counts)
            + (release_date_year, release_date_month, release_date_day_of_month, release_date_day_of_week))

Let's get the data for this year (will need updating)

In [4]:
# Collects the 2018 links with njobs=10 and shows the elapsed seconds as the cell output
import time
start = time.time() 
top_70_links_2018 = multipro_get_top_70_links(2018, 10)
time.time() - start

<i>La Boda de Valentina</i> ---wikipedia error---
<i>Followers</i> ---wikipedia error---
<i>Pandas</i> ---wikipedia error---
<i>Gauguin - Voyage de Tahiti</i> ---wikipedia error---
<i>My Son</i> ---wikipedia error---
<i>Power of the Air</i> ---wikipedia error---
<i>Bullitt County</i> ---wikipedia error---
<i>Conundrum: Secrets Among Friends</i> ---wikipedia error---
https://www.imdb.com/title/tt7528086/ ---there was an error with the title---
15.814354181289673


39.08746910095215

TODO: Check if these movies are relevant... probably not

In [5]:
# Gets the data for the 2018 links collected above and shows the elapsed seconds as the cell output
start = time.time()
data_2018 = get_data_parallel(top_70_links_2018, njobs = 10)
time.time() - start 

21.42363214492798

In [6]:
# Saves the 2018 data; the index written as first column is the release year set by `get_data_parallel()`
data_2018.to_csv("to_be_used_on_trained_model.csv") 

The Oscars categories for this year will need to be updated manually (it won't take too long using a spreadsheet)... TODO!

Let's update the data we originally had

In [7]:
# Re-fetches, by title and year, every movie of the original dataset and shows the elapsed seconds as the cell output
import time
start = time.time()
previous_data = pd.read_csv("oscars.csv")
movies_titles = list(previous_data["movie"].values)
movies_years = list(previous_data["year"].values)
updated_data = get_data_parallel(movies_titles, 10, movies_years, title=True)
# The Oscar data is copied from the original file, because I see no way of webscraping it... :/
# NOTE(review): columns 17 to 33 of oscars.csv are presumably the Oscar columns - confirm against the file
oscars_features = list(previous_data.columns[17:34])
for i in oscars_features:
    # `.values` copies by position: this relies on `updated_data` having its rows in the same order as `previous_data`
    updated_data[i] = previous_data[i].values
# `get_data_parallel()` returns 60 columns (14 features + 2 award totals + 40 award counts + 4 release date parts),
# so [60:] are the Oscar columns just added; they are moved right after the award totals
first = list(updated_data.columns)[:16]
second = list(updated_data.columns)[60:]
third = list(updated_data.columns)[16:60] 
updated_data = updated_data[first + second + third]
updated_data.to_csv("train_data-up_to_date.csv")
time.time() - start

The Cabin in the Woods 2012 ---imdb error---


447.31055188179016

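Only one error: the message is printed when one of the IMDb search results for this title has no `year` field to compare against, so there may be a discrepancy regarding which year this movie was released. Editing the original spreadsheet would be a quick fix though (TODO).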

We need to fill in the 2017 data (only 30 movies vs. the 60-70 we have for the other years) (TODO!)