In [1]:
from requests import get
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

from numpy import array
from numpy import argmax
import keras
from keras.utils import to_categorical

Using TensorFlow backend.


Following code gets data for a selected number of movies released between 1934-01-01 and 2017-12-31, ordered by popularity (ordered by IMDB)

In [20]:
actorAwards = ['Actor', 'Actor in a Supporting Role', 'Actress', 'Actress in a Supporting Role', 'Actor in a Leading Role', 
               'Actress in a Leading Role']
#Awards that can be received directly by an actor/actress

directorAwards = ['Directing'] #For this dataset that name of the director is in the 'Film' category
#Awards that can be received directly by a director

movieAwards = ['Best Motion Picture', 'Outstanding Motion Picture', 'Documentary', 'Documentary (Feature)', 
               'Documentary (Short Subject)', 'Short Subject (Cartoon)', 'Short Subject (One Reel)',
               'Short Film (Animated)', 'Short Film (Live Action)', 'Best Picture', 'Animated Feature Film'] #Movie name is in the 'name' category
#Awards that directors and actors can directly influence

#Awards like writing, screenplay, production, etc are saved under other awards as the actor and director do not
#influence them as directly (or at all) as the ones above

OscarRead = pd.read_csv('Oscardatabase.csv') #read the database
yearCSV = OscarRead["Year"] #The year the movie was released
awardCSV = OscarRead["Award"] #The name of the award received/nominated for
winnerCSV = OscarRead["Winner"] #If the entry won or not

resultsCSV = winnerCSV.copy() #Create a copy to edit
resultsCSV[resultsCSV != 1] = 0 #Original CSV has a '1' for winner, and NaN for a nomination - convert NaN to '0'

nameCSV = OscarRead["Name"] 
filmCSV = OscarRead["Film"]

win = 1
nomination = 0

def getDict(awards, result = win, nameOrFilm = nameCSV, otherAwards = False):
    """awards = list containing awards interested in.
       Returns a new nested dictionary where the outside key is 
       a year, and the inside key is the name of actor/film/director
       who received a nomination or won an award in the list
       for the years 1935-2016"""
    
    Dict = {}
    for i in range(len(yearCSV)):
        if otherAwards == False and awardCSV[i] not in awards or resultsCSV[i] != result:
            continue
        elif otherAwards == True and awardCSV[i] in awards or resultsCSV[i] != result:
            continue
        try:
            try:
                Dict[yearCSV[i] + 1][nameOrFilm[i]] += 1 #If the actor/director/movie wins multiple awards in a year
            except:
                Dict[yearCSV[i] + 1][nameOrFilm[i]] = 1 #Set the value for actor to one if the year is already in the dictionary
        except:
            Dict[yearCSV[i] + 1] = {} #if year is not in dictionary yet then create it
            Dict[yearCSV[i] + 1][nameOrFilm[i]] = 1 #set the value to one
            #Above code uses year + 1 for the key as the original database has the release year of the movie stored, 
            #not the year of the oscars
    return Dict
#Following dictionaries have data for 1935-2016
#If a movie/actor/director wins an award, it is only saved as a winner and will not have an entry for the nomination
#it received for that award
OscarActorWinners = getDict(actorAwards)
OscarActorNoms = getDict(actorAwards, nomination)
OscarDirWinners = getDict(directorAwards, nameOrFilm = filmCSV)
OscarDirNoms = getDict(directorAwards, nomination, filmCSV)
movieWinners = getDict(movieAwards)
movieNoms = getDict(movieAwards, nomination)
otherAwardWinners = getDict(actorAwards + movieAwards + directorAwards, otherAwards = True)
otherAwardNoms = getDict(actorAwards + movieAwards + directorAwards, nomination, otherAwards = True)

"""I tried adding data for 2017 and 2018 to the CSV above but it broke the code. The following code simply
reads from a new CSV containing this data and adds to the dictionaries above to make the total range 1935-2018"""

OscarRead2 = pd.read_csv('OscarDataContd.csv', encoding = 'utf-8')
yearCSV2 = OscarRead2["Year"] 
awardCSV2 = OscarRead2["Award"]
winnerCSV2 = OscarRead2["Winner"]

resultsCSV2 = winnerCSV2.copy()
resultsCSV2[resultsCSV2 != 1] = 0

nameCSV2 = OscarRead2["Name"]
filmCSV2 = OscarRead2["Film"]

def updateDict(awards, Dict, result = win, nameOrFilm = nameCSV2, otherAwards = False):
    """Essentially the same as getDict above however this updates a dictionary
       that is passed to the function"""
    
    for i in range(len(yearCSV2)):
        if otherAwards == False and awardCSV2[i] not in awards or resultsCSV2[i] != result:
            continue
        elif otherAwards == True and awardCSV2[i] in awards or resultsCSV2[i] != result:
            continue
        try:
            try:
                Dict[yearCSV2[i] + 1][nameOrFilm[i]] += 1 #If the actor/director/movie wins multiple awards in a year
            except:
                Dict[yearCSV2[i] + 1][nameOrFilm[i]] = 1 #Set the value for actor to one if the year is already in the dictionary
        except:
            Dict[yearCSV2[i] + 1] = {} #if year is not in dictionary yet then create it
            Dict[yearCSV2[i] + 1][nameOrFilm[i]] = 1 #set the value to one

#Update dictionaries to contain data for 1935-2018
updateDict(actorAwards, OscarActorWinners)
updateDict(actorAwards, OscarActorNoms, nomination)
updateDict(directorAwards, OscarDirWinners, nameOrFilm = filmCSV2)
updateDict(directorAwards, OscarDirNoms, nomination, nameOrFilm = filmCSV2)
updateDict(movieAwards, movieWinners)
updateDict(movieAwards, movieNoms, nomination)
updateDict(actorAwards + movieAwards + directorAwards, otherAwardWinners, otherAwards = True)
updateDict(actorAwards + movieAwards + directorAwards, otherAwardNoms, nomination, otherAwards = True)

print("1950 oscar winners (acting) are:", OscarActorWinners[1950])
print("1950 oscar nominees (acting) are:", OscarActorNoms[1950])
print("1950 oscar winners (directing) are:", OscarDirWinners[1950])
print("1950 oscar nominees (directing) are:", OscarDirNoms[1950])
print("1950 oscar winners (movie) are:", movieWinners[1950])
print("1950 oscar nominees (movie) are:", movieNoms[1950])
print("1950 oscar winners (other) are:", otherAwardWinners[1950])
print("1950 oscar nominees (other) are:", otherAwardNoms[1950])

otherAwardNoms[2018]

1950 oscar winners (acting) are: {'Broderick Crawford': 1, 'Dean Jagger': 1, 'Olivia de Havilland': 1, 'Mercedes McCambridge': 1}
1950 oscar nominees (acting) are: {'Kirk Douglas': 1, 'Gregory Peck': 1, 'Richard Todd': 1, 'John Wayne': 1, 'John Ireland': 1, 'Arthur Kennedy': 1, 'Ralph Richardson': 1, 'James Whitmore': 1, 'Jeanne Crain': 1, 'Susan Hayward': 1, 'Deborah Kerr': 1, 'Loretta Young': 1, 'Ethel Barrymore': 1, 'Celeste Holm': 1, 'Elsa Lanchester': 1, 'Ethel Waters': 1}
1950 oscar winners (directing) are: {'Joseph L. Mankiewicz': 1}
1950 oscar nominees (directing) are: {'Robert Rossen': 1, 'William A. Wellman': 1, 'Carol Reed': 1, 'William Wyler': 1}
1950 oscar winners (movie) are: {'Daybreak in Udi ': 1, 'A Chance to Live ': 1, 'So Much for So Little ': 1, "All the King's Men ": 1, 'For Scent-Imental Reasons ': 1, 'Aquatic House-Party ': 1}
1950 oscar nominees (movie) are: {'Kenji Comes Home ': 1, '1848': 1, 'The Rising Tide ': 1, 'Battleground ': 1, 'The Heiress ': 1, 'A Lett

{'Mudbound': 3,
 'Call Me by Your Name': 1,
 'Marshall': 1,
 'The Greatest Showman': 1,
 'Star Wars: The Last Jedi': 4,
 'Dunkirk': 3,
 'Phantom Thread': 1,
 'Three Billboards Outside Ebbing, Missouri': 3,
 'Darkest Hour': 3,
 'The Shape of Water': 6,
 'The Big Sick': 1,
 'Lady Bird': 1,
 'The Disaster Artist': 1,
 'Logan': 1,
 "Molly's Game": 1,
 'DeKalb Elementary': 1,
 "The Eleven O'Clock": 1,
 'My Nephew Emmett': 1,
 'Watu Wote/All of Us': 1,
 'Baby Driver': 3,
 'I, Tonya': 1,
 'Guardians of the Galaxy Vol. 2': 1,
 'Kong: Skull Island': 1,
 'War for the Planet of the Apes': 1,
 'Garden Party': 1,
 'Lou': 1,
 'Negative Space': 1,
 'Revolting Rhymes': 1,
 'The Insult': 1,
 'Loveless': 1,
 'On Body and Soul': 1,
 'The Square': 1,
 'Beauty and the Beast': 2,
 'Blade Runner 2049': 3,
 'Victoria and Abdul': 2,
 'Wonder': 1}

In [21]:
startYear = 1934 + 1
endYear = 2018
def getRunningTotal(Dict):
    """Returns a dictionary that contains the amount
    of awards a actor/director has received at a given time"""
    totalDict = {}
    totalDict[startYear] = Dict[startYear]
            
    for year in range(startYear + 1, endYear + 1):
        totalDict[year] = {}
        totalDict[year].update(totalDict[year - 1]) #update dictionary with previous year to keep a running total each year
    
        for key, value in Dict[year].items():
            try:
                totalDict[year][key] += 1 #if the actor is already present increment by 1
            except:
                totalDict[year][key] = 1 #set it to 1 if not already present
    return totalDict

OscarActorWinnersTot = getRunningTotal(OscarActorWinners)
OscarActorNomsTot = getRunningTotal(OscarActorNoms)
OscarDirWinnersTot = getRunningTotal(OscarDirWinners)
OscarDirNomsTot = getRunningTotal(OscarDirNoms)

print(OscarActorWinnersTot[1953]['Anthony Quinn'])
print(OscarActorWinnersTot[1957]['Anthony Quinn']) #Anthony won an Oscar in 1953 and 1957

1
2


In [22]:
for key, value in OscarDirWinnersTot[2015].items(): #Directors that won multiple awards (just for fun lol)
    if value >= 2:
        print(key)
        print(value)
        
for key, value in OscarActorWinnersTot[2015].items(): #Actors that won multiple awards (just for fun lol)
    if value >= 3:
        print(key)
        print(value)

Frank Capra
3
John Ford
4
Leo McCarey
2
William Wyler
3
Billy Wilder
2
Elia Kazan
2
Joseph L. Mankiewicz
2
George Stevens
2
Fred Zinnemann
2
David Lean
2
Milos Forman
2
Oliver Stone
2
Clint Eastwood
2
Steven Spielberg
2
Ang Lee
2
Walter Brennan
3
Ingrid Bergman
3
Katharine Hepburn
3
Jack Nicholson
3
Meryl Streep
3


In [17]:
urlArray = []

numMovies = 1000 #numMovies must be less than 300,908, numMovies % 50 must be 0

for i in range(1, numMovies - 48, 50): #create an array of urls to iterate through
    urlArray.append('https://www.imdb.com/search/title?title_type=feature&release_date=1950-01-01,2017-12-31&start='
                    + str(i) + '&ref_=adv_nxt')

In [18]:
def cleanData(year_, genre, runtime, box_office):
    year1 = ''.join(c for c in year_ if c.isdigit()) #remove non-numeric characters from year
    
    genre1 = ''.join(c for c in genre if (c != '\n')) #original data has '\n' scattered around... remove those
    
    runtime1 = ''.join(c for c in runtime if c.isdigit()) #remove non-numeric characters from runtime
    
    box = ''.join(c for c in box_office if (c.isdigit() or c == '.')) #remove non-numeric characters, but keep the decimal
    box = round(float(box) * (10**6)) #original is in millions
    
    return int(year1), genre1, int(runtime1), box

In [19]:
def normalize(runtime, gross):
    nor_runtime = np.divide(runtime,max(runtime))
    nor_gross = np.divide(gross,max(gross))
    return nor_runtime, nor_gross

In [23]:
CPIdict = {}
CPIread = pd.read_csv('CPI.csv') #CPI.csv contains the multiplication factor to get from a given year to 2018 to adjust
                                 #for inflation
for i in range(len(CPIread["Growth"] + 1)):
     CPIdict[CPIread["Year"][i]] = CPIread["To 2018"][i]
print("Movies released in 1990 have a multiplier of", CPIdict[1990], "to their gross to adjust for inflation")

Movies released in 1990 have a multiplier of 1.92105968 to their gross to adjust for inflation


In [24]:
def inflationAdjust(year, box_office):
    return int(round(box_office * CPIdict[year])) #multiplies the gross value by the multiplication factor for the year
                                                  #it was released

In [25]:
def getOscarNum(actor1, actor2, actor3, actor4, actor5, movieName, releaseYear):
    stars = []
    stars.extend([actor1, actor2, actor3, actor4, actor5])
    starWins = starNoms = movieWins = movieNoms = otherWins = otherNoms = 0
    oscarYear = releaseYear + 1
    for star in stars:
        if star != '': #Checks how many oscar nominations and wins the directors and actors have prior to this movie
            starWins += checkDict(star, OscarDirWinnersTot, releaseYear) #
            starWins += checkDict(star, OscarActorWinnersTot, releaseYear)
            starNoms += checkDict(star, OscarDirNomsTot, releaseYear)
            starNoms += checkDict(star, OscarActorNomsTot, releaseYear)
    #Checks num of nominations/wins this movie received
    movieWins += checkDict(movieName, movieWinners, oscarYear)
    movieNoms += checkDict(movieName, movieNoms, oscarYear)
    otherWins += checkDict(movieName, otherAwardWinners, oscarYear)
    otherNoms += checkDict(movieName, otherAwardNoms, oscarYear)
    return starWins, starNoms, movieWins, movieNoms, otherWins, otherNoms
        
def checkDict(name, Dict, year):
    num = 0
    try:
        num += Dict[year][name]       
    except:
        pass
    return num

In [26]:
names = []
year = []
genres = []
runtimes = []
metascores = []
imdbRatings = []
audienceRating = []
movieGross = []
directors =[]
movieGrossInflation = []
Star1 = []
Star2 = []
Star3 = []
Star4 = []
Star5 = []
res = []

missingData = 0

for url in urlArray:
    response = get(url)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    movie_cont = html_soup.find_all('div', class_ = 'lister-item mode-advanced')
    
    for movie in movie_cont:
        name = movie.h3.a.text
        try:
            year_ = movie.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
            genre = movie.p.find('span', class_ = 'genre').text
            runtime = movie.p.find('span', class_ = 'runtime').text
            viewer_rating = movie.p.span.text
            m_score = movie.find('span', class_ = 'metascore').text
            imdb_rating = float(movie.strong.text)
            box_office = movie.find_all('span', attrs = {'name': 'nv'})[1].text
            people = movie.find_all('p', class_ = '')[1]
            people = people.find_all('a')
            director = people[0].text
            actor1 = people[1].text
            
            #This try/except block is just in case lesser known movies are missing actors (might happen when
            #numMovies is set to like 10000, and I don't think it's a big deal as long as we have one actor + director)
            try:
                actor2 = people[2].text
            except:
                actor2 = ''
            try:
                actor3 = people[3].text
            except:
                actor3 = ''
            try:
                actor4 = people[4].text
            except:
                actor4 = ''
            try:
                actor5 = people[5].text
            except:
                actor5 =''
            
            year_, genre, runtime, box_office = cleanData(year_, genre, runtime, box_office) #clean up data
            
            box_office_inflation = inflationAdjust(year_, box_office) #adjust for inflation
            
            numStarWins, numStarNoms, numMovieWins, numMovieNoms, numOtherWins, numOtherNoms = getOscarNum(actor1, actor2, actor3, actor4, actor5, name, year_)
                
            res.append((name, year_, genre, runtime, director, actor1, actor2, actor3, actor4, actor5,
                        viewer_rating, m_score, imdb_rating, box_office, box_office_inflation, numStarWins, 
                        numStarNoms, numMovieWins, numMovieNoms, numOtherWins, numOtherNoms))
        except:
            print("Missing data for:", name)
            missingData += 1
            
(names, year, genres, runtimes, directors, Star1, Star2, Star3, Star4, Star5, audienceRatings, metascores, 
imdbRatings, movieGross, movieGrossInflation, starWinsList, starNomsList, movieWinsList, movieNomsList, 
otherWinsList, otherNomsList) = zip(*res)


runtime_normal, box_normal = normalize(runtimes, movieGrossInflation)#Normalize the number data

Missing data for: Love
Missing data for: 8 Seconds
Missing data for: I'm Not Here
Missing data for: The Sandlot
Missing data for: A Clockwork Orange
Missing data for: Gone with the Wind
Missing data for: Bye Bye Birdie
Missing data for: Sabrina
Missing data for: Exposed
Missing data for: Gerald's Game
Missing data for: Contratiempo
Missing data for: Them!
Missing data for: The Wizard of Oz
Missing data for: Bright
Missing data for: Forbidden Planet
Missing data for: The Room
Missing data for: Once Upon a Time in America
Missing data for: Salò o le 120 giornate di Sodoma
Missing data for: Mine
Missing data for: The Autopsy of Jane Doe
Missing data for: The Detained
Missing data for: A Star Is Born
Missing data for: The Heiress
Missing data for: Death Proof
Missing data for: Caligola
Missing data for: The Black Cat
Missing data for: What Happened to Monday
Missing data for: Street Fighter
Missing data for: Shot Caller
Missing data for: Casablanca
Missing data for: Citizen Kane
Missing da

In [27]:
print("Number of movies missing data:", missingData)
print("Total movies with data:", numMovies - missingData)

Number of movies missing data: 67
Total movies with data: 933


This part below takes in the genre list, it creates a one-hot list to make it numerical for regression analysis. I found that there are 191 different combinations of genres (for movies with multiple genres, it was found that the list of them is alphabetical).

In [28]:
genres_encoded = []
genres = [i.replace(" ","") for i in genres]
diff_genres = list(dict.fromkeys(genres))
char_to_int = dict((c, i) for i, c in enumerate(diff_genres))
int_to_char = dict((i, c) for i, c in enumerate(diff_genres))
print(len(char_to_int))
print(len(diff_genres))

integer_encoded = [char_to_int[char] for char in diff_genres]

encoded = list()
for value in integer_encoded:
    letter = [0 for _ in range(len(diff_genres))]
    letter[value] = 1
    encoded.append(letter)

for j in genres:
    encoded_value = char_to_int[j]
    genres_encoded.append(encoded[encoded_value])
print(len(genres_encoded))

191
191
933


In [29]:
df = pd.DataFrame({'Name': names,
                  'Year': year,
                  'Genre': genres,
                  'Genres One-Hot Encoded': genres_encoded,
                  'Director': directors,
                  'Star1' : Star1,
                  'Star2' : Star2,
                  'Star3' : Star3,
                  'Star4' : Star4,
                  'Star5' :Star5,
                  'Runtime': runtimes,
                  'Rating': audienceRatings,
                  'Metascore': metascores,
                  'IMDB rating': imdbRatings,
                  'Gross': movieGross,
                  'Inflation adjusted':movieGrossInflation,
                  'Runtime Normalized': runtime_normal,
                  'Box Office Adj. Norm.': box_normal,
                  'numStarW' : starWinsList,
                  'numStarN' : starNomsList,
                  'numMovieW' : movieWinsList,
                  'numMovieN' : movieNomsList,
                  'numOtherW' : otherWinsList,
                  'numOtherN' : otherNomsList})

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 933 entries, 0 to 932
Data columns (total 24 columns):
Name                      933 non-null object
Year                      933 non-null int64
Genre                     933 non-null object
Genres One-Hot Encoded    933 non-null object
Director                  933 non-null object
Star1                     933 non-null object
Star2                     933 non-null object
Star3                     933 non-null object
Star4                     933 non-null object
Star5                     933 non-null object
Runtime                   933 non-null int64
Rating                    933 non-null object
Metascore                 933 non-null object
IMDB rating               933 non-null float64
Gross                     933 non-null int64
Inflation adjusted        933 non-null int64
Runtime Normalized        933 non-null float64
Box Office Adj. Norm.     933 non-null float64
numStarW                  933 non-null int64
numStarN               

In [31]:
df.head(numMovies)

Unnamed: 0,Name,Year,Genre,Genres One-Hot Encoded,Director,Star1,Star2,Star3,Star4,Star5,...,Gross,Inflation adjusted,Runtime Normalized,Box Office Adj. Norm.,numStarW,numStarN,numMovieW,numMovieN,numOtherW,numOtherN
0,The Fifth Element,1997,"Action,Adventure,Sci-Fi","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Luc Besson,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm,,...,63540000,99358504,0.623762,0.074337,0,1,0,0,0,0
1,Get Out,2017,"Horror,Mystery,Thriller","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Jordan Peele,Daniel Kaluuya,Allison Williams,Bradley Whitford,Catherine Keener,,...,176040000,180264960,0.514851,0.134869,0,2,0,0,1,0
2,The Greatest Showman,2017,"Biography,Drama,Musical","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Michael Gracey,Hugh Jackman,Michelle Williams,Zac Efron,Zendaya,,...,174340000,178524160,0.519802,0.133567,0,5,0,0,0,1
3,Suicide Squad,2016,"Action,Adventure,Fantasy","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",David Ayer,Will Smith,Jared Leto,Margot Robbie,Viola Davis,,...,325100000,339993587,0.608911,0.254374,1,4,0,0,1,0
4,Room,2015,"Drama,Thriller","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Lenny Abrahamson,Brie Larson,Jacob Tremblay,Sean Bridgers,Wendy Crewson,,...,14680000,15546210,0.584158,0.011631,0,0,0,0,0,1
5,Buffy the Vampire Slayer,1992,"Action,Comedy,Fantasy","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Fran Rubel Kuzui,Kristy Swanson,Donald Sutherland,Paul Reubens,Rutger Hauer,,...,16620000,29730330,0.425743,0.022243,0,0,0,0,0,0
6,Thor: Ragnarok,2017,"Action,Adventure,Comedy","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Mark Ruffalo,,...,315060000,322621440,0.643564,0.241376,2,7,0,0,0,0
7,Wonder Woman,2017,"Action,Adventure,Fantasy","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Patty Jenkins,Gal Gadot,Chris Pine,Robin Wright,Lucy Davis,,...,412560000,422461440,0.698020,0.316074,0,0,0,0,0,0
8,Scott Pilgrim vs. the World,2010,"Action,Comedy,Fantasy","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Edgar Wright,Michael Cera,Mary Elizabeth Winstead,Kieran Culkin,Anna Kendrick,,...,31490000,36247961,0.554455,0.027120,0,1,0,0,0,0
9,Guardians of the Galaxy,2014,"Action,Adventure,Comedy","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",James Gunn,Chris Pratt,Vin Diesel,Bradley Cooper,Zoe Saldana,,...,333180000,353258221,0.599010,0.264298,0,2,0,0,0,0
