In [1]:
from requests import get
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

The following code collects data for a selected number of feature films released between 1950-01-01 and 2017-12-31, in the popularity order used by IMDb.

In [2]:
numMovies = 1000  # numMovies must be less than 276,125, and numMovies % 50 must be 0

# One search URL per step of 50 results; the `start` query value runs 1, 51, 101, ...
urlArray = [
    'https://www.imdb.com/search/title?title_type=feature&release_date=1950-01-01,2017-12-31&start='
    + str(firstResult) + '&ref_=adv_nxt'
    for firstResult in range(1, numMovies - 48, 50)
]

In [3]:
def cleanData(year_, genre, runtime, box_office):
    """Turn the raw strings scraped for one movie into typed values.

    year_      -- text containing the release year; only its digits are kept
    genre      -- genre text; newline characters are removed, everything else is kept
    runtime    -- text containing the runtime; only its digits are kept
    box_office -- gross expressed in millions; digits and '.' are kept

    Returns (year, genre, runtime, gross) where year and runtime are ints and
    gross is the box-office figure multiplied by 10**6 and rounded.
    Raises ValueError when a numeric field contains nothing convertible.
    """
    def keepChars(text, wanted):
        # Rebuild the string from only the characters accepted by `wanted`.
        return ''.join(filter(wanted, text))

    yearDigits = keepChars(year_, lambda ch: ch.isdigit())
    genreText = keepChars(genre, lambda ch: ch != '\n')  # raw data has '\n' scattered around
    runtimeDigits = keepChars(runtime, lambda ch: ch.isdigit())
    millionsText = keepChars(box_office, lambda ch: ch.isdigit() or ch == '.')  # keep the decimal point

    gross = round(float(millionsText) * (10**6))  # original figure is in millions

    return int(yearDigits), genreText, int(runtimeDigits), gross

In [4]:
def normalize(runtime, gross):
    """Scale each sequence by its own maximum.

    Returns a pair (runtime / max(runtime), gross / max(gross)), each computed
    with np.divide.
    """
    def scaleByPeak(values):
        # Divide every entry by the largest entry of the same sequence.
        return np.divide(values, max(values))

    return scaleByPeak(runtime), scaleByPeak(gross)

In [189]:
#ignore = ['Art Direction (Black and White)', 'Art Direction (Color)', 'Cinematography (Black and White)', 
 #         'Cinematography (Color)', 'Costume Design (Color)', 'Costume Design (Black and White)', 'Film Editing', 
  #        'Honorary Award', 'Honorary Foreign Language Film', 'Irving G. Thalberg Memorial Award', 
   #       'Music (Music Score of a Dramatic or Comedy Picture)', 'Music (Scoring of a Musical Picture)', 
    #      'Music (Song)', 'Sound Recording', 'Special Effects', 'Writing (Motion Picture Story)', 
     #     'Writing (Screenplay)', 'Writing (Story and Screenplay)', 'Special Visual Effects', 'Best Picture',
      #    'Writing (Screenplay, Based on Material from Another Medium)', 
       #   'Music (Scoring of Music, Adaptation or Treatment)', 'Sound', 'Directing', 'Foreign Language Film']
#Awards in the database that aren't directly influenced by actor or director or their data is in a 
#different format; ignore them for now



# Awards that can be received directly by an actor
actorAwards = ['Actor', 'Actor in a Supporting Role', 'Actress', 'Actress in a Supporting Role', 'Actor in a Leading Role',
               'Actress in a Leading Role']

# Awards that can be received directly by a director.
# For this dataset the name of the director is in the 'Film' column.
directorAwards = ['Directing']

# Awards that directors and actors can directly influence.
# For these the movie name is in the 'Name' column.
movieAwards = ['Best Motion Picture', 'Documentary (Feature)', 'Documentary (Short Subject)',
               'Short Film (Animated)', 'Short Film (Live Action)', 'Best Picture']

OscarRead = pd.read_csv('Oscardatabase.csv')
yearCSV = OscarRead["Year"]
awardCSV = OscarRead["Award"]    # the award received / nominated for
winnerCSV = OscarRead["Winner"]  # whether the entry won or not

# Work on a copy of the Winner column so the loaded frame stays untouched,
# then force every value that is not 1 to 0 (1 = win, 0 = nomination only).
resultsCSV = winnerCSV.copy()
resultsCSV[resultsCSV != 1] = 0

nameCSV = OscarRead["Name"]
filmCSV = OscarRead["Film"]

# Values of resultsCSV used to select winners or non-winning nominees.
win = 1
nomination = 0

def getDict(awards, result = win, nameOrFilm = nameCSV):
    """Count Oscar entries per year and per name.

    Walks every row of the Oscar table (yearCSV, awardCSV, resultsCSV) and keeps
    the rows whose award is in `awards` and whose result flag equals `result`.

    awards     -- collection of award names to include
    result     -- `win` (1) selects winning rows, `nomination` (0) the other rows
    nameOrFilm -- column that supplies the key being counted (nameCSV by default)

    Returns {year: {key: count}}, where count is the number of matching rows
    for that key in that year.
    """
    Dict = {}
    for i in range(len(yearCSV)):
        if awardCSV[i] not in awards or resultsCSV[i] != result:
            continue
        yearCounts = Dict.setdefault(yearCSV[i], {})  # create the year on first sight
        key = nameOrFilm[i]
        yearCounts[key] = yearCounts.get(key, 0) + 1  # several matching rows in one year add up
    return Dict

# Per-year counts for each award family, winners and non-winning nominees separately.
OscarActorWinners = getDict(actorAwards)
OscarActorNoms = getDict(actorAwards, result = nomination)
OscarDirWinners = getDict(directorAwards, nameOrFilm = filmCSV)
OscarDirNoms = getDict(directorAwards, result = nomination, nameOrFilm = filmCSV)
movieWinners = getDict(movieAwards)
movieNoms = getDict(movieAwards, result = nomination)

# Spot-check the 1950 entries of every table.
for caption, table in [
    ("1950 oscar winners (acting) are:", OscarActorWinners),
    ("1950 oscar nominees (acting) are:", OscarActorNoms),
    ("1950 oscar winners (directing) are:", OscarDirWinners),
    ("1950 oscar nominees (directing) are:", OscarDirNoms),
    ("1950 oscar winners (movie) are:", movieWinners),
    ("1950 oscar nominees (movie) are:", movieNoms),
]:
    print(caption, table[1950])

1950 oscar winners (acting) are: {'José Ferrer': 1, 'George Sanders': 1, 'Judy Holliday': 1, 'Josephine Hull': 1}
1950 oscar nominees (acting) are: {'Louis Calhern': 1, 'William Holden': 1, 'James Stewart': 1, 'Spencer Tracy': 1, 'Jeff Chandler': 1, 'Edmund Gwenn': 1, 'Sam Jaffe': 1, 'Erich von Stroheim': 1, 'Anne Baxter': 1, 'Bette Davis': 1, 'Eleanor Parker': 1, 'Gloria Swanson': 1, 'Hope Emerson': 1, 'Celeste Holm': 1, 'Nancy Olson': 1, 'Thelma Ritter': 1}
1950 oscar winners (directing) are: {'Joseph L. Mankiewicz': 1}
1950 oscar nominees (directing) are: {'John Huston': 1, 'George Cukor': 1, 'Billy Wilder': 1, 'Carol Reed': 1}
1950 oscar winners (movie) are: {'All about Eve ': 1, 'The Titan: Story of Michelangelo ': 1, 'Why Korea? ': 1}
1950 oscar nominees (movie) are: {'Born Yesterday ': 1, 'Father of the Bride ': 1, "King Solomon's Mines ": 1, 'Sunset Blvd. ': 1, 'With These Hands ': 1, 'The Fight: Science against Cancer ': 1, 'The Stairs ': 1}


In [213]:
def getRunningTotal(Dict, startYear = 1950, endYear = 2015):
    """Turn per-year counts into cumulative counts.

    Dict      -- {year: {key: count}} mapping of counts for each individual year
    startYear -- first year included in the running total
    endYear   -- last year included (inclusive)

    Returns {year: {key: total}} for every year from startYear to endYear, where
    total is the sum of that key's counts from startYear up to and including
    that year. Years absent from Dict contribute nothing. The input dictionaries
    are not modified and are not shared with the result.
    """
    totalDict = {}
    running = {}
    for year in range(startYear, endYear + 1):
        for key, value in Dict.get(year, {}).items():
            # Add the year's full count, so two awards in one year count twice.
            running[key] = running.get(key, 0) + value
        totalDict[year] = dict(running)  # snapshot, so later years do not alter it
    return totalDict

# Cumulative version of every per-year table, computed in the same order as the inputs are listed.
(OscarActorWinnersTot, OscarActorNomsTot, OscarDirWinnersTot,
 OscarDirNomsTot, movieWinnersTot, movieNomsTot) = map(
    getRunningTotal,
    (OscarActorWinners, OscarActorNoms, OscarDirWinners,
     OscarDirNoms, movieWinners, movieNoms),
)

# Anthony won an Oscar in 1952 and 1956
for checkYear in (1952, 1956):
    print(OscarActorWinnersTot[checkYear]['Anthony Quinn'])

1
2


In [214]:
# movieNomsTot maps year -> {movie: running count}. Its .values() are dictionaries,
# which never equal 2, so the membership test has to look inside each year's counts.
any(2 in yearTotals.values() for yearTotals in movieNomsTot.values())

False

In [215]:
# movieWinnersTot maps year -> {movie: running count}. Its .values() are dictionaries,
# which never equal 2, so the membership test has to look inside each year's counts.
any(2 in yearTotals.values() for yearTotals in movieWinnersTot.values())

False

A running total is probably not needed for movies. The original per-year dictionaries will most likely be needed instead, to look up the actors who appeared in those movies once the data has been scraped.

In [219]:
# Directors whose running total reached at least 2 by 2015 (just for fun lol)
for director, wins in OscarDirWinnersTot[2015].items():
    if wins >= 2:
        print(director, wins, sep='\n')

# Actors whose running total reached at least 3 by 2015 (just for fun lol)
for actor, wins in OscarActorWinnersTot[2015].items():
    if wins >= 3:
        print(actor, wins, sep='\n')

George Stevens
2
Fred Zinnemann
2
David Lean
2
Milos Forman
2
Oliver Stone
2
Clint Eastwood
2
Steven Spielberg
2
Ang Lee
2
Alejandro G. Iñárritu
2
Katharine Hepburn
3
Jack Nicholson
3
Meryl Streep
3


In [5]:
# CPI.csv contains, per year, the multiplication factor ("To 2018" column) used to
# adjust an amount from that year for inflation.
CPIread = pd.read_csv('CPI.csv')

# {year: multiplier}, built straight from the two columns that are actually used.
CPIdict = dict(zip(CPIread["Year"], CPIread["To 2018"]))

print("Movies released in 1990 have a multiplier of", CPIdict[1990], "to their gross to adjust for inflation")

Movies released in 1990 have a multiplier of 1.92105968 to their gross to adjust for inflation


In [6]:
def inflationAdjust(year, box_office):
    """Return box_office multiplied by the CPIdict factor for `year`, rounded to an int.

    Raises KeyError when CPIdict has no entry for `year`.
    """
    multiplier = CPIdict[year]  # factor for the year the movie was released
    adjusted = box_office * multiplier
    return int(round(adjusted))

In [15]:
res = []         # one tuple per movie for which every required field was found
missingData = 0  # movies skipped because a field was missing or could not be parsed

for url in urlArray:
    response = get(url, timeout=30)  # without a timeout a stalled connection blocks forever
    response.raise_for_status()      # stop on an HTTP error instead of parsing an error page
    html_soup = BeautifulSoup(response.text, 'html.parser')
    movie_cont = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

    for movie in movie_cont:
        name = movie.h3.a.text
        try:
            year_ = movie.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
            genre = movie.p.find('span', class_ = 'genre').text
            runtime = movie.p.find('span', class_ = 'runtime').text
            viewer_rating = movie.p.span.text
            m_score = movie.find('span', class_ = 'metascore').text
            imdb_rating = float(movie.strong.text)
            box_office = movie.find_all('span', attrs = {'name': 'nv'})[1].text
            people = movie.find_all('p', class_ = '')[1].find_all('a')
            director = people[0].text
            actor1 = people[1].text  # at least a director and one actor are required

            # Lesser known movies may list fewer than four actors (might happen when
            # numMovies is large); pad the missing ones with empty strings.
            supporting = [person.text for person in people[2:5]]
            actor2, actor3, actor4 = supporting + [''] * (3 - len(supporting))

            year_, genre, runtime, box_office = cleanData(year_, genre, runtime, box_office) #clean up data

            box_office_inflation = inflationAdjust(year_, box_office) #adjust for inflation
        except Exception:
            # A tag that was not found, a short list, an empty number or a year
            # without a CPI factor all mean the row is incomplete: skip it.
            # KeyboardInterrupt is deliberately not caught.
            print("Missing data for:", name)
            missingData += 1
            continue

        res.append((name, year_, genre, runtime, director, actor1, actor2, actor3, actor4,
                    viewer_rating, m_score, imdb_rating, box_office, box_office_inflation))

if not res:
    raise ValueError("No movie could be parsed from the requested pages")

# Transpose the list of rows into one tuple per column.
(names, year, genres, runtimes, directors, Star1, Star2, Star3, Star4, audienceRatings, metascores,
imdbRatings, movieGross, movieGrossInflation) = zip(*res)

runtime_normal, box_normal = normalize(runtimes, movieGrossInflation)#Normalize the number data

Missing data for: Love
Missing data for: 8 Seconds
Missing data for: I'm Not Here
Missing data for: The Sandlot
Missing data for: A Clockwork Orange
Missing data for: Bye Bye Birdie
Missing data for: Sabrina
Star Wars
1977
121
Missing data for: Exposed
Missing data for: Gerald's Game
Missing data for: Contratiempo
Missing data for: Them!
Missing data for: Bright
Missing data for: Forbidden Planet
Missing data for: The Room
Missing data for: Once Upon a Time in America
Missing data for: Salò o le 120 giornate di Sodoma
Missing data for: Mine
Missing data for: The Autopsy of Jane Doe
Missing data for: The Detained
Missing data for: Death Proof
Missing data for: Caligola
Missing data for: What Happened to Monday
Missing data for: Street Fighter
Missing data for: Shot Caller
Missing data for: Srpski film
Missing data for: Così fan tutte
Missing data for: Suspiria
Missing data for: The Ritual
Missing data for: Marooned
Missing data for: The Cincinnati Kid
Missing data for: Flash Gordon
Miss

In [8]:
print("Number of movies missing data:", missingData)
# Report the rows actually collected: numMovies - missingData overstates the total
# whenever a results page returns fewer entries than requested.
print("Total movies with data:", len(names))

Number of movies missing data: 56
Total movies with data: 944


In [9]:
# (column label, values) pairs in the order the columns should appear in the frame.
columnData = [
    ('Name', names),
    ('Year', year),
    ('Genre', genres),
    ('Director', directors),
    ('Star1', Star1),
    ('Star2', Star2),
    ('Star3', Star3),
    ('Star4', Star4),
    ('Runtime', runtimes),
    ('Rating', audienceRatings),
    ('Metascore', metascores),
    ('IMDB rating', imdbRatings),
    ('Gross', movieGross),
    ('Inflation adjusted', movieGrossInflation),
    ('Runtime Normalized', runtime_normal),
    ('Box Office Adj. Norm.', box_normal),
]
df = pd.DataFrame(dict(columnData))

In [10]:
# Print the frame's column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944 entries, 0 to 943
Data columns (total 16 columns):
Name                     944 non-null object
Year                     944 non-null int64
Genre                    944 non-null object
Director                 944 non-null object
Star1                    944 non-null object
Star2                    944 non-null object
Star3                    944 non-null object
Star4                    944 non-null object
Runtime                  944 non-null int64
Rating                   944 non-null object
Metascore                944 non-null object
IMDB rating              944 non-null float64
Gross                    944 non-null int64
Inflation adjusted       944 non-null int64
Runtime Normalized       944 non-null float64
Box Office Adj. Norm.    944 non-null float64
dtypes: float64(3), int64(4), object(9)
memory usage: 118.1+ KB


In [11]:
# Preview the first rows only; asking for numMovies rows would request the whole frame.
df.head(10)

Unnamed: 0,Name,Year,Genre,Director,Star1,Star2,Star3,Star4,Runtime,Rating,Metascore,IMDB rating,Gross,Inflation adjusted,Runtime Normalized,Box Office Adj. Norm.
0,The Fifth Element,1997,"Action, Adventure, Sci-Fi",Luc Besson,Bruce Willis,Milla Jovovich,Gary Oldman,Ian Holm,126,PG,52,7.7,63540000,99358504,0.623762,0.074337
1,Get Out,2017,"Horror, Mystery, Thriller",Jordan Peele,Daniel Kaluuya,Allison Williams,Bradley Whitford,Catherine Keener,104,14A,84,7.7,176040000,180264960,0.514851,0.134869
2,The Greatest Showman,2017,"Biography, Drama, Musical",Michael Gracey,Hugh Jackman,Michelle Williams,Zac Efron,Zendaya,105,PG,48,7.6,174340000,178524160,0.519802,0.133567
3,Suicide Squad,2016,"Action, Adventure, Fantasy",David Ayer,Will Smith,Jared Leto,Margot Robbie,Viola Davis,123,PG,40,6.1,325100000,339993587,0.608911,0.254374
4,Room,2015,"Drama, Thriller",Lenny Abrahamson,Brie Larson,Jacob Tremblay,Sean Bridgers,Wendy Crewson,118,14A,86,8.2,14680000,15546210,0.584158,0.011631
5,Buffy the Vampire Slayer,1992,"Action, Comedy, Fantasy",Fran Rubel Kuzui,Kristy Swanson,Donald Sutherland,Paul Reubens,Rutger Hauer,86,PA,48,5.6,16620000,29730330,0.425743,0.022243
6,Thor: Ragnarok,2017,"Action, Adventure, Comedy",Taika Waititi,Chris Hemsworth,Tom Hiddleston,Cate Blanchett,Mark Ruffalo,130,PG,74,7.9,315060000,322621440,0.643564,0.241376
7,Wonder Woman,2017,"Action, Adventure, Fantasy",Patty Jenkins,Gal Gadot,Chris Pine,Robin Wright,Lucy Davis,141,PG,76,7.5,412560000,422461440,0.698020,0.316074
8,Scott Pilgrim vs. the World,2010,"Action, Comedy, Fantasy",Edgar Wright,Michael Cera,Mary Elizabeth Winstead,Kieran Culkin,Anna Kendrick,112,PG,69,7.5,31490000,36247961,0.554455,0.027120
9,Guardians of the Galaxy,2014,"Action, Adventure, Comedy",James Gunn,Chris Pratt,Vin Diesel,Bradley Cooper,Zoe Saldana,121,PG,76,8.1,333180000,353258221,0.599010,0.264298


Next code gets data for Oscar winners

In [12]:
# Largest inflation-adjusted gross among the collected movies; this is the value
# that normalize() divides the gross column by.
print(max(movieGrossInflation))

1336590562
