In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import time, os
from time import sleep
from random import randint
import re
import json
from fake_useragent import UserAgent
import math

## Scrape imdb awards page

In [3]:
ua = UserAgent()
user_agent = {'User-agent': ua.random}
pages = np.arange(1980,2021)
#title_ids=[]
def get_title_ids(pages):
    imdb_ids=[]
    #Get all the raw title ids
    for page in pages:
        url = 'https://www.imdb.com/event/ev0000003/'+str(page)
        response = requests.get(url,headers=user_agent).text
        data = json.loads( re.findall(r'IMDbReactWidgets\.NomineesWidget\.push.*?(\{.*\})', response)[0] )

        #print(json.dumps(data, indent=4)) # <-- comment this out to print all data

        for award in data['nomineesWidgetModel']['eventEditionSummary']['awards']:
            if award['awardName'] != 'Oscar':
                continue
            for title in award['categories']:
                noms = title['nominations']
                for nom in noms:
                    if len(nom['secondaryNominees'])!=0:
                        id_ = nom['secondaryNominees'][0]['const'] 
                    id_2 = nom['primaryNominees'][0]['const']
                    imdb_ids.append(id_)
                    imdb_ids.append(id_2)
        sleep(randint(2,10))
        
    #change the titleId into a set
    imdb_ids_set = set(imdb_ids)
    imdb_film_ids = []
    
    #Set Contains id for people and films extract the films only 
    for film_id in imdb_ids_set:
        if film_id[0]=='t':
            imdb_film_ids.append(film_id)
    return imdb_film_ids
    

## Scrape from IMDB Title Film Page

In [4]:
def get_movie_crew_and_genre(soup,field_name):
    '''Function to grab films writers,genres,directors,and '''
    movie_crew_list = []
    for a in soup.find('h4', text=re.compile(field_name)).parent.find_all('a'):
        movie_crew_list.append(a.text.strip())
    return movie_crew_list

In [5]:
def get_movie_value(soup, field_name):
    
    '''Grab a value from Box Office Mojo HTML
    
    Takes a string attribute of a movie on the page and returns the string in
    the next sibling object (the value for that attribute) or None if nothing is found.
    '''
    
    obj = soup.find(text=re.compile(field_name))
    
    if not obj: 
        return None
    
    # this works for most of the values
    next_element = obj.findNext()
    
    if next_element:
        return next_element.text 
    else:
        return None

In [6]:
def get_distribution_company(soup):
    usa_distribution = ''
    usa_distribution_and_release = []
    distribution_company = soup.find('h4', {'id':'distributors'}).parent.find_all('li')
    for country_distro in distribution_company:
        if 'USA' in country_distro.text:
            usa_distribution = country_distro.text
            break
    usa_distribution_and_release = usa_distribution.split('(')
    usa_distribution = usa_distribution_and_release[0].replace('\n','').strip()
    return usa_distribution

In [7]:
def get_movie_dict(titleId):
    '''
    Grab the titleId 
    '''
    
    url = 'https://www.imdb.com/title/'+str(titleId)
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")
    
    title_div = soup.find('div',class_='title_wrapper')
    title_div

    IMDB_id = titleId
    
    #title 
    title = get_movie_value(soup,'title')
    title = title.split('(')[0].strip()

    #Get the directors for the movie
    directors = get_movie_crew_and_genre(soup,'Director')


    #get the writers for the movie
    writers = get_movie_crew_and_genre(soup,'Writer')

    #get the cast of the movie:
    casts = get_movie_crew_and_genre(soup,'Stars')
    casts = casts[0:3]

    #genre
    genres = get_movie_crew_and_genre(soup,'Genres')

    #country 
    country = get_movie_crew_and_genre(soup,'Country')
    country = country[0]

    #language
    language = get_movie_crew_and_genre(soup,'Language')
    language = language[0]

    #runtime
    runtime = title_div.find('time').text
    runtime = "".join(time.strip() for time in runtime.split("\n"))
#     try:
#         converted_runtime = int(runtime.split()[0])
#     except:
#         converted_runtime = 0

    #rating
    release_and_rating_string = title_div.find('div',{'class':'subtext'}).text
    rating = release_and_rating_string.split('|')[0].strip()
    rating

    #Release Date
    release_and_rating_string = title_div.find('div',{'class':'subtext'}).text
    try:
        release = release_and_rating_string.split('|')[3].strip()
        release = release.split('(')
        release = release[0]
    except:
        release = math.nan
    #converted_release_date = to_date(release_date)

    #Get budget
    try:
        budget = soup.find('h4', text=re.compile('Budget')).parent.text
        budget = budget.split('\n')
        budget = budget[1].split('$')
        budget = budget[1].replace(',','')
        budget = int(budget) 
    except:
        budget = math.nan


    #metacritic score
    try:
        metacritic_score = soup.find('div',{'class':'titleReviewBar'}).span.text
    except:
         metacritic_score = math.nan
    try:
        metacritic_score = int(metacritic_score)
    except:
        metacritic_score = math.nan
    
    headers = ['IMDBId','movie title','director(s)','writer(s)','casts','genre(s)','language','country',
           'runtime (mins)', 'mpaarating', 'release date','metacritic score','budget']

    movie_data = []
    movie_dict = dict(zip(headers, [IMDB_id,
                                    title,
                                    directors,
                                    writers,
                                    casts,
                                    genres,
                                    language,
                                    country,
                                    runtime,
                                    rating,
                                    release,
                                    metacritic_score,
                                    budget]))

    #movie_data.append(movie_dict)
    sleep(randint(2,10))
    return movie_dict

In [8]:
def get_distribution_dict(titleId):
    
    url = 'https://www.imdb.com/title/'+str(titleId)+'/companycredits'
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, "lxml")

    IMDB_id = titleId
    distribution_company = get_distribution_company(soup)

    headers = ['IMDBId','distributionCompany']

    #distribution_data = []
    distribution_dict = dict(zip(headers, [IMDB_id,
                                    distribution_company]))

    #distribution_data.append(distribution_dict)
    sleep(randint(2,10))
    return distribution_dict

In [10]:
links = get_title_ids(pages)

In [11]:
len(links)

2203

In [16]:
distribution_list = []
oscar_movie_list = []
movie_link_not_working = []
distro_link_not_working = []
count=0
count_to_10 = 0
for link in links:
    print(count)
    count+=1
    try:
        get_movie_dict(link)
        oscar_movie_list.append(get_movie_dict(link))
    except:
        movie_link_not_working.append(link)
    try:
        get_distribution_dict(link)
        distribution_list.append(get_distribution_dict(link))
    except:
        distro_link_not_working.append(link)
    if count_to_10 == 10: 
        movies_info_df = pd.DataFrame(oscar_movie_list)
        movies_info_df.set_index('IMDBId', inplace=True)
        distribution_info_df = pd.DataFrame(distribution_list)
        distribution_info_df.set_index('IMDBId', inplace=True)
        movies_info_df2 = pd.merge(movies_info_df, distribution_info_df, on='IMDBId', how='left')
        movies_info_df2.to_csv('oscar_movies_final.csv',mode='a',header=False,encoding='utf-8-sig')
        count_to_10 = 0
        oscar_movie_list = []
        ditribution_list = []
    count_to_10+=1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [47]:
movies_info_df = pd.DataFrame(oscar_movie_list)
movies_info_df.set_index('IMDBId', inplace=True)
movies_info_df.head()

Unnamed: 0_level_0,movie title,director(s),writer(s),casts,genre(s),language,country,runtime (mins),mpaarating,release date,metacritic score,budget
IMDBId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
tt0091763,Platoon,[Oliver Stone],[Oliver Stone],"[Charlie Sheen, Tom Berenger, Willem Dafoe]","[Drama, War]",English,USA,2h,R,6 February 1987,92.0,6000000.0
tt0082979,Reds,[Warren Beatty],"[Warren Beatty, Trevor Griffiths]","[Warren Beatty, Diane Keaton, Edward Herrmann]","[Biography, Drama, History, Romance]",English,USA,3h 15min,PG,25 December 1981,76.0,32000000.0
tt0084434,An Officer and a Gentleman,[Taylor Hackford],[Douglas Day Stewart],"[Richard Gere, Debra Winger, David Keith]","[Drama, Romance]",English,USA,2h 4min,R,13 August 1982,75.0,7500000.0
tt0119360,In & Out,[Frank Oz],[Paul Rudnick],"[Kevin Kline, Joan Cusack, Tom Selleck]","[Comedy, Romance]",English,USA,1h 30min,PG-13,19 September 1997,70.0,35000000.0
tt0292542,Son of the Bride,[Juan José Campanella],"[Juan José Campanella, Fernando Castets]","[Ricardo Darín, Héctor Alterio, Norma Aleandro]","[Comedy, Drama]",Spanish,Argentina,2h 3min,R,16 August 2001,68.0,


In [48]:
movies_info_df.describe()

Unnamed: 0,metacritic score,budget
count,666.0,546.0
mean,75.86036,27498710.0
std,11.386974,30399820.0
min,28.0,180000.0
25%,68.0,8100000.0
50%,77.0,18000000.0
75%,84.0,32375000.0
max,100.0,237000000.0


Unnamed: 0_level_0,movie title,director(s),writer(s),casts,genre(s),language,country,runtime (mins),mpaarating,release date,metacritic score,budget
IMDBId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
tt0091763,False,False,False,False,False,False,False,False,False,False,False,False
tt0082979,False,False,False,False,False,False,False,False,False,False,False,False
tt0084434,False,False,False,False,False,False,False,False,False,False,False,False
tt0119360,False,False,False,False,False,False,False,False,False,False,False,False
tt0292542,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
tt0189998,False,False,False,False,False,False,False,False,False,False,False,False
tt0094608,False,False,False,False,False,False,False,False,False,False,False,False
tt0097155,False,False,False,False,False,False,False,False,False,True,True,True
tt0100157,False,False,False,False,False,False,False,False,False,False,False,False


In [45]:
len(oscar_movie_list)

362

In [54]:
#Create a distribution list data frame
distribution_info_df = pd.DataFrame(distribution_list)
distribution_info_df.set_index('IMDBId', inplace=True)
distribution_info_df

Unnamed: 0_level_0,distributionCompany
IMDBId,Unnamed: 1_level_1
tt0091763,Orion Pictures
tt0082979,Paramount Pictures
tt0084434,Paramount Pictures
tt0119360,Paramount Pictures
tt0292542,Sony Pictures Classics
...,...
tt0189998,Lions Gate Films Home Entertainment
tt0094608,Paramount Pictures
tt0097155,Pathé-Nordisk Film
tt0100157,Columbia Pictures


In [57]:
distribution_info_df.tail()

Unnamed: 0_level_0,distributionCompany
IMDBId,Unnamed: 1_level_1
tt0189998,Lions Gate Films Home Entertainment
tt0094608,Paramount Pictures
tt0097155,Pathé-Nordisk Film
tt0100157,Columbia Pictures
tt0086619,MGM/UA Entertainment Company


In [60]:
movies_info_df2 = pd.merge(movies_info_df, distribution_info_df, on='IMDBId', how='left')

movies_info_df2.shape

(797, 13)

In [63]:
movies_info_df2.to_csv('oscar_movies.csv',encoding='utf-8-sig')