# Collection of the necessary IMDB Top 250 data (movie_title, rating, votes)

In [1]:
from bs4 import BeautifulSoup
import requests
import re

# Collection of the necessary data from the IMDB website

url = 'https://www.imdb.com/chart/top/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
links = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=nv]')]

imdb = []

# Store each item into dictionary (data), then put those into a list (imdb)
for index in range(0, len(movies)):
    
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    data = {"movie_title": movie_title,
            "rating": ratings[index],
            "vote": votes[index]}
    
    imdb.append(data)

#for item in imdb:
    #print(item['movie_title'], item['rating'], item['vote'])


In [2]:
#Storing the data in pandas dataframe

import pandas as pd

df = pd.DataFrame(imdb)
df

Unnamed: 0,movie_title,rating,vote
0,A remény rabjai,9.233462181238655,2573496
1,A keresztapa,9.155494110852384,1772098
2,A sötét lovag,8.98389598225698,2540224
3,A keresztapa II,8.983772613004994,1225437
4,Tizenkét dühös ember,8.94611456452008,760062
...,...,...,...
245,Aladdin,8.007181114246224,400245
246,A segítség,8.005020485531363,448187
247,A szépség és a szörnyeteg,8.00467107823543,439887
248,Dűne,8.00400198684381,542946


In [293]:
#Selection of the first 20 movies

n = 230

df.drop(df.tail(n).index,
        inplace = True)
df

Unnamed: 0,movie_title,rating,vote
0,A remény rabjai,9.233467025787746,2573291
1,A keresztapa,9.155498662302584,1771969
2,A sötét lovag,8.983889365547517,2539937
3,A keresztapa II,8.983779542636535,1225351
4,Tizenkét dühös ember,8.946116249530277,760017
5,Schindler listája,8.934409000303958,1310790
6,A Gyűrűk Ura: A király visszatér,8.919524476894612,1769962
7,Ponyvaregény,8.85171018432694,1975090
8,A Gyűrűk Ura: A gyűrű szövetsége,8.802738466839985,1791544
9,"A Jó, a Rossz és a Csúf",8.794602011631522,739726


In [107]:
#Exporting the movies in a csv file to translate their titles to English manually

df.to_csv('imdb_TF.csv')

In [294]:
df_2 = pd.read_csv('imdb_TF_ENG.csv')
df_2

Unnamed: 0,movie_title,rating,vote
0,The Shawshank Redemption,9.233466,2573229
1,The Godfather,9.155501,1771940
2,The Dark Knight,8.983874,2539863
3,The Godfather Part II,8.983774,1225326
4,12 Angry Men,8.946108,760006
5,Schindler's List,8.934416,1310769
6,The Lord of the Rings: The Return of the King,8.919516,1769931
7,Pulp Fiction,8.851711,1975054
8,The Lord of the Rings: The Fellowship of the Ring,8.802732,1791516
9,"The Good, the Bad and the Ugly",8.794596,739715


In [109]:
#Extration of the movie titles in a list

titles = df_2['movie_title'].tolist()
titles

['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 'The Godfather Part II',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Good, the Bad and the Ugly',
 'Forrest Gump',
 'Fight Club',
 'Inception',
 'The Lord of the Rings: The Two Towers',
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Matrix',
 'Goodfellas',
 "One Flew over the Cuckoo's Nest",
 'Seven',
 'Seven Samurai']

## Scraping oscar winning data using pandas from wikipedia 
(  https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films )

In [297]:
# pd_read_html function was applied to extract tabular data from wikipedia

df_3 = pd.read_html('https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films')[0]

In [298]:
# Every oscar winning film of the history was collected in a dataframe 

df_3

Unnamed: 0,Film,Year,Awards,Nominations
0,CODA,2021,3,3
1,Dune,2021,6,10
2,The Eyes of Tammy Faye,2021,2,2
3,No Time to Die,2021,1,1
4,The Windshield Wiper,2021,1,1
...,...,...,...,...
1342,The Yankee Doodle Mouse,1943,1,1
1343,The Yearling,1946,2,7
1344,"Yesterday, Today and Tomorrow (Ieri, oggi, dom...",1964,1,1
1345,You Can't Take It with You,1938,2,7


In [299]:
# deleting  unnecessary columns

del df_3['Year']
del df_3['Nominations']

df_3.head()

Unnamed: 0,Film,Awards
0,CODA,3
1,Dune,6
2,The Eyes of Tammy Faye,2
3,No Time to Die,1
4,The Windshield Wiper,1


## Preparation of a pandas dataframe containing a column with the number of oscars won by our TOP 20 film ( the process runs from df_3 to df_8 )


In [120]:
#Checking the common oscar winning films of our TOP 20 movies and the collected using pandas .isin() function

#Oscar winning film from the TOP 20 movies:
df_4 = df_3[df_3['Film'].isin(titles)]
df_4

Unnamed: 0,Film,Awards
167,Inception,4
196,The Dark Knight,2
270,The Lord of the Rings: The Return of the King,11
284,The Lord of the Rings: The Two Towers,2
298,The Lord of the Rings: The Fellowship of the Ring,4
327,The Matrix,4
395,Forrest Gump,6
399,Pulp Fiction,1
411,Schindler's List,7
454,Goodfellas,1


In [118]:
#Extration of oscar winning film titles to a list

oscars = df_4['Film'].tolist()
oscars

['Inception',
 'The Dark Knight',
 'The Lord of the Rings: The Return of the King',
 'The Lord of the Rings: The Two Towers',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Matrix',
 'Forrest Gump',
 'Pulp Fiction',
 "Schindler's List",
 'Goodfellas',
 "One Flew over the Cuckoo's Nest",
 'The Godfather Part II',
 'The Godfather']

In [123]:
#Extration of "non oscar winning" movies from our TOP 20 movies (df_2) using pandas "not .isin()" function (~ means "not").

df_5 = df_2[~df_2['movie_title'].isin(oscars)]
df_5

Unnamed: 0,movie_title,rating,vote
0,The Shawshank Redemption,9.233466,2573229
4,12 Angry Men,8.946108,760006
9,"The Good, the Bad and the Ugly",8.794596,739715
11,Fight Club,8.75083,2025085
14,Star Wars: Episode V - The Empire Strikes Back,8.701897,1244815
18,Seven,8.603628,1578267
19,Seven Samurai,8.599554,337851


In [300]:
#Addition of a column 'oscars' with null values:

df_5['oscars'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_5['oscars'] = 0


In [302]:
df_5

Unnamed: 0,movie_title,rating,vote,oscars
0,The Shawshank Redemption,9.233466,2573229,0
4,12 Angry Men,8.946108,760006,0
9,"The Good, the Bad and the Ugly",8.794596,739715,0
11,Fight Club,8.75083,2025085,0
14,Star Wars: Episode V - The Empire Strikes Back,8.701897,1244815,0
18,Seven,8.603628,1578267,0
19,Seven Samurai,8.599554,337851,0


In [128]:
df_4

Unnamed: 0,Film,Awards
167,Inception,4
196,The Dark Knight,2
270,The Lord of the Rings: The Return of the King,11
284,The Lord of the Rings: The Two Towers,2
298,The Lord of the Rings: The Fellowship of the Ring,4
327,The Matrix,4
395,Forrest Gump,6
399,Pulp Fiction,1
411,Schindler's List,7
454,Goodfellas,1


In [304]:
# Renaming columns in the dataframe of our oscar winning movies from TOP 20 (df_4)

df_4.rename(columns = {'Film':'movie_title'}, inplace = True)
df_4.rename(columns = {'Awards':'oscars'}, inplace = True)

In [305]:
df_4

Unnamed: 0,movie_title,oscars
167,Inception,4
196,The Dark Knight,2
270,The Lord of the Rings: The Return of the King,11
284,The Lord of the Rings: The Two Towers,2
298,The Lord of the Rings: The Fellowship of the Ring,4
327,The Matrix,4
395,Forrest Gump,6
399,Pulp Fiction,1
411,Schindler's List,7
454,Goodfellas,1


In [306]:
#Addition of oscar values to our original top 20 movies (df_2) with merging the two dataframes (df_4, df_2):

df_6 = pd.merge(df_4, df_2, on="movie_title")
df_6

Unnamed: 0,movie_title,oscars,rating,vote
0,Inception,4,8.732883,2258436
1,The Dark Knight,2,8.983874,2539863
2,The Lord of the Rings: The Return of the King,11,8.919516,1769931
3,The Lord of the Rings: The Two Towers,2,8.72658,1598918
4,The Lord of the Rings: The Fellowship of the Ring,4,8.802732,1791516
5,The Matrix,4,8.671248,1851478
6,Forrest Gump,6,8.767206,1984958
7,Pulp Fiction,1,8.851711,1975054
8,Schindler's List,7,8.934416,1310769
9,Goodfellas,1,8.650625,1110342


In [307]:
#Changing column positions:

df_6 = df_6[['movie_title', 'rating', 'vote', 'oscars']]
df_6

Unnamed: 0,movie_title,rating,vote,oscars
0,Inception,8.732883,2258436,4
1,The Dark Knight,8.983874,2539863,2
2,The Lord of the Rings: The Return of the King,8.919516,1769931,11
3,The Lord of the Rings: The Two Towers,8.72658,1598918,2
4,The Lord of the Rings: The Fellowship of the Ring,8.802732,1791516,4
5,The Matrix,8.671248,1851478,4
6,Forrest Gump,8.767206,1984958,6
7,Pulp Fiction,8.851711,1975054,1
8,Schindler's List,8.934416,1310769,7
9,Goodfellas,8.650625,1110342,1


In [309]:
# Merging oscar winning and non-oscar winning movies of our TOP 20 films using the appropriate data frames (df_5 and df_6)

df_7 = df_5.append(df_6)
df_7

Unnamed: 0,movie_title,rating,vote,oscars
0,The Shawshank Redemption,9.233466,2573229,0
4,12 Angry Men,8.946108,760006,0
9,"The Good, the Bad and the Ugly",8.794596,739715,0
11,Fight Club,8.75083,2025085,0
14,Star Wars: Episode V - The Empire Strikes Back,8.701897,1244815,0
18,Seven,8.603628,1578267,0
19,Seven Samurai,8.599554,337851,0
0,Inception,8.732883,2258436,4
1,The Dark Knight,8.983874,2539863,2
2,The Lord of the Rings: The Return of the King,8.919516,1769931,11


In [311]:
# sorting the films according to rating

df_8 = df_7.sort_values(["rating"], ascending=False)
df_8

Unnamed: 0,movie_title,rating,vote,oscars
0,The Shawshank Redemption,9.233466,2573229,0
12,The Godfather,9.155501,1771940,3
1,The Dark Knight,8.983874,2539863,2
11,The Godfather Part II,8.983774,1225326,6
4,12 Angry Men,8.946108,760006,0
8,Schindler's List,8.934416,1310769,7
2,The Lord of the Rings: The Return of the King,8.919516,1769931,11
7,Pulp Fiction,8.851711,1975054,1
4,The Lord of the Rings: The Fellowship of the Ring,8.802732,1791516,4
9,"The Good, the Bad and the Ugly",8.794596,739715,0


In [322]:
df_8 = df_8.reset_index(drop=True)
df_8

Unnamed: 0,movie_title,rating,vote,oscars,vote_adjusted_rating
0,The Shawshank Redemption,9.233466,2573229,0,9.233466
1,The Godfather,9.155501,1771940,3,8.354212
2,The Dark Knight,8.983874,2539863,2,8.950508
3,The Godfather Part II,8.983774,1225326,6,7.635871
4,12 Angry Men,8.946108,760006,0,7.132885
5,Schindler's List,8.934416,1310769,7,7.671956
6,The Lord of the Rings: The Return of the King,8.919516,1769931,11,8.116218
7,Pulp Fiction,8.851711,1975054,1,8.253536
8,The Lord of the Rings: The Fellowship of the Ring,8.802732,1791516,4,8.021019
9,"The Good, the Bad and the Ugly",8.794596,739715,0,6.961082


## Additon of a column with rating adjusted by the number of votes

In [323]:
#Addition of a column where the rating is corrected with the number of votes

df_8['vote_adjusted_rating'] = df_8['rating'] - ((2573229 - df_8['vote']) / 1000000 )


In [324]:
df_8

Unnamed: 0,movie_title,rating,vote,oscars,vote_adjusted_rating
0,The Shawshank Redemption,9.233466,2573229,0,9.233466
1,The Godfather,9.155501,1771940,3,8.354212
2,The Dark Knight,8.983874,2539863,2,8.950508
3,The Godfather Part II,8.983774,1225326,6,7.635871
4,12 Angry Men,8.946108,760006,0,7.132885
5,Schindler's List,8.934416,1310769,7,7.671956
6,The Lord of the Rings: The Return of the King,8.919516,1769931,11,8.116218
7,Pulp Fiction,8.851711,1975054,1,8.253536
8,The Lord of the Rings: The Fellowship of the Ring,8.802732,1791516,4,8.021019
9,"The Good, the Bad and the Ugly",8.794596,739715,0,6.961082


## Additon of a column with rating adjusted by the number of oscars

In [325]:
#Checking data types in df_8 dataframe

df_8.dtypes

movie_title              object
rating                  float64
vote                      int64
oscars                    int64
vote_adjusted_rating    float64
dtype: object

In [326]:
#Changing the data type in oscars column to integer

df_8['oscars'] = df_8['oscars'].astype('int')

In [327]:
#Checking data types again

df_8.dtypes

movie_title              object
rating                  float64
vote                      int64
oscars                    int64
vote_adjusted_rating    float64
dtype: object

In [328]:
#Extration of the rating and oscars data into lists

rating = df_8['rating'].tolist()
print(rating)
awards = df_8['oscars'].tolist()
print(awards)

[9.23346600590828, 9.155501417331891, 8.983873850881151, 8.983773844564789, 8.94610846505409, 8.934415502901949, 8.91951572818005, 8.851711378367789, 8.802732315848651, 8.79459562525781, 8.76720559529277, 8.7508298777267, 8.73288299824527, 8.726580031390249, 8.701896767824769, 8.67124776204521, 8.65062482244309, 8.6419973674633, 8.603628127118679, 8.59955406077969]
[0, 3, 2, 6, 0, 7, 11, 1, 4, 0, 6, 0, 4, 2, 0, 4, 1, 5, 0, 0]


In [329]:
# Generation of a list containing the rating values corrected by the number of oscars according to the required logic

import itertools

y = []

for (i, j) in zip(awards, rating):
    
        if i == 0:
            x = j
            
        if i == 1 or i == 2:
            x = j + 0.3
            
        if 3 <= i <= 5:
            x = j + 0.5
            
        if 6 <= i <= 10:
            x = j + 1
            
        if i > 10:
            x = j + 1.5
            
        y.append(x)

print(y)


[9.23346600590828, 9.655501417331891, 9.283873850881152, 9.983773844564789, 8.94610846505409, 9.934415502901949, 10.41951572818005, 9.15171137836779, 9.302732315848651, 8.79459562525781, 9.76720559529277, 8.7508298777267, 9.23288299824527, 9.02658003139025, 8.701896767824769, 9.17124776204521, 8.95062482244309, 9.1419973674633, 8.603628127118679, 8.59955406077969]


In [333]:
#Generation of a pandas dataframe from the list (y) with the oscar corrected rating values. 

df_9 = pd.DataFrame(y, columns =['oscar_adjusted_rating'])
df_9

Unnamed: 0,oscar_adjusted_rating
0,9.233466
1,9.655501
2,9.283874
3,9.983774
4,8.946108
5,9.934416
6,10.419516
7,9.151711
8,9.302732
9,8.794596


In [334]:
#Merging df_8 and df_9 dataframes to obtain the final dataframe with all necessary values

df_final = pd.concat([df_8, df_9], axis=1)

df_final

Unnamed: 0,movie_title,rating,vote,oscars,vote_adjusted_rating,oscar_adjusted_rating
0,The Shawshank Redemption,9.233466,2573229,0,9.233466,9.233466
1,The Godfather,9.155501,1771940,3,8.354212,9.655501
2,The Dark Knight,8.983874,2539863,2,8.950508,9.283874
3,The Godfather Part II,8.983774,1225326,6,7.635871,9.983774
4,12 Angry Men,8.946108,760006,0,7.132885,8.946108
5,Schindler's List,8.934416,1310769,7,7.671956,9.934416
6,The Lord of the Rings: The Return of the King,8.919516,1769931,11,8.116218,10.419516
7,Pulp Fiction,8.851711,1975054,1,8.253536,9.151711
8,The Lord of the Rings: The Fellowship of the Ring,8.802732,1791516,4,8.021019,9.302732
9,"The Good, the Bad and the Ugly",8.794596,739715,0,6.961082,8.794596


In [337]:
# Setting arbitrary decimals

df_final = df_final.round(decimals = 2)
df_final

Unnamed: 0,movie_title,rating,vote,oscars,vote_adjusted_rating,oscar_adjusted_rating
0,The Shawshank Redemption,9.23,2573229,0,9.23,9.23
1,The Godfather,9.16,1771940,3,8.35,9.66
2,The Dark Knight,8.98,2539863,2,8.95,9.28
3,The Godfather Part II,8.98,1225326,6,7.64,9.98
4,12 Angry Men,8.95,760006,0,7.13,8.95
5,Schindler's List,8.93,1310769,7,7.67,9.93
6,The Lord of the Rings: The Return of the King,8.92,1769931,11,8.12,10.42
7,Pulp Fiction,8.85,1975054,1,8.25,9.15
8,The Lord of the Rings: The Fellowship of the Ring,8.8,1791516,4,8.02,9.3
9,"The Good, the Bad and the Ugly",8.79,739715,0,6.96,8.79


In [338]:
#Exporting to result in a csv file

df_final.to_csv('imdb_TF_final.csv')