In [5]:
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import requests
import matplotlib.pyplot as plt
from config import key

In [6]:
# Launch a visible (non-headless) Chrome browser for web scraping.
# ChromeDriverManager().install() supplies the chromedriver path handed to splinter.

executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)

[WDM] - Downloading: 100%|████████████████████████████████████████████████████████| 6.58M/6.58M [00:05<00:00, 1.35MB/s]


In [7]:
# Use the browser to visit the IMDb Top 250 chart page.
# The query string asks for sort=ir,desc in "simple" mode, page 1.

url = 'https://www.imdb.com/chart/top?sort=ir,desc&mode=simple&page=1'
browser.visit(url)

In [8]:
# Parse the rendered page into a BeautifulSoup tree.
# The parser is named explicitly: without it bs4 guesses one (emitting a
# GuessedAtParserWarning), and the guess depends on which parsers happen to be
# installed, so the same notebook could build different trees on different machines.

html = browser.html
imdb = BeautifulSoup(html, "html.parser")
type(imdb)

bs4.BeautifulSoup

In [9]:
# Locate the Top 250 chart table, then collect every row of its body.

chart_table = imdb.find("table", attrs={"data-caller-name": "chart-top250movie"})
chart_body = chart_table.find("tbody")
top_movies = chart_body.find_all("tr")
len(top_movies)

250

In [10]:
# Test scrape: pull the movie ID (the div's data-tconst attribute) out of the
# watchlist cell for the first row of the Top 250 list.

test = top_movies[0]
watchlist_cell = test.find("td", class_="watchlistColumn")
movie_id1 = watchlist_cell.find("div")["data-tconst"]
movie_id1

'tt0111161'

In [11]:
# Web scraping pass: build one {title, movie_id} record per chart row.
# The title is the link text in the title cell; the ID is the data-tconst
# attribute of the div inside the watchlist cell.

top_250_ids = [
    {
        "title": entry.find("td", {"class": "titleColumn"}).find("a").text,
        "movie_id": entry.find("td", {"class": "watchlistColumn"}).find("div")["data-tconst"],
    }
    for entry in top_movies
]

movie_ids_df = pd.DataFrame(top_250_ids)
movie_ids_df

Unnamed: 0,title,movie_id
0,The Shawshank Redemption,tt0111161
1,The Godfather,tt0068646
2,The Dark Knight,tt0468569
3,The Godfather Part II,tt0071562
4,12 Angry Men,tt0050083
...,...,...
245,Dersu Uzala,tt0071411
246,The Iron Giant,tt0129167
247,The Help,tt1454029
248,Aladdin,tt0103639


In [12]:
# Close the browser now that the titles and IDs have been extracted into movie_ids_df.
browser.quit()

In [13]:
# Using the OMDb API to acquire movie metadata. Initial test with The Shawshank Redemption movie ID.
# The request goes over HTTPS so the API key is not sent in clear text, the query
# string is built by requests via `params`, and a timeout keeps the cell from
# hanging forever if the service does not answer.

api_url = "https://www.omdbapi.com/"
response = requests.get(api_url, params={"apikey": key, "i": "tt0111161"}, timeout=10)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page
response.json()

{'Title': 'The Shawshank Redemption',
 'Year': '1994',
 'Rated': 'R',
 'Released': '14 Oct 1994',
 'Runtime': '142 min',
 'Genre': 'Drama',
 'Director': 'Frank Darabont',
 'Writer': 'Stephen King, Frank Darabont',
 'Actors': 'Tim Robbins, Morgan Freeman, Bob Gunton',
 'Plot': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 'Language': 'English',
 'Country': 'United States',
 'Awards': 'Nominated for 7 Oscars. 21 wins & 43 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '9.3/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '91%'},
  {'Source': 'Metacritic', 'Value': '81/100'}],
 'Metascore': '81',
 'imdbRating': '9.3',
 'imdbVotes': '2,662,343',
 'imdbID': 'tt0111161',
 'Type': 'movie',
 'DVD': '21 Dec 1999',
 'BoxOffice': '$28,767,189',
 'P

In [14]:
# Another API test to look at getting single attributes of an individual movie.
# The JSON body is decoded once and reused; calling response.json() for every
# attribute would re-parse the same payload each time.

id_code = "tt0111161"
api_url = f"https://www.omdbapi.com/?apikey={key}&i={id_code}"
response = requests.get(api_url, timeout=10)
response.raise_for_status()  # surface HTTP errors instead of parsing an error page
movie_data = response.json()

movie_title = movie_data.get("Title")
year_released = movie_data.get("Year")
rating = movie_data.get("Rated")
genre = movie_data.get("Genre")
imdbRating = movie_data.get("imdbRating")
Metascore = movie_data.get("Metascore")
release_date = movie_data.get("Released")
country = movie_data.get("Country")
language = movie_data.get("Language")
director = movie_data.get("Director")
actors = movie_data.get("Actors")
imdbVotes = movie_data.get("imdbVotes")
earnings = movie_data.get("BoxOffice")
imdbID = movie_data.get("imdbID")

# Print each attribute on its own line, in the same order as they were extracted.
for value in (
    movie_title,
    year_released,
    rating,
    genre,
    imdbRating,
    Metascore,
    release_date,
    country,
    language,
    director,
    actors,
    imdbVotes,
    earnings,
    imdbID,
):
    print(value)

The Shawshank Redemption
1994
R
Drama
9.3
81
14 Oct 1994
United States
English
Frank Darabont
Tim Robbins, Morgan Freeman, Bob Gunton
2,662,343
$28,767,189
tt0111161


In [15]:
# Flatten the movie_id column into a plain Python list for the API loop.
movie_id_list = movie_ids_df.loc[:, "movie_id"].to_list()
len(movie_id_list)

250

In [17]:
# Loop to extract all the metadata attributes associated with all of the top 250 movies, and turn into dataframe.
# Each lookup goes over HTTPS with a timeout, and any failure stops the loop
# immediately so a bad response never ends up as a row in final_df.

api_url = "https://www.omdbapi.com/"
rows = []

# A single Session reuses the underlying connection across all the requests.
with requests.Session() as session:
    for x in movie_id_list:
        response = session.get(api_url, params={"apikey": key, "i": x}, timeout=10)
        response.raise_for_status()  # HTTP-level failure (e.g. bad key, limit reached)
        data = response.json()

        # OMDb reports lookup failures inside the payload ("Response": "False"
        # plus an "Error" message) rather than only through the HTTP status.
        if data.get("Response") == "False":
            raise RuntimeError(f"OMDb lookup failed for {x}: {data.get('Error')}")

        rows.append(data)

final_df = pd.DataFrame(rows)
final_df

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,...,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,Drama,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton",Two imprisoned men bond over a number of years...,...,81,9.3,2662343,tt0111161,movie,21 Dec 1999,"$28,767,189",,,True
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola","Marlon Brando, Al Pacino, James Caan",The aging patriarch of an organized crime dyna...,...,100,9.2,1847150,tt0068646,movie,11 May 2004,"$136,381,073",,,True
2,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Goyer","Christian Bale, Heath Ledger, Aaron Eckhart",When the menace known as the Joker wreaks havo...,...,84,9.0,2638780,tt0468569,movie,09 Dec 2008,"$534,987,076",,,True
3,The Godfather Part II,1974,R,18 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola, Mario Puzo","Al Pacino, Robert De Niro, Robert Duvall",The early life and career of Vito Corleone in ...,...,90,9.0,1265087,tt0071562,movie,24 May 2005,"$47,834,595",,,True
4,12 Angry Men,1957,Approved,10 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,Reginald Rose,"Henry Fonda, Lee J. Cobb, Martin Balsam",The jury in a New York City murder trial is fr...,...,96,9.0,786180,tt0050083,movie,04 Mar 2008,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Dersu Uzala,1975,G,20 Dec 1977,142 min,"Adventure, Biography, Drama",Akira Kurosawa,"Akira Kurosawa, Yuriy Nagibin, Vladimir Arsenev","Maksim Munzuk, Yuriy Solomin, Mikhail Bychkov",The Russian army sends an explorer on an exped...,...,,8.2,30160,tt0071411,movie,22 Mar 2007,,,,True
246,The Iron Giant,1999,PG,06 Aug 1999,86 min,"Animation, Action, Adventure",Brad Bird,"Tim McCanlies, Brad Bird, Ted Hughes","Eli Marienthal, Harry Connick Jr., Jennifer An...",A young boy befriends a giant robot from outer...,...,85,8.1,198212,tt0129167,movie,27 Sep 2005,"$23,315,035",,,True
247,The Help,2011,PG-13,10 Aug 2011,146 min,Drama,Tate Taylor,"Tate Taylor, Kathryn Stockett","Viola Davis, Emma Stone, Octavia Spencer",An aspiring author during the civil rights mov...,...,62,8.1,459753,tt1454029,movie,06 Dec 2011,"$169,708,112",,,True
248,Aladdin,1992,G,25 Nov 1992,90 min,"Animation, Adventure, Comedy","Ron Clements, John Musker","Ron Clements, John Musker, Ted Elliott","Scott Weinger, Robin Williams, Linda Larkin",A kindhearted street urchin and a power-hungry...,...,86,8.0,420013,tt0103639,movie,13 Oct 2015,"$217,350,219",,,True


In [18]:
# Lift pandas' column display limit so every column of final_df is rendered.
pd.options.display.max_columns = None
final_df

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,Drama,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton",Two imprisoned men bond over a number of years...,English,United States,Nominated for 7 Oscars. 21 wins & 43 nominatio...,https://m.media-amazon.com/images/M/MV5BMDFkYT...,"[{'Source': 'Internet Movie Database', 'Value'...",81,9.3,2662343,tt0111161,movie,21 Dec 1999,"$28,767,189",,,True
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola","Marlon Brando, Al Pacino, James Caan",The aging patriarch of an organized crime dyna...,"English, Italian, Latin",United States,Won 3 Oscars. 32 wins & 30 nominations total,https://m.media-amazon.com/images/M/MV5BM2MyNj...,"[{'Source': 'Internet Movie Database', 'Value'...",100,9.2,1847150,tt0068646,movie,11 May 2004,"$136,381,073",,,True
2,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Goyer","Christian Bale, Heath Ledger, Aaron Eckhart",When the menace known as the Joker wreaks havo...,"English, Mandarin","United States, United Kingdom",Won 2 Oscars. 160 wins & 163 nominations total,https://m.media-amazon.com/images/M/MV5BMTMxNT...,"[{'Source': 'Internet Movie Database', 'Value'...",84,9.0,2638780,tt0468569,movie,09 Dec 2008,"$534,987,076",,,True
3,The Godfather Part II,1974,R,18 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola, Mario Puzo","Al Pacino, Robert De Niro, Robert Duvall",The early life and career of Vito Corleone in ...,"English, Italian, Spanish, Latin, Sicilian",United States,Won 6 Oscars. 17 wins & 21 nominations total,https://m.media-amazon.com/images/M/MV5BMWMwMG...,"[{'Source': 'Internet Movie Database', 'Value'...",90,9.0,1265087,tt0071562,movie,24 May 2005,"$47,834,595",,,True
4,12 Angry Men,1957,Approved,10 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,Reginald Rose,"Henry Fonda, Lee J. Cobb, Martin Balsam",The jury in a New York City murder trial is fr...,English,United States,Nominated for 3 Oscars. 17 wins & 13 nominatio...,https://m.media-amazon.com/images/M/MV5BMWU4N2...,"[{'Source': 'Internet Movie Database', 'Value'...",96,9.0,786180,tt0050083,movie,04 Mar 2008,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Dersu Uzala,1975,G,20 Dec 1977,142 min,"Adventure, Biography, Drama",Akira Kurosawa,"Akira Kurosawa, Yuriy Nagibin, Vladimir Arsenev","Maksim Munzuk, Yuriy Solomin, Mikhail Bychkov",The Russian army sends an explorer on an exped...,"Russian, Chinese","Soviet Union, Japan",Won 1 Oscar. 8 wins & 2 nominations total,https://m.media-amazon.com/images/M/MV5BYWY0OW...,"[{'Source': 'Internet Movie Database', 'Value'...",,8.2,30160,tt0071411,movie,22 Mar 2007,,,,True
246,The Iron Giant,1999,PG,06 Aug 1999,86 min,"Animation, Action, Adventure",Brad Bird,"Tim McCanlies, Brad Bird, Ted Hughes","Eli Marienthal, Harry Connick Jr., Jennifer An...",A young boy befriends a giant robot from outer...,English,United States,Won 1 BAFTA Award20 wins & 18 nominations total,https://m.media-amazon.com/images/M/MV5BYzBjZT...,"[{'Source': 'Internet Movie Database', 'Value'...",85,8.1,198212,tt0129167,movie,27 Sep 2005,"$23,315,035",,,True
247,The Help,2011,PG-13,10 Aug 2011,146 min,Drama,Tate Taylor,"Tate Taylor, Kathryn Stockett","Viola Davis, Emma Stone, Octavia Spencer",An aspiring author during the civil rights mov...,English,"United States, India",Won 1 Oscar. 79 wins & 121 nominations total,https://m.media-amazon.com/images/M/MV5BMTM5OT...,"[{'Source': 'Internet Movie Database', 'Value'...",62,8.1,459753,tt1454029,movie,06 Dec 2011,"$169,708,112",,,True
248,Aladdin,1992,G,25 Nov 1992,90 min,"Animation, Adventure, Comedy","Ron Clements, John Musker","Ron Clements, John Musker, Ted Elliott","Scott Weinger, Robin Williams, Linda Larkin",A kindhearted street urchin and a power-hungry...,English,United States,Won 2 Oscars. 34 wins & 22 nominations total,https://m.media-amazon.com/images/M/MV5BY2Q2ND...,"[{'Source': 'Internet Movie Database', 'Value'...",86,8.0,420013,tt0103639,movie,13 Oct 2015,"$217,350,219",,,True


In [19]:
# Drop the extraneous columns; final_df itself is left untouched.

extraneous_columns = ['Plot', 'Poster', 'Type', 'DVD', 'Production', 'Website', 'Response']
clean_df1 = final_df.drop(extraneous_columns, axis="columns")
clean_df1

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Language,Country,Awards,Ratings,Metascore,imdbRating,imdbVotes,imdbID,BoxOffice
0,The Shawshank Redemption,1994,R,14 Oct 1994,142 min,Drama,Frank Darabont,"Stephen King, Frank Darabont","Tim Robbins, Morgan Freeman, Bob Gunton",English,United States,Nominated for 7 Oscars. 21 wins & 43 nominatio...,"[{'Source': 'Internet Movie Database', 'Value'...",81,9.3,2662343,tt0111161,"$28,767,189"
1,The Godfather,1972,R,24 Mar 1972,175 min,"Crime, Drama",Francis Ford Coppola,"Mario Puzo, Francis Ford Coppola","Marlon Brando, Al Pacino, James Caan","English, Italian, Latin",United States,Won 3 Oscars. 32 wins & 30 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",100,9.2,1847150,tt0068646,"$136,381,073"
2,The Dark Knight,2008,PG-13,18 Jul 2008,152 min,"Action, Crime, Drama",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Goyer","Christian Bale, Heath Ledger, Aaron Eckhart","English, Mandarin","United States, United Kingdom",Won 2 Oscars. 160 wins & 163 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",84,9.0,2638780,tt0468569,"$534,987,076"
3,The Godfather Part II,1974,R,18 Dec 1974,202 min,"Crime, Drama",Francis Ford Coppola,"Francis Ford Coppola, Mario Puzo","Al Pacino, Robert De Niro, Robert Duvall","English, Italian, Spanish, Latin, Sicilian",United States,Won 6 Oscars. 17 wins & 21 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",90,9.0,1265087,tt0071562,"$47,834,595"
4,12 Angry Men,1957,Approved,10 Apr 1957,96 min,"Crime, Drama",Sidney Lumet,Reginald Rose,"Henry Fonda, Lee J. Cobb, Martin Balsam",English,United States,Nominated for 3 Oscars. 17 wins & 13 nominatio...,"[{'Source': 'Internet Movie Database', 'Value'...",96,9.0,786180,tt0050083,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Dersu Uzala,1975,G,20 Dec 1977,142 min,"Adventure, Biography, Drama",Akira Kurosawa,"Akira Kurosawa, Yuriy Nagibin, Vladimir Arsenev","Maksim Munzuk, Yuriy Solomin, Mikhail Bychkov","Russian, Chinese","Soviet Union, Japan",Won 1 Oscar. 8 wins & 2 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",,8.2,30160,tt0071411,
246,The Iron Giant,1999,PG,06 Aug 1999,86 min,"Animation, Action, Adventure",Brad Bird,"Tim McCanlies, Brad Bird, Ted Hughes","Eli Marienthal, Harry Connick Jr., Jennifer An...",English,United States,Won 1 BAFTA Award20 wins & 18 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",85,8.1,198212,tt0129167,"$23,315,035"
247,The Help,2011,PG-13,10 Aug 2011,146 min,Drama,Tate Taylor,"Tate Taylor, Kathryn Stockett","Viola Davis, Emma Stone, Octavia Spencer",English,"United States, India",Won 1 Oscar. 79 wins & 121 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",62,8.1,459753,tt1454029,"$169,708,112"
248,Aladdin,1992,G,25 Nov 1992,90 min,"Animation, Adventure, Comedy","Ron Clements, John Musker","Ron Clements, John Musker, Ted Elliott","Scott Weinger, Robin Williams, Linda Larkin",English,United States,Won 2 Oscars. 34 wins & 22 nominations total,"[{'Source': 'Internet Movie Database', 'Value'...",86,8.0,420013,tt0103639,"$217,350,219"


In [21]:
# Inspect the column dtypes ahead of converting numeric values to floats.
# NOTE(review): this cell only inspects clean_df1; no conversion is performed here.

clean_df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Title       250 non-null    object
 1   Year        250 non-null    object
 2   Rated       250 non-null    object
 3   Released    250 non-null    object
 4   Runtime     250 non-null    object
 5   Genre       250 non-null    object
 6   Director    250 non-null    object
 7   Writer      250 non-null    object
 8   Actors      250 non-null    object
 9   Language    250 non-null    object
 10  Country     250 non-null    object
 11  Awards      250 non-null    object
 12  Ratings     250 non-null    object
 13  Metascore   250 non-null    object
 14  imdbRating  250 non-null    object
 15  imdbVotes   250 non-null    object
 16  imdbID      250 non-null    object
 17  BoxOffice   250 non-null    object
dtypes: object(18)
memory usage: 35.3+ KB


In [None]:
# Save the full OMDb result set (final_df, not the trimmed clean_df1) to CSV,
# leaving out the DataFrame index.
final_df.to_csv('imdb_top250_all_data.csv', index=False)