# Web Scraping Tutorial using IMDb

In [1]:
import pandas as pd
from requests import get

In [2]:
# Search results for titles released in 2017, sorted by vote count
# (descending), page 1.
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
# Request English-language content and identify this client to the server.
headers = {"Accept-Language": "en-US, en;q=0.5",
           "User-Agent": "Promeos"}

# `headers` must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so get(url, headers) would append these values
# to the query string instead of sending them as HTTP headers.
response = get(url, headers=headers)
# Preview the first 500 characters of the returned HTML.
print(response.text[:500])



<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle",


In [3]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')
# Display the type of the parsed document object.
type(soup)

bs4.BeautifulSoup

In [4]:
# Collect every <div> whose class is 'lister-item mode-advanced'; the cells
# below treat each of these as the container for a single title.
movie_containers = soup.find_all('div', class_='lister-item mode-advanced')

In [5]:
# Use the first container as the sample for working out each field's lookup.
first_movie = movie_containers[0]

In [6]:
# Movie name: text of the link nested in the container's first <h3>
title_link = first_movie.h3.a
movie_name = title_link.string

In [7]:
# Movie release date: text of the year <span> inside the heading
year_span = first_movie.h3.find('span', class_='lister-item-year text-muted unbold')
release_date = year_span.string

In [8]:
# IMDb rating: text of the container's first <strong> tag, as a float
rating_text = first_movie.strong.string
imdb_rating = float(rating_text)

In [9]:
# Metascore rating
# Match on the single class 'metascore' instead of the full
# 'metascore favorable' string, which only finds spans carrying that exact
# class combination (presumably other score bands use a different second
# class -- confirm against the page markup).
# int() tolerates the whitespace padding around the number.
meta_rating = int(first_movie.find('span', class_='metascore').string)

In [10]:
# Alternative lookup: take the first <span> inside the metascore <div>.
# The raw string still carries trailing whitespace.
first_movie.find('div', class_='inline-block ratings-metascore').span.string

'77        '

In [11]:
# Number of votes: taken from the 'data-value' attribute of the span named 'nv'
votes_span = first_movie.find('span', attrs={'name': 'nv'})
number_of_votes = int(votes_span['data-value'])

In [12]:
# Movie without a Metascore: the container at index 17 serves as the example
# for seeing what a missing value looks like.
no_meta_score_movie = movie_containers[17]

In [13]:
# find() returns None when no matching <div> exists, so this cell displays nothing.
no_meta_score_movie.find('div', class_='inline-block ratings-metascore')

In [14]:
# Title of the container that lacks a Metascore.
no_meta_score_movie.h3.a.string

'Money Heist'

In [15]:
# Walk every container found on the page, keep only the titles that have a
# Metascore, and collect their fields into parallel lists that become the
# columns of the DataFrame below.
movie_titles = []
release_dates = []
IMDb_ratings = []
metascores = []
votes = []

for movie in movie_containers:
    # Look the Metascore block up once and reuse it for both the presence
    # check and the value extraction.
    metascore_div = movie.find('div', class_='inline-block ratings-metascore')
    if metascore_div is None:
        # Skip titles without a Metascore so all columns keep the same length.
        continue

    title = movie.h3.a.string
    movie_titles.append(title)

    release_date = movie.h3.find('span', class_='lister-item-year text-muted unbold').string
    release_dates.append(release_date)

    IMDb_rating = float(movie.strong.string)
    IMDb_ratings.append(IMDb_rating)

    # int() tolerates the whitespace padding around the score text.
    metascore = int(metascore_div.span.string)
    metascores.append(metascore)

    # The vote count is read from the span's 'data-value' attribute.
    vote = int(movie.find('span', attrs={'name': 'nv'})['data-value'])
    votes.append(vote)

movie_db_2017 = pd.DataFrame({'title' : movie_titles,
                              'release_date' : release_dates,
                              'imdb_rating' : IMDb_ratings,
                              'metascore' : metascores,
                              'votes' : votes}
                            )

In [16]:
# Display the DataFrame built from the single scraped page.
movie_db_2017

Unnamed: 0,title,release_date,imdb_rating,metascore,votes
0,Logan,(2017),8.1,77,636299
1,Thor: Ragnarok,(2017),7.9,74,574253
2,Guardians of the Galaxy Vol. 2,(2017),7.6,67,558519
3,Star Wars: Episode VIII - The Last Jedi,(2017),7.0,84,554052
4,Dunkirk,(2017),7.9,94,542019
5,Wonder Woman,(2017),7.4,76,541890
6,Spider-Man: Homecoming,(2017),7.4,73,501407
7,Get Out,(I) (2017),7.7,85,482169
8,It,(I) (2017),7.3,69,452138
9,Blade Runner 2049,(2017),8.0,81,450220


In [17]:
# Summarize column dtypes and non-null counts for the single-page DataFrame.
movie_db_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         42 non-null     object 
 1   release_date  42 non-null     object 
 2   imdb_rating   42 non-null     float64
 3   metascore     42 non-null     int64  
 4   votes         42 non-null     int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 1.8+ KB


## Creating a script for multiple pages
Now that we understand the structure of a single page and know how to address missing values, let's create a script to scrape multiple web pages.
```python
'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'
```
The URL reads: Search for titles released in 2017 and sort them by the number of votes in descending order, showing page 1.
To create a proper script, we only need to change the release_date and page values as we iterate over the requests.

In [18]:
# Years to scrape (2000 through 2019) and result pages per year (1 through 4),
# stored as strings so they can be dropped straight into the URL.
release_date = list(map(str, range(2000, 2020)))
pages = list(map(str, range(1, 5)))

In [19]:
from time import sleep
from random import randint

In [20]:
from time import time
from IPython.core.display import clear_output

In [21]:
# Reference timestamp used to compute the request frequency below.
start_time = time()

In [22]:
# Running count of requests made so far.
requests = 0

In [23]:
# Dry run of the monitoring logic: simulate five requests, pausing a random
# 1-3 seconds for each, and report the running request rate.
for _ in range(5):
    requests = requests + 1
    pause_seconds = randint(1, 3)
    sleep(pause_seconds)
    elapsed_time = time() - start_time
    frequency = requests / elapsed_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, frequency))
    # wait=True delays clearing until new output arrives, so the status line
    # is replaced in place.
    clear_output(wait=True)

Request: 5; Frequency: 0.45339408762912553 requests/s


In [24]:
from warnings import warn
# Emit a sample warning to preview how warn() output appears in the notebook.
warn("Warning Simulation")

  


In [25]:
# Scrape each requested page for each requested year and collect every title
# that has a Metascore into a single DataFrame.
movie_titles = []
release_dates = []
IMDb_ratings = []
metascores = []
votes = []

# Reset the monitoring state so the reported frequency covers this run only.
start_time = time()
requests = 0

# The headers are identical for every request, so build them once.
headers = {"Accept-Language": "en-US, en;q=0.5",
           "User-Agent": "Promeos"}

for year in release_date:
    for page in pages:
        url = f'http://www.imdb.com/search/title?release_date={year}&sort=num_votes,desc&page={page}'

        # Pass headers by keyword: the second positional parameter of
        # requests.get() is `params`, which would put these values in the
        # query string instead of the HTTP headers.
        response = get(url, headers=headers)

        # Pause 8-15 seconds between requests to keep the request rate low.
        sleep(randint(8, 15))

        # Progress monitor: overwrite a single status line per request.
        requests += 1
        elapsed_time = time() - start_time
        print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait=True)

        # Surface non-200 responses without aborting the whole scrape.
        if response.status_code != 200:
            warn(f"Request {requests}: status code {response.status_code}")

        html_movies = BeautifulSoup(response.text, 'html.parser')

        mv_containers = html_movies.find_all('div', class_='lister-item mode-advanced')

        for movie in mv_containers:
            # Look the Metascore block up once; titles without one are
            # skipped so all columns keep the same length.
            metascore_div = movie.find('div', class_='inline-block ratings-metascore')
            if metascore_div is None:
                continue

            title = movie.h3.a.string
            movie_titles.append(title)

            # Deliberately NOT named `release_date`: that name holds the list
            # of years driving the outer loop, and overwriting it would make
            # a re-run of this cell iterate over the characters of a string.
            movie_year = movie.h3.find('span', class_='lister-item-year text-muted unbold').string
            release_dates.append(movie_year)

            IMDb_rating = float(movie.strong.string)
            IMDb_ratings.append(IMDb_rating)

            # int() tolerates the whitespace padding around the score text.
            metascore = int(metascore_div.span.string)
            metascores.append(metascore)

            # The vote count is read from the span's 'data-value' attribute.
            vote = int(movie.find('span', attrs={'name': 'nv'})['data-value'])
            votes.append(vote)

movie_db = pd.DataFrame({'title' : movie_titles,
                         'release_date' : release_dates,
                         'imdb_rating' : IMDb_ratings,
                         'metascore' : metascores,
                         'votes' : votes})

Request: 80; Frequency: 0.07869433769554489 requests/s


In [27]:
# (rows, columns) of the combined multi-page DataFrame.
movie_db.shape

(3620, 5)

In [28]:
# Summarize column dtypes and non-null counts for the combined DataFrame.
movie_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3620 entries, 0 to 3619
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         3620 non-null   object 
 1   release_date  3620 non-null   object 
 2   imdb_rating   3620 non-null   float64
 3   metascore     3620 non-null   int64  
 4   votes         3620 non-null   int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 141.5+ KB


In [29]:
# Save the scraped data to 'movie_db.csv' in the working directory. With the
# default arguments the DataFrame index is written as the first column.
movie_db.to_csv('movie_db.csv')