In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_genre_first_page(genre):
    start = 'https://www.imdb.com/search/title/?genres='
    end = '&title_type=feature&explore=genres&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=facfbd0c-6f3d-4c05-9348-22eebd58852e&pf_rd_r=VAYJ7QXVS6BEX8377ERW&pf_rd_s=center-6&pf_rd_t=15051&pf_rd_i=genre&ref_=ft_gnr_mvpop_1'
    genre_url = start + genre + end
    response = requests.get(genre_url)
    
    if not response.ok:
        print('Status Code:', response.status_code)
        raise Exception('Failed to fetch web page' + genre_url)
    
    doc = BeautifulSoup(response.text)
    return doc


def get_genre_nth_page(genre, n):
    start = 'https://www.imdb.com/search/title/?title_type=feature&genres='
    mid = '&start='
    num = str((n-1)*50 + 1)
    end = '&explore=genres&ref_=adv_nxt'
    genre_url = start + genre + mid + num + end
    response = requests.get(genre_url)
    
    if not response.ok:
        print('Status Code:', response.status_code)
        raise Exception('Failed to get web page' + genre_url)
    
    doc = BeautifulSoup(response.text)
    
    return doc

In [2]:
doc1 = get_genre_first_page('action')
doc2 = get_genre_nth_page('action', 2)

In [3]:
type(doc1)

bs4.BeautifulSoup

In [4]:
doc1.title.text

'Top 50 Action Movies - IMDb'

In [5]:
movie_tags = doc1.find_all('div', class_ = 'lister-item mode-advanced')
len(movie_tags)

50

In [6]:
sample_h3_tag = movie_tags[0].find('h3')
sample_h3_tag

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt1462764/">Indiana Jones and the Dial of Destiny</a>
<span class="lister-item-year text-muted unbold">(2023)</span>
</h3>

In [7]:
sample_name = sample_h3_tag.find('a').text.strip()
sample_name

'Indiana Jones and the Dial of Destiny'

In [8]:
imdb_url = 'https://www.imdb.com'
sample_url = imdb_url + sample_h3_tag.find('a')['href']
sample_url

'https://www.imdb.com/title/tt1462764/'

In [9]:
def find_year(movie_tag):
    try:
        year = int(movie_tag.find('span', class_ = 'lister-item-year text-muted unbold').text[-5:-1])
    except AttributeError:
        year = None
    except ValueError:
        year = None
        
    return year

In [10]:
find_year(movie_tags[0])

2023

In [11]:
sample_genre = movie_tags[0].find('span', class_ = 'genre').text.strip()
sample_genre

'Action, Adventure'

In [12]:
def find_duration(movie_tag):
    try:
        duration = movie_tag.find('span', class_ = 'runtime').text.strip()
    except AttributeError:
        duration = None
        
    return duration

In [13]:
find_duration(movie_tags[0])

'154 min'

In [14]:
def find_imdb_rating(movie_tag):
    try:
        imdb_rating = float(movie_tag.find('div', class_ = 'inline-block ratings-imdb-rating').text.strip())
    except AttributeError:
        imdb_rating = None
        
    return imdb_rating

In [15]:
find_imdb_rating(movie_tags[0])

6.9

In [16]:
def find_directors(movie_tag):
    try:
        paragraph = movie_tag.find('p', class_ = '').get_text().strip().split('\n')
        directors = ''.join(paragraph[1 : paragraph.index('| ')])
        directors = directors.replace(',', ';')
    except ValueError:
        directors = None
        
    return directors

def find_actors(movie_tag):
    try:
        paragraph = movie_tag.find('p', class_ = '').get_text().strip().split('\n')
        actors = ''.join(paragraph[paragraph.index('    Stars:')+1 :])
        actors = actors.replace(',', ';')
    except ValueError:
        actors = None
        
    return actors

In [17]:
find_directors(movie_tags[0])

'James Mangold'

In [18]:
find_actors(movie_tags[0])

'Harrison Ford; Phoebe Waller-Bridge; Antonio Banderas; Karen Allen'

In [19]:
def find_votes(movie_tag):
    try:
        votes = movie_tag.find('p', class_ = 'sort-num_votes-visible').get_text().strip().split('\n')[1]
        if ',' in votes:
            votes = int(votes.replace(',' , ''))
        else:
            votes = int(votes)
    except AttributeError:
        votes = None
    except IndexError:
        votes = None
        
    return votes

In [20]:
find_votes(movie_tags[0])

37254

In [21]:
# Following is the information extracted for the first movie tag:
print('Movie Name:', sample_name)
print("IMDB URL:", sample_url)
print('Year of Release:', find_year(movie_tags[0]))
print('Duration:', find_duration(movie_tags[0]))
print('Genre:', sample_genre)
print('IMDB Rating:', find_imdb_rating(movie_tags[0]))
print('Director:', find_directors(movie_tags[0]))
print('Actors:', find_actors(movie_tags[0]))
print('Votes:', find_votes(movie_tags[0]))

Movie Name: Indiana Jones and the Dial of Destiny
IMDB URL: https://www.imdb.com/title/tt1462764/
Year of Release: 2023
Duration: 154 min
Genre: Action, Adventure
IMDB Rating: 6.9
Director: James Mangold
Actors: Harrison Ford; Phoebe Waller-Bridge; Antonio Banderas; Karen Allen
Votes: 37254


In [22]:
#Function to write all the information into a dictionary

def parse_movie(movie_tag):
    
    h3_tag = movie_tag.find('h3')
    name = h3_tag.find('a').text.strip().replace(',', '')
    imdb_url = 'https://www.imdb.com'
    url = imdb_url + h3_tag.find('a')['href']
    year = find_year(movie_tag)
    
    duration = find_duration(movie_tag)
    genre = movie_tag.find('span', class_ = 'genre').text.strip().replace(',', ';')
    imdb_rating = find_imdb_rating(movie_tag)
    
    directors = find_directors(movie_tag)
    actors = find_actors(movie_tag)
    votes = find_votes(movie_tag)
    
    #Return a Dictionary
    
    return{
        'Movie Name': name,
        'IMDB URL': url,
        'Year of Release': year,
        'Duration': duration,
        'Genre': genre,
        'IMDB Rating': imdb_rating,
        'Directors': directors,
        'Actors': actors,
        'Votes': votes
    }

In [23]:
parse_movie(movie_tags[2])

{'Movie Name': 'Spider-Man: Across the Spider-Verse',
 'IMDB URL': 'https://www.imdb.com/title/tt9362722/',
 'Year of Release': 2023,
 'Duration': '140 min',
 'Genre': 'Animation; Action; Adventure',
 'IMDB Rating': 8.9,
 'Directors': 'Joaquim Dos Santos; Kemp Powers; Justin K. Thompson',
 'Actors': 'Shameik Moore; Hailee Steinfeld; Brian Tyree Henry; Luna Lauren Velez',
 'Votes': 167206}

In [24]:
top_action_movies = [parse_movie(tag) for tag in movie_tags]
top_action_movies[:5]

[{'Movie Name': 'Indiana Jones and the Dial of Destiny',
  'IMDB URL': 'https://www.imdb.com/title/tt1462764/',
  'Year of Release': 2023,
  'Duration': '154 min',
  'Genre': 'Action; Adventure',
  'IMDB Rating': 6.9,
  'Directors': 'James Mangold',
  'Actors': 'Harrison Ford; Phoebe Waller-Bridge; Antonio Banderas; Karen Allen',
  'Votes': 37254},
 {'Movie Name': 'The Flash',
  'IMDB URL': 'https://www.imdb.com/title/tt0439572/',
  'Year of Release': 2023,
  'Duration': '144 min',
  'Genre': 'Action; Adventure; Fantasy',
  'IMDB Rating': 7.2,
  'Directors': 'Andy Muschietti',
  'Actors': 'Ezra Miller; Michael Keaton; Sasha Calle; Michael Shannon',
  'Votes': 80004},
 {'Movie Name': 'Spider-Man: Across the Spider-Verse',
  'IMDB URL': 'https://www.imdb.com/title/tt9362722/',
  'Year of Release': 2023,
  'Duration': '140 min',
  'Genre': 'Animation; Action; Adventure',
  'IMDB Rating': 8.9,
  'Directors': 'Joaquim Dos Santos; Kemp Powers; Justin K. Thompson',
  'Actors': 'Shameik Moore;

Below is a function to get the information of top 100 movies from a BeautifulSoup object as input:

In [25]:
def get_top_movies(doc):
    
    # Obtain a list of movie tags from the html source code
    movie_tags = doc.find_all('div', class_ = 'lister-item mode-advanced')
    
    # Create a list of dictionaries containing movie information
    top_movies = [parse_movie(tag) for tag in movie_tags]
    return top_movies

Finally, below function uses all the functions defined above to give the list of top 100 movies for any Genre:

In [26]:
def get_n_pages(genre, n):
    
    #Getting a list of top 50 movies from first page
    doc1 = get_genre_first_page(genre)
    top_movies = get_top_movies(doc1)
    
    #Getting a list of movies from the next (n-1) pages
    for i in range(2,n+1):
        doc = get_genre_nth_page(genre, i)
        top_movies_n = get_top_movies(doc)
        top_movies += top_movies_n
    
    return top_movies

In [27]:
top_200_movies = get_n_pages('comedy', 4)
len(top_200_movies)

200

In [28]:
top_200_movies[:5]

[{'Movie Name': 'No Hard Feelings',
  'IMDB URL': 'https://www.imdb.com/title/tt15671028/',
  'Year of Release': 2023,
  'Duration': '103 min',
  'Genre': 'Comedy; Romance',
  'IMDB Rating': 6.8,
  'Directors': 'Gene Stupnitsky',
  'Actors': 'Jennifer Lawrence; Andrew Barth Feldman; Laura Benanti; Matthew Broderick',
  'Votes': 13394},
 {'Movie Name': 'Asteroid City',
  'IMDB URL': 'https://www.imdb.com/title/tt14230388/',
  'Year of Release': 2023,
  'Duration': '105 min',
  'Genre': 'Comedy; Drama; Romance',
  'IMDB Rating': 7.1,
  'Directors': 'Wes Anderson',
  'Actors': 'Jason Schwartzman; Scarlett Johansson; Tom Hanks; Jeffrey Wright',
  'Votes': 23536},
 {'Movie Name': 'Barbie',
  'IMDB URL': 'https://www.imdb.com/title/tt1517268/',
  'Year of Release': 2023,
  'Duration': '114 min',
  'Genre': 'Adventure; Comedy; Fantasy',
  'IMDB Rating': None,
  'Directors': 'Greta Gerwig',
  'Actors': 'Margot Robbie; Kingsley Ben-Adir; Ryan Gosling; Emma Mackey',
  'Votes': None},
 {'Movie Na

## Write Information to CSV

Below is a function which can be used to write all the extracted information into a csv file:

In [29]:
def write_csv(items, path):
    with open(path, 'w') as f:
        if len(items) == 0:
            return
        
        headers = ','.join(list(items[0].keys()))
        f.write(headers + '\n')
        
        for item in items:
            values = []
            for value in item.values():
                values.append(str(value))
            f.write(','.join(values) + '\n')

In [30]:
movies = pd.DataFrame(top_200_movies)

In [31]:
movies.head(3)

Unnamed: 0,Movie Name,IMDB URL,Year of Release,Duration,Genre,IMDB Rating,Directors,Actors,Votes
0,No Hard Feelings,https://www.imdb.com/title/tt15671028/,2023,103 min,Comedy; Romance,6.8,Gene Stupnitsky,Jennifer Lawrence; Andrew Barth Feldman; Laura...,13394.0
1,Asteroid City,https://www.imdb.com/title/tt14230388/,2023,105 min,Comedy; Drama; Romance,7.1,Wes Anderson,Jason Schwartzman; Scarlett Johansson; Tom Han...,23536.0
2,Barbie,https://www.imdb.com/title/tt1517268/,2023,114 min,Adventure; Comedy; Fantasy,,Greta Gerwig,Margot Robbie; Kingsley Ben-Adir; Ryan Gosling...,


In [32]:
movies = movies[['Movie Name','Genre']]

In [33]:
movies.columns = ['title','genres']

In [34]:
# write_csv(top_200_movies, 'top-200-movies.csv')

movies.to_csv('movies_data.csv')

### Use Pandas to Analyse the Data

We can now view all the extracted information in the form of a data frame using the pandas library

In [35]:
import pandas as pd

In [36]:
pd.read_csv('top-200-movies.csv').head()

FileNotFoundError: [Errno 2] No such file or directory: 'top-200-movies.csv'

## The Last Leg

We can now modify our function to do all of the following:

* Download the web page from IMDB
* Extract data for top 100 movies from the web page
* Write the data into a CSV file
* Show the data as a pandas dataframe

In [50]:
def get_n_pages(genre, n):
    
    #Getting a list of top 50 movies from first page
    doc1 = get_genre_first_page(genre)
    top_movies = get_top_movies(doc1)
    
    #Getting a list of movies from the next (n-1) pages
    for i in range(2,n+1):
        doc = get_genre_nth_page(genre, i)
        top_movies_n = get_top_movies(doc)
        top_movies += top_movies_n
    
    #Writing the data into a csv file
    write_csv(top_movies, f'top-{genre}-movies.csv')
    
    return pd.read_csv(f'top-{genre}-movies.csv')

In [51]:
get_n_pages('action', 4)

Unnamed: 0,Movie Name,IMDB URL,Year of Release,Duration,Genre,IMDB Rating,Directors,Actors,Votes
0,Indiana Jones and the Dial of Destiny,https://www.imdb.com/title/tt1462764/,2023,154 min,Action; Adventure,6.9,James Mangold,Harrison Ford; Phoebe Waller-Bridge; Antonio B...,37047
1,The Flash,https://www.imdb.com/title/tt0439572/,2023,144 min,Action; Adventure; Fantasy,7.2,Andy Muschietti,Ezra Miller; Michael Keaton; Sasha Calle; Mich...,79952
2,Spider-Man: Across the Spider-Verse,https://www.imdb.com/title/tt9362722/,2023,140 min,Animation; Action; Adventure,8.9,Joaquim Dos Santos; Kemp Powers; Justin K. Tho...,Shameik Moore; Hailee Steinfeld; Brian Tyree H...,167133
3,Extraction 2,https://www.imdb.com/title/tt12263384/,2023,122 min,Action; Thriller,7.1,Sam Hargrave,Chris Hemsworth; Golshifteh Farahani; Adam Bes...,79746
4,Avatar: The Way of Water,https://www.imdb.com/title/tt1630029/,2022,192 min,Action; Adventure; Fantasy,7.6,James Cameron,Sam Worthington; Zoe Saldana; Sigourney Weaver...,424292
...,...,...,...,...,...,...,...,...,...
195,The Incredibles,https://www.imdb.com/title/tt0317705/,2004,115 min,Animation; Action; Adventure,8.0,Brad Bird,Craig T. Nelson; Samuel L. Jackson; Holly Hunt...,766624
196,Mortal Kombat,https://www.imdb.com/title/tt0293429/,2021,110 min,Action; Adventure; Fantasy,6.1,Simon McQuoid,Lewis Tan; Jessica McNamee; Josh Lawson; Joe T...,182117
197,The Maze Runner,https://www.imdb.com/title/tt1790864/,2014,113 min,Action; Mystery; Sci-Fi,6.8,Wes Ball,Dylan O'Brien; Kaya Scodelario; Will Poulter; ...,486015
198,Polite Society,https://www.imdb.com/title/tt18257464/,2023,104 min,Action; Comedy,6.7,Nida Manzoor,Priya Kansara; Ritu Arya; Renu Brindle; Rekha ...,4929
