In [1]:
#import Libraries
import requests
from requests import get
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup

In [2]:
headers= {"Accept-Language":"en-US, en; q=0.5"} #To get the english translated movie titles

In [3]:
url="https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
results= requests.get(url,headers=headers) #gets the contents from the web page url

In [4]:
'''BeautifulSoup methods uses the html.parser to let python read the contents of the page as per HTML components structure
and also specifies the desired formaat of the results to be stored in'''
soup=BeautifulSoup(results.text,"html.parser")

In [5]:
print(type(soup))
print(soup.prettify()) # prints the web page contents sin a tree structure of HTML

<class 'bs4.BeautifulSoup'>
<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb "Top 1000"
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/search/title/?groups=top_1000" rel="canonical"/>
 

In [6]:
#Creating storage for the desired contents of the web page
titles=[]
time=[]
years=[]
imdb_ratings=[]
votes=[]
meta_scores=[]
us_gross=[]
ranks=[]

In [7]:
'''Inspect the webpage you want to scrape using inspect on the browser and know the tags where your contents reside in'''
movie_divs=soup.find_all('div', class_='lister-item mode-advanced')
#extracts all the div containers that have a class attribute of lister-item mode-advanced

In [8]:
print(type(movie_divs))
print(movie_divs) #prints the results as list 

<class 'bs4.element.ResultSet'>
[<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt15398776"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt15398776/"> <img alt="Oppenheimer" class="loadlate" data-tconst="tt15398776" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BMDBmYTZjNjUtN2M1MS00MTQ2LTk2ODgtNzc2M2QyZGE5NTVjXkEyXkFqcGdeQXVyNzAwMjU2MTY@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/S/sash/4FyxwxECzL-U1J8.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt15398776/">Oppenheimer</a>
<span class="lister-item-year text-muted unbold">(2023)</span>
</h3>
<p class="text-muted">
<span class="certificate">R</span>
<span class="ghost">|</span>
<span class="runtime">180 min</span>
<span class="ghost">|</span>
<span class="g

In [9]:
# Tell the web scraper to loop through all the div containers in movie_divs
for c in movie_divs:
    name=c.h3.a.text #gets the name of the movie
    titles.append(name)
    year=c.h3.find('span',class_='lister-item-year').text #gets the year of the movie
    years.append(year)
    runtime=c.find('span',class_='runtime').text if c.p.find('span',class_='runtime') else '' #finds the movie length if not empty
    time.append(runtime)
    iratings=float(c.strong.text) #gets the ratings of the movie
    imdb_ratings.append(iratings)
    mscore=c.find('span',class_='metascore favorable').text if c.find('span',class_='metascore favorable') else '' #gets the metascore
    meta_scores.append(mscore)
    '''Since votes, ranks and gross has same tag and name we use the below code'''
    # Initialize boolean variables to keep track of found categories
    found_votes = False
    found_ranks = False
    found_us_gross = False

    nv = c.find_all('span', attrs={'name': 'nv'})  # gets the votes value

    for span in nv:
        t = span.text
        if '$' in t and not found_us_gross:
            us_gross.append(t)
            found_us_gross = True
        elif '#' in t and not found_ranks:
            ranks.append(t)
            found_ranks = True
        elif t.replace(',', '').isdigit() and not found_votes:
            votes.append(t)
            found_votes = True

    # If a category is not found, append an empty string to maintain the lists' consistency
    if not found_votes:
        votes.append('')
    if not found_ranks:
        ranks.append('')
    if not found_us_gross:
        us_gross.append('')

    

In [10]:
print(len(titles))
print(len(years))
print(len(time))
print(len(imdb_ratings))
print(len(meta_scores))
print(len(votes))
print(len(us_gross))
print(len(ranks))

50
50
50
50
50
50
50
50


In [11]:
'''Store the collected data as tables to display in text file'''
movies =pd.DataFrame({
    'movie_name':titles,
    'release_year':years,
    'movie_length':time,
    'imdb_rating':imdb_ratings,
    'metascore':meta_scores,
    'votes':votes,
    'US_gross(in millions)':us_gross,
    'rank(in top 250)':ranks
})

In [12]:
print(movies)

                                           movie_name release_year  \
0                                         Oppenheimer       (2023)   
1       Mission: Impossible - Dead Reckoning Part One       (2023)   
2                      Guardians of the Galaxy Vol. 3       (2023)   
3                 Spider-Man: Across the Spider-Verse       (2023)   
4                       Mission: Impossible - Fallout       (2018)   
5                                        Interstellar       (2014)   
6                                John Wick: Chapter 4       (2023)   
7                        Puss in Boots: The Last Wish       (2022)   
8                                     The Dark Knight       (2008)   
9                                           Inception       (2010)   
10                                            Titanic       (1997)   
11                                            Dunkirk       (2017)   
12                                  Top Gun: Maverick       (2022)   
13                  

## Data Cleaning

In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   movie_name             50 non-null     object 
 1   release_year           50 non-null     object 
 2   movie_length           50 non-null     object 
 3   imdb_rating            50 non-null     float64
 4   metascore              50 non-null     object 
 5   votes                  50 non-null     object 
 6   US_gross(in millions)  50 non-null     object 
 7   rank(in top 250)       50 non-null     object 
dtypes: float64(1), object(7)
memory usage: 3.2+ KB


In [14]:
movies.head()

Unnamed: 0,movie_name,release_year,movie_length,imdb_rating,metascore,votes,US_gross(in millions),rank(in top 250)
0,Oppenheimer,(2023),180 min,8.7,88,192611,,#23
1,Mission: Impossible - Dead Reckoning Part One,(2023),163 min,8.0,81,97413,,
2,Guardians of the Galaxy Vol. 3,(2023),150 min,8.1,64,232592,,
3,Spider-Man: Across the Spider-Verse,(2023),140 min,8.9,86,193560,,#15
4,Mission: Impossible - Fallout,(2018),147 min,7.7,86,357039,$220.16M,


In [15]:
movies['release_year'] = movies['release_year'].str.extract('(\d+)').astype(int) #extracts only the year and makes its dtype as int

In [16]:
movies['movie_length'] = movies['movie_length'].str.extract('(\d+)').astype(int) #extracts only the time and makes its dtype as int

In [17]:
#movies['metascore'] = movies['metascore'].astype(int) #changes the dtype as int
movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')

In [18]:
movies['votes']=movies['votes'].str.replace(',','').astype(int)

In [19]:
#movies['US_gross(in millions)'] = movies['US_gross(in millions)'].str.extract('(\d+)').astype(float)
movies['US_gross(in millions)']= movies['US_gross(in millions)'].map(lambda x: x.lstrip('$').rstrip('M'))#calls anonymous func to remove $ and M
movies['US_gross(in millions)'] = pd.to_numeric(movies['US_gross(in millions)'], errors='coerce')

In [20]:
#movies['rank(in top 250)']=movies['rank(in top 250)'].str.extract('(\d+)').astype(int)
#movies['rank(in top 250)']=movies['rank(in top 250)'].str.replace('#','').astype(int)
#movies['rank(in top 250)'] = movies['rank(in top 250)'].astype(int)

movies['rank(in top 250)']= movies['rank(in top 250)'].map(lambda x: x.lstrip('#'))#calls anonymous func to remove '#'
movies['rank(in top 250)'] = pd.to_numeric(movies['rank(in top 250)'], errors='coerce')

In [21]:
print(movies.head())

                                      movie_name  release_year  movie_length  \
0                                    Oppenheimer          2023           180   
1  Mission: Impossible - Dead Reckoning Part One          2023           163   
2                 Guardians of the Galaxy Vol. 3          2023           150   
3            Spider-Man: Across the Spider-Verse          2023           140   
4                  Mission: Impossible - Fallout          2018           147   

   imdb_rating  metascore   votes  US_gross(in millions)  rank(in top 250)  
0          8.7       88.0  192611                    NaN              23.0  
1          8.0       81.0   97413                    NaN               NaN  
2          8.1       64.0  232592                    NaN               NaN  
3          8.9       86.0  193560                    NaN              15.0  
4          7.7       86.0  357039                 220.16               NaN  


In [22]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   movie_name             50 non-null     object 
 1   release_year           50 non-null     int32  
 2   movie_length           50 non-null     int32  
 3   imdb_rating            50 non-null     float64
 4   metascore              48 non-null     float64
 5   votes                  50 non-null     int32  
 6   US_gross(in millions)  44 non-null     float64
 7   rank(in top 250)       31 non-null     float64
dtypes: float64(4), int32(3), object(1)
memory usage: 2.7+ KB


In [23]:
print(movies)

                                           movie_name  release_year  \
0                                         Oppenheimer          2023   
1       Mission: Impossible - Dead Reckoning Part One          2023   
2                      Guardians of the Galaxy Vol. 3          2023   
3                 Spider-Man: Across the Spider-Verse          2023   
4                       Mission: Impossible - Fallout          2018   
5                                        Interstellar          2014   
6                                John Wick: Chapter 4          2023   
7                        Puss in Boots: The Last Wish          2022   
8                                     The Dark Knight          2008   
9                                           Inception          2010   
10                                            Titanic          1997   
11                                            Dunkirk          2017   
12                                  Top Gun: Maverick          2022   
13    

In [24]:
movies.to_csv('WebScraper/movies_data.csv') #downloads the files into current working directory