In [32]:
#Importing all necessary libraries 

import requests                      #For making HTTP requests
from bs4 import BeautifulSoup        #For parsing HTML content
import pandas as pd                  #For creating, storing and manipulating dataframe
import json

In [33]:
#the URL
#Using headers to mimic a browser request
url = 'https://www.imdb.com/chart/top/'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win 64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [34]:
#sending request to fetch page content
response = requests.get(url, headers=headers)
response

<Response [200]>

In [35]:
#Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [36]:
#Printing the parsed HTML to verify
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1749786111343);
        }
    })
  </script>
  <title>
   IMDb Top 250 movies
  </title>
  <meta content="As rated by regular IMDb voters." data-id="main" name="description"/>
  <meta content="0cadf7898134e79b" name="google-site-verification"/>
  <meta content="C1DACEF2769068C0B0D2687C9

In [37]:
#Find the JSON data in the script tag

json_data = soup.find('script', type='application/ld+json')

In [38]:
#Parse the json data
if json_data:
    data = json.loads(json_data.string)

In [39]:
#Extract titles and ratings
titles = []
urls = []
descriptions = []
best_ratings = []
worst_ratings = []
ratings = []
genres = []
durations = []

In [40]:
#Check if the data contains the expected structure
if 'itemListElement' in data:
    for item in data['itemListElement']:
        movie = item['item']

        #Extract movie details
        titles.append(movie['name'])                 #Movie name
        urls.append(movie['url'])                    #Movie URL
        descriptions.append(movie['description'])    #Movie Description

        #Extract ratings, best, worst, actual ratings
        best_ratings.append(movie['aggregateRating']['bestRating'])      #Best Rating
        worst_ratings.append(movie['aggregateRating']['worstRating'])    #Worst Rating
        ratings.append(movie['aggregateRating']['ratingValue'])         #Rating value

        genres.append(movie['genre'])       #Movie genre
        durations.append(movie['duration']) #Movie duration
        

In [41]:
#Create DataFrame to store scrapped data

df = pd.DataFrame({
    'Title': titles,
    'URL': urls,
    'Description': descriptions,
    'Best Rating': best_ratings,
    'Worst Rating': worst_ratings,
    'Rating': ratings,
    'Genre': genres,
    'Duration': durations
})

df.head

<bound method NDFrame.head of                         Title                                    URL  \
0    The Shawshank Redemption  https://www.imdb.com/title/tt0111161/   
1               The Godfather  https://www.imdb.com/title/tt0068646/   
2             The Dark Knight  https://www.imdb.com/title/tt0468569/   
3       The Godfather Part II  https://www.imdb.com/title/tt0071562/   
4                12 Angry Men  https://www.imdb.com/title/tt0050083/   
..                        ...                                    ...   
245             Groundhog Day  https://www.imdb.com/title/tt0107048/   
246                  The Help  https://www.imdb.com/title/tt1454029/   
247               Höstsonaten  https://www.imdb.com/title/tt0077711/   
248             Amores perros  https://www.imdb.com/title/tt0245712/   
249        Gangs of Wasseypur  https://www.imdb.com/title/tt1954470/   

                                           Description  Best Rating  \
0    A banker convicted of uxorici

In [42]:
#Save the dataframe to a CSV file
df.to_csv('toIMDBmovies.csv', index=False) #Setting index = False excludes the row index from being written to the file
print("Data saved in csv format")

Data saved in csv format
