In [1]:
# Import the libraries
import requests # For making HTTP requests
from bs4 import BeautifulSoup # For parsing HTML content
import pandas as pd # For creating, storing and manipulating DataFrames



In [13]:
# Send a request to IMDb
# URL of the IMDb Top 250 movies page
url = 'https://www.imdb.com/chart/top/'

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() may wait indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def fetch_page(page_url, request_headers=None):
    """Send an HTTP GET request and report whether it succeeded.

    Parameters
    ----------
    page_url : str
        Address of the page to download.
    request_headers : dict or None
        Optional HTTP headers to send with the request. ``None`` sends
        the default headers of the ``requests`` library.

    Returns
    -------
    requests.Response
        The response object, returned whatever its status code is.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (for example on a timeout).
    """
    page_response = requests.get(
        page_url, headers=request_headers, timeout=REQUEST_TIMEOUT
    )

    # Check if the request was successful (status code 200)
    if page_response.status_code == 200:
        print("Successfully fetched the page!")
    else:
        print(f"Failed to retrieve the page. Status code:{page_response.status_code}")

    return page_response


# First attempt: a plain request without custom headers.
# Some websites reject such requests, so this one may fail.
response = fetch_page(url)

# Set headers to mimic a browser request.
# The User-Agent string in the headers simulates a request from a web browser.
# This often helps bypass simple bot detection mechanisms that websites may
# have in place.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 '
        'Safari/537.36'
    )
}

# Second attempt: the same request, this time with the browser-like headers.
# `response` now refers to this second response, which later cells parse.
response = fetch_page(url, headers)


Failed to retrieve the page. Status code:403
Successfully fetched the page!


In [15]:
# Parse the HTML content

# Build a Beautiful Soup tree from the raw response bytes,
# using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

# prettify() returns the parsed document as an indented string;
# print only its first 2000 characters to verify the parse worked.
pretty_html = soup.prettify()
print(pretty_html[:2000])


<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <script>
   if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }
  </script>
  <script>
   window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1742338645291);
        }
    })
  </script>
  <title>
   IMDb Top 250 Movies
  </title>
  <meta content="As rated by regular IMDb voters." data-id="main" name="description"/>
  <meta content="max-image-preview:large" name="robots"/>
  <script type="application/ld+json">
   {"@type":"It

In [43]:

# Find the JSON data in the script tag
import json

# Look up the first <script type="application/ld+json"> tag in the parsed page
json_data = soup.find('script', type='application/ld+json')

# Fail loudly when the tag is missing or has no text content. Otherwise
# `data` would stay undefined (or keep a stale value from an earlier run)
# and the failure would only show up later with a confusing error.
if json_data is None or not json_data.string:
    raise ValueError(
        "No application/ld+json script tag with content found in the page"
    )

# Parse the JSON data
data = json.loads(json_data.string)

# Extract movie data
# Create empty lists to store movie titles, years, ratings, and URLs
titles = []
years = []
ratings = []
urls = []
descriptions = []
best_ratings = []
worst_ratings = []
genres = []
durations = []

# Find all the <item> tags in the HTML.
# NOTE: this search returned an empty list when the notebook was last run;
# the movie entries are read from the parsed JSON in `data` instead.
movie_items = soup.find_all('item')
movie_items
[]

[]

In [45]:
# Extract the movie details from the parsed JSON data.
#
# Every list is rebuilt from scratch in this cell instead of being appended
# to, so running the cell more than once does not duplicate the movies.

# Check if the data contains the expected structure; otherwise use no movies
movies = (
    [entry['item'] for entry in data['itemListElement']]
    if 'itemListElement' in data
    else []
)

# Extract movie details
titles = [movie['name'] for movie in movies]                # Movie name
urls = [movie['url'] for movie in movies]                   # Movie URL
descriptions = [movie['description'] for movie in movies]   # Movie description

# Extract ratings (best, worst, actual rating)
best_ratings = [movie['aggregateRating']['bestRating'] for movie in movies]    # Best rating
worst_ratings = [movie['aggregateRating']['worstRating'] for movie in movies]  # Worst rating
# The rating value is converted to float so it is stored as a number
ratings = [float(movie['aggregateRating']['ratingValue']) for movie in movies]

genres = [movie['genre'] for movie in movies]         # Movie genre
durations = [movie['duration'] for movie in movies]   # Movie duration

In [49]:
# Create a DataFrame
import pandas as pd

# Map each output column name to the list of values collected for it
column_data = {
    'Title': titles,
    'URL': urls,
    'Description': descriptions,
    'Best Rating': best_ratings,
    'Worst Rating': worst_ratings,
    'Rating': ratings,
    'Genre': genres,
    'Duration': durations,
}

# Store the scraped data in a DataFrame, one row per movie
df = pd.DataFrame(column_data)

# Show the first rows as a quick check
df.head()

Unnamed: 0,Title,URL,Description,Best Rating,Worst Rating,Rating,Genre,Duration
0,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,A banker convicted of uxoricide forms a friend...,10,1,9.3,Drama,PT2H22M
1,The Godfather,https://www.imdb.com/title/tt0068646/,The aging patriarch of an organized crime dyna...,10,1,9.2,"Crime, Drama",PT2H55M
2,The Dark Knight,https://www.imdb.com/title/tt0468569/,When a menace known as the Joker wreaks havoc ...,10,1,9.0,"Action, Crime, Drama",PT2H32M
3,The Godfather Part II,https://www.imdb.com/title/tt0071562/,The early life and career of Vito Corleone in ...,10,1,9.0,"Crime, Drama",PT3H22M
4,12 Angry Men,https://www.imdb.com/title/tt0050083/,The jury in a New York City murder trial is fr...,10,1,9.0,"Crime, Drama",PT1H36M


In [53]:
# Save data to a CSV file
# index=False excludes the row index from being written to the file.
df.to_csv('topIMBDmovies.csv', index=False)
print("Data saved in CSV format")

Data saved in CSV format'
