In [1]:
# Import necessary libraries
import requests  # For sending HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
import pandas as pd  # For creating and manipulating DataFrames
import json  # For working with JSON data

# Step 1: Send a GET request to the IMDb "Most Popular Movies" page
url = "https://www.imdb.com/chart/moviemeter"  # URL of IMDb's most popular movies chart

# Define headers to mimic a browser visit and avoid being blocked
headers = {"User-Agent": "Mozilla/5.0"}

# Send GET request to fetch the content of the page
response = requests.get(url, headers=headers)

# Step 2: Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Extract the JSON-LD structured data containing movie details
# IMDb stores movie data in a <script> tag of type 'application/ld+json'
script_tag = soup.find('script', type='application/ld+json')

# If the script tag containing JSON data is found
if script_tag:
    # Convert the JSON string to a Python dictionary
    data = json.loads(script_tag.string)

    # Step 4: Initialize lists to store the movie details
    titles = []
    urls = []
    descriptions = []
    best_ratings = []
    worst_ratings = []
    rating_values = []
    genres = []
    durations = []

    # Step 5: Extract movie details from the 'itemListElement' key in the JSON data
    if 'itemListElement' in data:
        for item in data['itemListElement']:
            movie = item.get('item', {})

            # Extract movie details, using 'N/A' if the field is missing
            titles.append(movie.get('name', 'N/A'))
            urls.append(movie.get('url', 'N/A'))
            descriptions.append(movie.get('description', 'N/A'))
            best_ratings.append(movie.get('aggregateRating', {}).get('bestRating', 'N/A'))
            worst_ratings.append(movie.get('aggregateRating', {}).get('worstRating', 'N/A'))
            rating_values.append(movie.get('aggregateRating', {}).get('ratingValue', 'N/A'))
            genres.append(movie.get('genre', 'N/A'))
            durations.append(movie.get('duration', 'N/A'))

    # Step 6: Create a pandas DataFrame with the extracted data
    df = pd.DataFrame({
        'Title': titles,
        'URL': urls,
        'Description': descriptions,
        'Best Rating': best_ratings,
        'Worst Rating': worst_ratings,
        'Rating Value': rating_values,
        'Genre': genres,
        'Duration': durations
    })

    # Step 7: Save the DataFrame to a CSV file
    df.to_csv('most_popular_movies.csv', index=False)  # Save the DataFrame as a CSV file
    print("Data saved to 'most_popular_movies.csv'")  # Notify the user

else:
    print("No JSON-LD data found on the page.")


Data saved to 'most_popular_movies.csv'


In [3]:
# Data Cleaning: Handle missing values and duplicates
df = df.drop_duplicates()  # Remove duplicate rows
df = df.fillna('N/A')  # Fill missing values with 'N/A' (or use another method like mean/median for numeric columns)

# Example: Remove rows where 'Title' or 'Rating Value' is missing
df = df[df['Title'] != 'N/A']
df = df[df['Rating Value'] != 'N/A']

# Optional: Convert 'Rating Value' to numeric if needed
df['Rating Value'] = pd.to_numeric(df['Rating Value'], errors='coerce')

print("Data cleaning complete.")


Data cleaning complete.


In [5]:
# Data Preprocessing: Example of encoding categorical data and scaling numerical data
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Example: Encoding 'Genre' column (if it's categorical)
encoder = LabelEncoder()
df['Genre'] = encoder.fit_transform(df['Genre'])

# Example: Scaling 'Rating Value' (numeric column)
scaler = StandardScaler()
df['Rating Value'] = scaler.fit_transform(df[['Rating Value']])

print("Data preprocessing complete.")


Data preprocessing complete.


In [7]:
# Display a sample of the cleaned and preprocessed data
print(df.head())  # Show the first 5 rows of the dataset

                Title                                     URL  \
0           Mickey 17  https://www.imdb.com/title/tt12299608/   
1               Anora  https://www.imdb.com/title/tt28607951/   
2  The Electric State   https://www.imdb.com/title/tt7766378/   
3           Black Bag  https://www.imdb.com/title/tt30988739/   
4             Heretic  https://www.imdb.com/title/tt28015403/   

                                         Description Best Rating Worst Rating  \
0  Mickey 17, known as an &quot;expendable,&quot;...          10            1   
1  A young escort from Brooklyn meets and impulsi...          10            1   
2  An orphaned teen hits the road with a mysterio...          10            1   
3  When intelligence agent Kathryn Woodhouse is s...          10            1   
4  Two young religious women are drawn into a gam...          10            1   

   Rating Value  Genre Duration  
0      0.258532     13  PT2H17M  
1      0.700888     27  PT2H19M  
2     -0.714650     

In [9]:
# Prepare the final dataset and save it as a cleaned version
df.to_csv('cleaned_most_popular_movies.csv', index=False)
print("Final dataset prepared and saved as 'cleaned_most_popular_movies.csv'.")


Final dataset prepared and saved as 'cleaned_most_popular_movies.csv'.


In [None]:
#document code with docstring
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

# Function to scrape and save movie data from IMDb
def scrape_imdb_data():
    """
    This function scrapes the IMDb Most Popular Movies page and extracts 
    movie details including title, rating, description, and more. It stores 
    the extracted data in a pandas DataFrame and saves it as a CSV file.
    """
    url = "https://www.imdb.com/chart/moviemeter"
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    script_tag = soup.find('script', type='application/ld+json')
    
    if script_tag:
        data = json.loads(script_tag.string)
        
        # Initialize lists to store data
        titles, urls, descriptions, best_ratings = [], [], [], []
        worst_ratings, rating_values, genres, durations = [], [], [], []
        
        # Extract movie details from the JSON data
        if 'itemListElement' in data:
            for item in data['itemListElement']:
                movie = item['item']
                titles.append(movie.get('name', 'N/A'))
                urls.append(movie.get('url', 'N/A'))
                descriptions.append(movie.get('description', 'N/A'))
                best_ratings.append(movie.get('aggregateRating', {}).get('bestRating', 'N/A'))
                worst_ratings.append(movie.get('aggregateRating', {}).get('worstRating', 'N/A'))
                rating_values.append(movie.get('aggregateRating', {}).get('ratingValue', 'N/A'))
                genres.append(movie.get('genre', 'N/A'))
                durations.append(movie.get('duration', 'N/A'))
        
        # Create a DataFrame
        df = pd.DataFrame({
            'Title': titles,
            'URL': urls,
            'Description': descriptions,
            'Best Rating': best_ratings,
            'Worst Rating': worst_ratings,
            'Rating Value': rating_values,
            'Genre': genres,
            'Duration': durations
        })
        
        # Save the DataFrame as a CSV file
        df.to_csv('most_popular_movies.csv', index=False)
        print("Data saved to 'most_popular_movies.csv'")
    
    else:
        print("No data found.")

# Call the function to scrape data and save it
scrape_imdb_data()
