All Imports

In [2]:
import os
import string
import requests
import openpyxl
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

Extracting all the movie titles and links.

In [6]:
# Loading the current path

current_dir = os.getcwd()

# Output folder for the per-page workbooks. The Windows-style separator is kept
# on purpose: the later cells rebuild exactly this path string to find the files.
movielinks_dir = fr'{current_dir}\movielinks'
os.makedirs(movielinks_dir, exist_ok=True)


# Function to extract movie titles and links
def extract_TL(items=None):
    """Return parallel lists of movie titles and IMDb review-page links.

    Parameters
    ----------
    items : iterable of bs4 tags, optional
        The ``lister-item mode-detail`` divs of one list page. When omitted,
        the module-level ``movie_items`` of the most recently scraped page is
        used, which keeps the original zero-argument call working.

    Returns
    -------
    (titles, links) : tuple of two lists
        ``titles[i]`` and ``links[i]`` always describe the same movie. An item
        is skipped entirely when its first anchor has no image ``alt`` text or
        its href does not start with ``/title/tt``, so the two lists can never
        differ in length (which would make ``pd.DataFrame`` raise).
    """
    if items is None:
        items = movie_items

    titles = []
    links = []

    for item in items:
        # First <a href=...> of the item; its <img alt=...> carries the title
        title_tag = item.find("a", href=True)
        if title_tag is None:
            continue

        poster = title_tag.find('img')
        title = poster.get('alt') if poster is not None else None
        href = title_tag['href']

        # Only keep complete (title, link) pairs that point at a title page
        if title is None or not href.startswith('/title/tt'):
            continue

        titles.append(title)
        links.append(f"https://www.imdb.com{href}reviews?ref_=ttls_li_tt")

    return titles, links


for page in tqdm(range(1, 9), desc="processing", leave=True):
    # Accessing the HTML text.
    imdb_link = f"https://www.imdb.com/list/ls000634294/?st_dt=&mode=detail&page={page}&sort=list_order,asc"

    # A timeout prevents the cell from hanging forever; an HTTP error status
    # is raised here instead of surfacing later as a confusing parse failure.
    response = requests.get(imdb_link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the specific div element
    list_detail_div = soup.find("div", class_="lister-list")
    if list_detail_div is None:
        raise RuntimeError(
            f"No 'lister-list' container found on list page {page}: {imdb_link}"
        )

    # Find all lister-item mode-detail elements
    movie_items = list_detail_div.find_all("div", class_="lister-item mode-detail")

    titles, links = extract_TL(movie_items)

    # Create a DataFrame and save one workbook per list page
    df = pd.DataFrame({'Movie Title': titles, 'IMDb Link': links})
    df.to_excel(fr'{movielinks_dir}\file{page}.xlsx', index=False)

processing: 100%|██████████| 8/8 [00:54<00:00,  6.87s/it]


Loading all the 'movie titles and links' as lists

In [7]:
Titles, Links = [], []

# Walk through every workbook that holds the movie titles and links

for file_number in range(1, 9):
    workbook_path = fr'{current_dir}\movielinks\file{file_number}.xlsx'
    try:
        workbook = openpyxl.load_workbook(workbook_path)
        for worksheet in workbook:
            # Column A feeds Titles and column B feeds Links; each column is
            # appended as one sub-list, with its first (header) cell dropped.
            for letter, bucket in (('A', Titles), ('B', Links)):
                bucket.append([cell.value for cell in worksheet[letter][1:]])
    except FileNotFoundError:
        print(f"File not found: {workbook_path}")


Getting all the review links for each movie

In [8]:
# Helper that reduces a title to characters that are safe in a file name

def sanitize_title(title_str):
    """Return ``title_str`` with every disallowed character removed.

    Only ASCII letters, digits, the space and the characters ``-_.()`` are
    kept; everything else is dropped (not replaced), so the result may be
    shorter than the input or even empty.
    """
    valid_chars = "-_.() " + string.ascii_letters + string.digits
    kept = [c for c in title_str if c in valid_chars]
    return ''.join(kept)

# Output folder for the per-movie workbooks. The Windows-style separators are
# kept so the resulting path strings stay exactly as they were.
directory = fr"{current_dir}\movielinks\review links"
os.makedirs(directory, exist_ok=True)

# Flatten the Links list, dropping blank cells that cannot be requested
flattened_links = [link for sublist in Links for link in sublist if link]

# Identifies the scraper to the server; built once instead of per iteration
headers = {'User-Agent': 'MyIMDbScraper/1.0 (https://github.com/yourusername/my-imdb-scraper)'}

# One session reuses the connection across the many requests below
with requests.Session() as session:
    session.headers.update(headers)

    for position, link in enumerate(tqdm(flattened_links, desc="processing"), start=1):
        # A single failing page must not abort the whole (long) run:
        # report it and move on to the next movie.
        try:
            response = session.get(link, timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            tqdm.write(f"Skipping {link}: {error}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        lister = soup.find('div', class_='lister-list')
        if lister is None:
            # Without the list container there is nothing trustworthy to save
            tqdm.write(f"Skipping {link}: no review list found on the page")
            continue
        lister_items = lister.find_all('div', class_='lister-item-content')

        # Extract movie title from the <a> inside the <h3 itemprop="name"> tag,
        # tolerating any of the enclosing elements being absent
        title_str = ''
        sub_page = soup.find('div', class_='subpage_title_block')
        title_tag = sub_page.find("h3", itemprop="name") if sub_page is not None else None
        title_anchor = title_tag.find('a') if title_tag is not None else None
        if title_anchor is not None:
            # Sanitize the title string for use as a file name
            title_str = sanitize_title(title_anchor.text)

        if not title_str:
            # No usable title (missing, or emptied by sanitizing): fall back to
            # the path segment after "/title/" in the link so these movies do
            # not all overwrite the same ".xlsx" file.
            imdb_id = link.split('/title/')[-1].split('/')[0]
            title_str = sanitize_title(imdb_id) or f"untitled_{position}"

        # Collect the absolute URL of every review on the page
        review_links = []
        for item in lister_items:
            a = item.find('a', class_='title')
            if a is not None and a.get('href'):
                href = a['href']
                review_links.append(f'https://www.imdb.com{href}')
        rdf = pd.DataFrame(review_links)

        rdf.to_excel(fr'{directory}\{title_str}.xlsx')



processing: 100%|██████████| 787/787 [23:52<00:00,  1.82s/it]
