# Crawl the IMDb website for the top-grossing movies of each year and their details

In [None]:
%pip install selenium
%pip install bs4

In [28]:
import os
import re
import time
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [29]:
def safe_extract(soup_obj, selector, attribute=None, processing=None):
    """Look up one element under ``soup_obj`` and return its value, never raising.

    Args:
        soup_obj: Object to search (expected to offer BeautifulSoup-style
            ``select_one`` / ``find``). ``None``, ``int`` and ``str`` inputs
            yield ``None`` so a failed earlier lookup can be passed straight in.
        selector: Either a CSS selector string (handed to ``select_one``) or a
            sequence of positional arguments for ``find``,
            e.g. ``["span", {"class": "x"}]``.
        attribute: Name of the attribute to read from the element. When falsy,
            the element's ``.text`` is used instead.
        processing: Optional callable applied to the extracted value. It is
            first called with the text/attribute value; if that call raises
            ``AttributeError`` or ``TypeError`` (the signature of a callback
            written against the element, such as ``lambda x: x.find_all("a")``),
            it is retried with the element itself.

    Returns:
        The (optionally processed) value, or ``None`` when nothing matched or
        any step raised.
    """
    if soup_obj is None or isinstance(soup_obj, (int, str)):
        return None
    try:
        if isinstance(selector, str):
            element = soup_obj.select_one(selector)
        else:
            element = soup_obj.find(*selector)
        if not element:
            return None

        value = element.get(attribute) if attribute else element.text
        if not processing:
            return value
        try:
            return processing(value)
        except (AttributeError, TypeError):
            # A plain string has no ``.text`` / ``.find_all``: the callback was
            # written for the element, so give it the element instead of
            # silently turning every such field into None.
            return processing(element)
    except Exception:
        # Best-effort extraction: a missing or malformed node is reported as None.
        return None

In [None]:
def extract_box_office_data(soup, selector_id):
    """Return the stripped text of the first value span of a box-office row.

    Args:
        soup: Object to search; it must offer a BeautifulSoup-style ``find``.
        selector_id: ``data-testid`` value of the ``div`` that holds the row.

    Returns:
        The stripped text of the first ``span`` inside the row's
        ``div.ipc-metadata-list-item__content-container``, or ``None`` when the
        row or the span is missing or the lookup raises an ordinary exception.
    """
    try:
        section = soup.find("div", {"data-testid": selector_id})
        if not section:
            return None
        value = section.select_one(
            "div.ipc-metadata-list-item__content-container span"
        )
        return value.text.strip() if value else None
    except Exception:
        # Best-effort lookup. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt / SystemExit stop a long crawl.
        return None

In [None]:
def extract_credits(soup, credit_type):
    """Return the names linked in one principal-credit row of a title page.

    Args:
        soup: Object to search; it must offer a BeautifulSoup-style ``find``.
        credit_type: Suffix appended to ``title-pc-principal-credit-`` to form
            the ``data-testid`` of the wanted ``li`` row.

    Returns:
        A list with the stripped text of every ``<a>`` in the row (possibly
        empty), or ``None`` when the credits container or the row is missing or
        the lookup raises an ordinary exception.
    """
    try:
        credits_section = soup.find("div", {"data-testid": "title-pc-wide-screen"})
        if not credits_section:
            return None
        credit_row = credits_section.find(
            "li", {"data-testid": f"title-pc-principal-credit-{credit_type}"}
        )
        if not credit_row:
            return None
        return [link.text.strip() for link in credit_row.find_all("a")]
    except Exception:
        # Best-effort lookup. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt / SystemExit stop a long crawl.
        return None

In [30]:
def _tag_value(parent, selector, processing):
    """Apply ``processing`` to the first tag under ``parent`` matching ``selector``.

    The callback receives the tag itself (not its text), so it can call
    ``find`` / ``find_all`` on it. Returns ``None`` when ``parent`` is ``None``,
    nothing matches, or the callback raises.
    """
    if parent is None:
        return None
    try:
        tag = parent.find(*selector)
        return None if tag is None else processing(tag)
    except Exception:
        return None


def _stripped_text(tag):
    """Return the tag's text with surrounding whitespace removed."""
    return tag.text.strip()


def _anchor_texts(tag):
    """Return the stripped text of every ``<a>`` under ``tag``."""
    return [anchor.text.strip() for anchor in tag.find_all("a")]


def _award_counts(tag):
    """Pull win / nomination / Oscar counts out of the awards blurb (0 if absent)."""
    text = tag.text

    def count(pattern):
        match = re.search(pattern, text, re.I)
        return int(match.group(1)) if match else 0

    return {
        "wins": count(r"(\d+)\s*wins?"),
        "nominations": count(r"(\d+)\s*nominations?"),
        "oscars": count(r"Won\s*(\d+)\s*Oscars?"),
    }


def _build_edge_driver(driver_path):
    """Create the Edge WebDriver with the crawler's browser options."""
    options = Options()
    for argument in (
        "--lang=en-US",
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-gpu",
        "--disable-extensions",
        "--disable-browser-side-navigation",
        "--disable-infobars",
        "--disable-blink-features=AutomationControlled",
    ):
        options.add_argument(argument)
    service = Service(executable_path=driver_path)
    return webdriver.Edge(service=service, options=options)


def _click_load_more(driver, target_count):
    """Click the "100 more" button until ``target_count`` rows were requested.

    The search page starts with 100 rows and each click asks for 100 more.
    Stops early (with a message) when the button cannot be found or clicked.
    """
    loaded = 100
    while loaded < target_count:
        try:
            load_more_button = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located(
                    (
                        By.XPATH,
                        "//button[contains(@class, 'ipc-btn') and .//span[contains(text(), '100 more')]]",
                    )
                )
            )
            # Click through JavaScript so overlays cannot intercept the click.
            driver.execute_script(
                "arguments[0].scrollIntoView(true);", load_more_button
            )
            driver.execute_script("arguments[0].click();", load_more_button)
        except Exception as e:
            print(f"No more 'Load More' button or error: {e}")
            break
        time.sleep(3)  # Wait for new data
        loaded += 100


def _parse_search_results(soup):
    """Turn the search-results page into one dict per listed movie."""
    container = soup.find(
        "ul",
        class_="ipc-metadata-list ipc-metadata-list--dividers-between sc-748571c8-0 gFCVNT detailed-list-view ipc-metadata-list--base",
    )
    if container is None:
        # The "sc-..." classes look build-specific; if they no longer match,
        # search the whole page for the summary items instead of crashing.
        container = soup

    films_data = []
    for film in container.find_all("li", class_="ipc-metadata-list-summary-item"):
        metadata_div = film.find(
            "div", class_="sc-300a8231-6 dBUjvq dli-title-metadata"
        ) or film.find("div", class_="dli-title-metadata")
        spans = metadata_div.find_all("span") if metadata_div else []
        # Positional fields: year, duration, MPA rating; pad the missing ones.
        year_text, duration, mpa = ([span.text for span in spans[:3]] + [None] * 3)[:3]

        link_tag = film.find("a", class_="ipc-lockup-overlay ipc-focusable")
        href = link_tag.get("href") if link_tag else None

        films_data.append(
            {
                "Title": safe_extract(film, ["h3", {"class": "ipc-title__text"}]),
                "Year": year_text,
                "Duration": duration,
                "MPA": mpa,
                "Rating": safe_extract(
                    film, ["span", {"class": "ipc-rating-star--rating"}]
                ),
                "Votes": safe_extract(
                    film,
                    ["span", {"class": "ipc-rating-star--voteCount"}],
                    # Drop non-breaking spaces, then the first and last character.
                    processing=lambda x: x.strip().replace("\xa0", "")[1:-1],
                ),
                "meta_score": safe_extract(
                    film, ["span", {"class": "metacritic-score-box"}]
                ),
                "description": safe_extract(
                    film,
                    ["div", {"class": "ipc-html-content-inner-div"}],
                    processing=lambda x: x.strip(),
                ),
                "Movie Link": f"https://www.imdb.com{href}" if href else None,
            }
        )
    return films_data


def _parse_movie_page(soup, link):
    """Collect box-office, credit, metadata and award fields from a title page.

    Every field is best-effort: a section that cannot be found yields ``None``.
    NOTE(review): the ``data-testid`` selectors are kept exactly as they were;
    they have not been re-verified against the live page markup.
    """
    principal_credits = soup.find(
        "div", {"data-testid": "title-pc-principal-credits"}
    )
    details = soup.find("section", {"data-testid": "Details"})
    content_div = ["div", {"class": "ipc-metadata-list-item__content-container"}]
    value_span = ["span", {"class": "ipc-metadata-list-item__list-content-item"}]

    return {
        "link": link,
        # Box Office Data
        "budget": _tag_value(
            soup.find("section", {"data-testid": "BoxOffice"}),
            content_div,
            lambda tag: tag.find("span").text.strip(),
        ),
        "grossWorldWide": _tag_value(
            soup.find("div", {"data-testid": "title-boxoffice-cumulative"}),
            value_span,
            _stripped_text,
        ),
        "gross_US_Canada": _tag_value(
            soup.find("div", {"data-testid": "title-boxoffice-grossdomestic"}),
            value_span,
            _stripped_text,
        ),
        # Credits
        "writers": _tag_value(
            principal_credits, ["li", {"data-testid": "title-pc-writer"}], _anchor_texts
        ),
        "directors": _tag_value(
            principal_credits,
            ["li", {"data-testid": "title-pc-director"}],
            _anchor_texts,
        ),
        "stars": _tag_value(
            principal_credits, ["li", {"data-testid": "title-pc-actors"}], _anchor_texts
        ),
        # Metadata
        "genres": _tag_value(
            soup.find("div", {"data-testid": "genres"}),
            ["div", {"class": "ipc-chip-list"}],
            lambda tag: [
                genre.text.strip()
                for genre in tag.find_all("span", class_="ipc-chip__text")
            ],
        ),
        "Languages": _tag_value(
            details, ["div", {"data-testid": "title-details-languages"}], _anchor_texts
        ),
        "countries_origin": _tag_value(
            details, ["div", {"data-testid": "title-details-origin"}], _anchor_texts
        ),
        "production_companies": _tag_value(
            details, ["div", {"data-testid": "title-details-companies"}], _anchor_texts
        ),
        "filming_locations": _tag_value(
            soup.find("section", {"data-testid": "Locations"}),
            content_div,
            _anchor_texts,
        ),
        # Awards
        "awards": _tag_value(
            soup.find("div", {"data-testid": "awards"}), content_div, _award_counts
        ),
        "release_date": _tag_value(
            details,
            ["div", {"data-testid": "title-details-release-date"}],
            lambda tag: tag.find("a").text.split(" (")[0].strip(),
        ),
    }


def crawl_imdb_movies(
    year: int, max_movies: int = 600, driver_path: str = "edgedriver.exe"
) -> "pd.DataFrame | None":
    """
    Crawl IMDb movie data for a specific year

    Opens the IMDb feature-film search for ``year`` sorted by US box-office
    gross, expands the list, scrapes the listing, then visits every movie page
    for detailed fields. Three CSV files are written to ``Data/<year>/``:
    ``imdb_movies_<year>.csv`` (listing), ``advanced_movies_details_<year>.csv``
    (per-movie details) and ``merged_movies_data_<year>.csv`` (inner join of
    both on the movie link).

    Args:
        year (int): Year to crawl movie data for
        max_movies (int): Number of listing rows to request before scraping;
            rows are loaded 100 at a time.
        driver_path (str): Path to the Edge WebDriver executable.

    Returns:
        The merged DataFrame, or ``None`` when the crawl failed (the error is
        printed). Failing to start the browser raises instead, because the
        driver is created before the guarded section.
    """
    # Create output directories
    output_dir = os.path.join("Data", str(year))
    os.makedirs(output_dir, exist_ok=True)

    # Construct URL with dynamic year
    search_url = f"https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31&count=100&sort=boxoffice_gross_us,desc"

    driver = _build_edge_driver(driver_path)
    try:
        driver.get(search_url)
        time.sleep(2)  # Initial page load wait
        _click_load_more(driver, max_movies)

        films_data = _parse_search_results(
            BeautifulSoup(driver.page_source, "html.parser")
        )
        if not films_data:
            raise RuntimeError(f"no movies found on the search page for {year}")

        # Save initial movies data
        initial_movies_df = pd.DataFrame(films_data)
        initial_movies_df.to_csv(
            os.path.join(output_dir, f"imdb_movies_{year}.csv"), index=False
        )

        # Advanced movie details collection; rows without a link cannot be visited.
        all_movie_data = []
        for link in initial_movies_df["Movie Link"].dropna():
            try:
                driver.get(link)
                time.sleep(2)
                page_soup = BeautifulSoup(driver.page_source, "html.parser")
                all_movie_data.append(_parse_movie_page(page_soup, link))
            except Exception as e:
                # One broken page must not abort the whole year.
                print(f"Error processing {link}: {e}")

        # Save advanced movie details (keeps the original "link" column name)
        advanced_movies_df = pd.DataFrame(all_movie_data)
        advanced_movies_df.to_csv(
            os.path.join(output_dir, f"advanced_movies_details_{year}.csv"),
            index=False,
        )

        # Merge datasets
        merged_data = pd.merge(
            initial_movies_df,
            advanced_movies_df.rename(columns={"link": "Movie Link"}),
            how="inner",
            on="Movie Link",
        )
        merged_data.to_csv(
            os.path.join(output_dir, f"merged_movies_data_{year}.csv"), index=False
        )
        return merged_data

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

    finally:
        # Always release the browser, even after a failure.
        driver.quit()

In [None]:
# Crawl one batch of years per run; each year opens its own browser session
# and writes its CSV files under Data/<year>/.
#
# Other batches, kept for reference (uncomment one to run it instead):
# years_to_crawl = range(1966, 2014)
# for year in years_to_crawl:
#     print(f"Crawling data for year {year}")
#     crawl_imdb_movies(year)
# years_to_crawl = range(2019, 2024)
# for year in years_to_crawl:
#     print(f"Crawling data for year {year}")
#     crawl_imdb_movies(year)

# Active batch: 2014 through 2018 (range end is exclusive).
years_to_crawl = range(2014, 2019)
for year in years_to_crawl:
    print(f"Crawling data for year {year}")
    crawl_imdb_movies(year)