In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Root of the IMDb site; page paths and scraped relative links are appended to it.
BASE_URL = "https://www.imdb.com"

In [3]:
# HTTP request headers: prefer English content and send a browser-like User-Agent.
headers = {"Accept-Language": "en-US,en;q=0.8", "User-Agent": "Mozilla/5.0"}


In [24]:
# Scrape the title and canonical URL of every row found on the IMDb Top 250 chart page.
url = f"{BASE_URL}/chart/top/?ref_=nv_mv_250"
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)
# Surface 4xx/5xx responses instead of silently parsing an error page into zero rows.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
rows = soup.select("li.ipc-metadata-list-summary-item")
data = []
for row in rows:
    heading = row.select_one("h3.ipc-title__text").get_text(strip=True)
    # Drop only a leading "<rank>. " prefix; titles that themselves contain ". "
    # (for example "Dr. Strangelove ...") are kept intact.
    rank, sep, rest = heading.partition(". ")
    title = rest if sep and rank.isdigit() else heading
    # Drop the query string so only the bare title path is stored.
    link = row.select_one("a.ipc-title-link-wrapper")["href"].split("?")[0]
    data.append({
        "title": title,
        # Built inline so the chart `url` above is not overwritten inside the loop.
        "url": BASE_URL + link,
    })

In [25]:
# Number of records currently held in `data`.
len(data)

25

In [89]:
import json
import requests
from bs4 import BeautifulSoup


# Root of the IMDb site; page paths are appended to it.
BASE_URL = "https://www.imdb.com"
# Headers passed to every `requests.get` call that uses `HEADERS`:
# a desktop-browser User-Agent string and an HTML Accept type.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) Chrome/126.0.0.0 Safari/537.36',
    'accept': 'text/html',
}


def get_movies_with_some_details() -> list:
    """Fetch the IMDb Top 250 chart and return basic details for each movie.

    The listing is read from the JSON-LD ``<script>`` block embedded in the
    chart page rather than from the rendered HTML rows.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``duration`` and
        ``rating``. ``duration`` is the JSON-LD value with its first two
        characters removed (presumably the ISO-8601 ``PT`` prefix, leaving
        values such as ``1H41M``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page contains no JSON-LD script block.
    """
    url = f"{BASE_URL}/chart/top/?ref_=nv_mv_250"
    # A timeout prevents the call from blocking forever on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=30)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    script = soup.select_one("script[type='application/ld+json']")
    if script is None:
        # select_one returns None when nothing matches; report that explicitly
        # rather than failing with an AttributeError on `.text`.
        raise ValueError(f"No JSON-LD block found at {url}")
    content = json.loads(script.text)

    data = []
    for entry in content["itemListElement"]:
        item = entry["item"]
        data.append({
            "title": item["name"],
            "url": item["url"],
            "duration": item["duration"][2:],
            "rating": item["aggregateRating"]["ratingValue"],
        })

    return data

def get_further_details(url: str) -> tuple:
    """Read genres, release date, directors and stars from a title page.

    The values come from the JSON-LD ``<script>`` block embedded in the page.

    Args:
        url: Absolute URL of the movie's title page.

    Returns:
        A 4-tuple ``(genres, timestamp, directors, stars)``. ``timestamp`` is
        the ``datePublished`` value or ``""`` when absent; the other three
        fall back to empty lists when their key is missing.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page contains no JSON-LD script block.
    """
    # A timeout prevents the call from blocking forever on a stalled connection.
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    script = soup.select_one("script[type='application/ld+json']")
    if script is None:
        raise ValueError(f"No JSON-LD block found at {url}")
    content = json.loads(script.text)

    # Every field is treated as optional so one sparse page does not abort a
    # long scraping run with a KeyError.
    genres = content.get("genre", [])
    timestamp = content.get("datePublished", "")
    # Only entries typed as "Person" are kept as directors.
    directors = [
        person["name"]
        for person in content.get("director", [])
        if person.get("@type") == "Person"
    ]
    stars = [person["name"] for person in content.get("actor", [])]

    return genres, timestamp, directors, stars


def get_synopsis(url: str) -> str:
    """Return the text of the last content block on a title's plot-summary page.

    Args:
        url: Absolute URL of the movie's title page, with or without a
            trailing slash.

    Returns:
        The stripped text of the last ``div.ipc-html-content-inner-div``
        element on ``<url>/plotsummary/``, or ``""`` when the page has none.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Normalise the trailing slash so both ".../tt123/" and ".../tt123" work.
    plot_url = url.rstrip("/") + "/plotsummary/"
    # A timeout prevents the call from blocking forever on a stalled connection.
    response = requests.get(plot_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    blocks = soup.select("div.ipc-html-content-inner-div")
    if not blocks:
        # No matching element: return an empty synopsis instead of an IndexError.
        return ""

    return blocks[-1].get_text(strip=True)


# Collect the chart entries, enrich each one with the details from its title
# page and plot-summary page, then write the full list to disk as JSON.
data = get_movies_with_some_details()
for index, movie in enumerate(data):
    url = movie["url"]
    # Both lookups run before the record is touched, so a failed request
    # leaves the current record unmodified.
    synopsis = get_synopsis(url=url)
    genres, timestamp, directors, stars = get_further_details(url=url)
    data[index].update(
        genres=genres,
        date=timestamp,
        directors=directors,
        stars=stars,
        synopsis=synopsis,
    )

with open("imdb_top250_movies.json", mode="w") as f:
    json.dump(obj=data, fp=f)


In [90]:
# Inspect `movie`, the loop variable left bound to the last record processed.
movie

{'title': 'Groundhog Day',
 'url': 'https://www.imdb.com/title/tt0107048/',
 'duration': '1H41M',
 'rating': 8,
 'genres': ['Comedy', 'Drama', 'Fantasy'],
 'date': '1993-04-29',
 'directors': ['Harold Ramis'],
 'stars': ['Bill Murray', 'Andie MacDowell', 'Chris Elliott'],
 'synopsis': 'On February 1, self-centered and sour TV meteorologist Phil Connors (Bill Murray), news producer Rita Hanson (Andie MacDowell) and cameraman Larry (Chris Elliott) from fictional Pittsburgh television station WPBH-TV9 travel to Punxsutawney, Pennsylvania, to cover the annual Groundhog Day festivities with Punxsutawney Phil, the Groundhog. Having grown tired of this assignment, Phil begrudgingly gives his Groundhog Day report the next day (February 2) during the festival and parade.After the celebration concludes, a blizzard develops that Connors had predicted would miss them, closing the roads and shutting down long-distance phone services, forcing the team to return to Punxsutawney. However, Connors awak

In [None]:
def get_movies_with_some_details() -> list:
    """Collect basic details for the IMDb Top 250 from the paginated title search.

    Ten result pages of 25 rows each are requested (``start`` = 1, 26, ..., 226)
    and every list row is parsed from the returned HTML. Running this cell
    replaces the earlier JSON-LD based function of the same name.

    NOTE(review): recorded outputs in this notebook show 0 rows and 25 unique
    titles for ``data`` — verify that this endpoint still honours ``start``
    and serves the rows in static HTML.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``year``,
        ``runtime`` and ``rating``. ``year``, ``runtime`` and ``rating`` are
        ``None`` when the corresponding element is missing from a row.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    data = []
    for start in range(1, 250, 25):
        page_url = f"{BASE_URL}/search/title/?groups=top_250&sort=user_rating&count=25&start={start}"
        # A timeout prevents the call from blocking forever on a stalled connection.
        response = requests.get(page_url, headers=headers, timeout=30)
        # Fail loudly on an error status instead of silently collecting zero rows.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        for row in soup.select("li.ipc-metadata-list-summary-item"):
            heading = row.select_one("h3.ipc-title__text").get_text(strip=True)
            # Drop only a leading "<rank>. " prefix; titles that themselves
            # contain ". " (for example "Dr. Strangelove ...") are kept intact.
            rank, sep, rest = heading.partition(". ")
            title = rest if sep and rank.isdigit() else heading
            # Drop the query string so only the bare title path is stored.
            link = row.select_one("a.ipc-title-link-wrapper")["href"].split("?")[0]
            meta = [
                item.get_text(strip=True)
                for item in row.select("span.dli-title-metadata-item")
            ]
            rating_tag = row.select_one("span.ipc-rating-star--rating")
            data.append({
                "title": title,
                "url": BASE_URL + link,
                # The first two metadata items are read as year and runtime.
                "year": meta[0] if len(meta) > 0 else None,
                "runtime": meta[1] if len(meta) > 1 else None,
                "rating": rating_tag.get_text(strip=True) if rating_tag is not None else None,
            })
    return data


# def get_movies_with_some_details():
#     data = []
#     for start in range(1, 251, 50):  
#         url = f"{BASE_URL}/search/title/?groups=top_250&sort=user_rating&count=50&start={start}"
#         response = requests.get(url, headers=headers)
#         soup = BeautifulSoup(response.text, "html.parser")

#         for row in soup.select("div.lister-item-content"):
#             title_tag = row.select_one("h3.lister-item-header a")
#             title = title_tag.get_text(strip=True)
#             movie_url = BASE_URL + title_tag["href"].split("?")[0]

#             year = row.select_one("span.lister-item-year")
#             year = year.get_text(strip=True) if year else None

#             rating = row.select_one("div.inline-block.ratings-imdb-rating strong")
#             rating = rating.get_text(strip=True) if rating else None

#             data.append({
#                 "title": title,
#                 "url": movie_url,
#                 "year": year,
#                 "rating": rating
#             })
#     return data

In [None]:
# Rebuild `data` with whichever get_movies_with_some_details definition was executed last.
data = get_movies_with_some_details()

In [16]:
# Number of records currently held in `data`.
len(data)

0

In [17]:
# Count distinct titles to spot duplicated records.
len({record["title"] for record in data})

0

In [None]:
def get_further_details(url: str) -> tuple:
    """Scrape genres, directors and stars from the HTML of a title page.

    Args:
        url: Absolute URL of the movie's title page.

    Returns:
        A 3-tuple ``(genres, directors, stars)`` of lists of strings. A list
        is empty when the page section it is read from is not present.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # A timeout prevents the call from blocking forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Genres are the text of the chips in the non-wrapping chip list.
    genres = [
        chip.get_text(strip=True)
        for chip in soup.select("div.ipc-chip-list--nowrap span.ipc-chip__text")
    ]

    # Query the credit rows once; directors are read from the first matching
    # row and stars from the third.
    credit_rows = soup.select("li.ipc-metadata-list__item--align-end")

    directors = []
    if len(credit_rows) > 0:
        directors = [anchor.get_text(strip=True) for anchor in credit_rows[0].select("a")]

    stars = []
    if len(credit_rows) > 2:
        for anchor in credit_rows[2].select("a"):
            name = anchor.get_text(strip=True)
            # Skip empty anchors and any link whose text contains "Stars".
            if "Stars" not in name and name != "":
                stars.append(name)

    return genres, directors, stars


def get_synopsis(url: str) -> str:
    """Return the text of the last content block on a title's plot-summary page.

    Args:
        url: Absolute URL of the movie's title page, with or without a
            trailing slash.

    Returns:
        The stripped text of the last ``div.ipc-html-content-inner-div``
        element on ``<url>/plotsummary/``, or ``""`` when the page has none.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Normalise the trailing slash so both ".../tt123/" and ".../tt123" work.
    plot_url = url.rstrip("/") + "/plotsummary/"
    # A timeout prevents the call from blocking forever on a stalled connection.
    response = requests.get(plot_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    blocks = soup.select("div.ipc-html-content-inner-div")
    if not blocks:
        # No matching element: return an empty synopsis instead of an IndexError.
        return ""

    return blocks[-1].get_text(strip=True)


In [None]:
# Enrich every record in `data` with details scraped from its title page
# and plot-summary page.
for index, movie in enumerate(data):
    url = movie["url"]
    # Both lookups run before the record is touched, so a failed request
    # leaves the current record unmodified.
    synopsis = get_synopsis(url=url)
    genres, directors, stars = get_further_details(url=url)
    data[index].update(
        genres=genres,
        directors=directors,
        stars=stars,
        synopsis=synopsis,
    )

In [257]:
import json

# Persist the collected records to disk as a JSON document.
with open("data.json", mode="w") as f:
    json.dump(obj=data, fp=f)

In [264]:
import pandas as pd

# Count distinct titles. Bracket indexing is used because attribute-style
# column access can collide with DataFrame attributes and methods.
pd.DataFrame(data)["title"].nunique()

25