# 1. Generate Movie Table

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

In [2]:
genres = []

In [3]:
def fetch_movie_page(imdb_url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    response = requests.get(imdb_url, headers=headers)
    if response.ok:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        return None

In [4]:
def get_top_250():
    imdb_url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"
    top_250_page = fetch_movie_page(imdb_url)
    movies = top_250_page.find_all('li', class_='ipc-metadata-list-summary-item sc-10233bc-0 iherUv cli-parent')
    movie_details = []
    for movie in movies:
        title_element = movie.find('h3', class_='ipc-title__text')
        if title_element:
            movie_name = title_element.text.strip()
            cleaned_text = re.sub(r'^\d+\.\s*', '', movie_name)
            link_element = movie.find('a', class_='ipc-title-link-wrapper')
            movie_link = f"https://www.imdb.com{link_element['href']}" if link_element else "No link found"
            movie_details.append((cleaned_text, movie_link))
    return movie_details

top_250 = get_top_250()
print(top_250)

[('The Shawshank Redemption', 'https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1'), ('The Godfather', 'https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2'), ('The Dark Knight', 'https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3'), ('The Godfather Part II', 'https://www.imdb.com/title/tt0071562/?ref_=chttp_t_4'), ('12 Angry Men', 'https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5'), ("Schindler's List", 'https://www.imdb.com/title/tt0108052/?ref_=chttp_t_6'), ('The Lord of the Rings: The Return of the King', 'https://www.imdb.com/title/tt0167260/?ref_=chttp_t_7'), ('Pulp Fiction', 'https://www.imdb.com/title/tt0110912/?ref_=chttp_t_8'), ('The Lord of the Rings: The Fellowship of the Ring', 'https://www.imdb.com/title/tt0120737/?ref_=chttp_t_9'), ('The Good, the Bad and the Ugly', 'https://www.imdb.com/title/tt0060196/?ref_=chttp_t_10'), ('Forrest Gump', 'https://www.imdb.com/title/tt0109830/?ref_=chttp_t_11'), ('The Lord of the Rings: The Two Towers', 'https://www.imdb.com/ti

In [5]:
def budget_scale(amount):
    if amount < 1000000:
        return "under 1 million USD"
    elif amount >= 1000000 and amount < 10000000:
        return "1-10 million USD"
    elif amount >= 10000000 and amount < 25000000:
        return "10-25 million USD"
    elif amount >= 25000000 and amount < 50000000:
        return "25-50 million USD"
    else:
        return "50 million USD or more"

In [6]:
def get_budget_and_genre(top_250):
    movies = []
    for title, imdb_url in top_250:
        movie_page = fetch_movie_page(imdb_url)
        budget = None
        if movie_page:
            budget_item = movie_page.find('li', {'data-testid': 'title-boxoffice-budget'})
            if budget_item:
                budget_content = budget_item.find('span', {'class': 'ipc-metadata-list-item__list-content-item'})
                if budget_content:
                    amount = budget_content.text.strip()
                    number = re.findall(r'\d+[\d,\.]*', amount)
                    if number:
                        budget = int(''.join(number).replace(',', ''))
                        budget = budget_scale(budget)
            scroller = movie_page.find('div', class_='ipc-chip-list__scroller')
            genres = []
            if scroller:
                genre_spans = scroller.find_all('span', class_='ipc-chip__text')
                genres = [span.text for span in genre_spans]
            
            video_tag = movie_page.find('video', {'class': ['jw-video', 'jw-reset']})
            video_url = video_tag['src'] if video_tag else 'No video found'

            movies.append({
                'title': title,
                'imdb_url': imdb_url,
                'budget': budget,
                'genres': genres,
                'trailer_url': video_url
            })      
    return movies
movies = get_budget_and_genre(top_250)
print(movies)

[{'title': 'The Shawshank Redemption', 'imdb_url': 'https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1', 'budget': '25-50 million USD', 'genres': ['Drama'], 'trailer_url': 'No video found'}, {'title': 'The Godfather', 'imdb_url': 'https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2', 'budget': '1-10 million USD', 'genres': ['Crime', 'Drama'], 'trailer_url': 'No video found'}, {'title': 'The Dark Knight', 'imdb_url': 'https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3', 'budget': '50 million USD or more', 'genres': ['Action', 'Crime', 'Drama'], 'trailer_url': 'No video found'}, {'title': 'The Godfather Part II', 'imdb_url': 'https://www.imdb.com/title/tt0071562/?ref_=chttp_t_4', 'budget': '10-25 million USD', 'genres': ['Crime', 'Drama'], 'trailer_url': 'No video found'}, {'title': '12 Angry Men', 'imdb_url': 'https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5', 'budget': 'under 1 million USD', 'genres': ['Crime', 'Drama'], 'trailer_url': 'No video found'}, {'title': "Schindler's

In [7]:
def write_movies_to_csv(movies, filename='movies.csv'):
    fieldnames = ['title', 'imdb_url', 'budget', 'genres', 'trailer_url']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for movie in movies:
            writer.writerow(movie)
write_movies_to_csv(movies)

# 2. Generate Shots

In [8]:
import cv2

In [9]:
def gen_shorts(video_url):
    cap = cv2.VideoCapture(video_url)
    if not cap.isOpened():
        print("Error opening video stream")

    prev_frame = None

    while True:
        ret, cur_frame = cap.read()
        if not ret:
            break
        if prev_frame is not None:
            # difference = frame_diff(prev_frame, cur_frame)
            # if difference > 1000000:
            #     print("Possible scene change detected")
        prev_frame = cur_frame

    cap.release()

IndentationError: expected an indented block after 'if' statement on line 12 (3466557454.py, line 16)

In [11]:
tf = fetch_movie_page('https://www.imdb.com/title/tt0068646/videogallery/')
print(tf.find('a', class_='ipc-chip-list__scroller'))

None
