# 1. Generate Movie Table

In [6]:
import requests
import re
from bs4 import BeautifulSoup
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

In [7]:
genres = []

In [8]:
def fetch_movie_page(imdb_url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    response = requests.get(imdb_url, headers=headers)
    if response.ok:
        return BeautifulSoup(response.text, 'html.parser')
    else:
        return None

In [9]:
def get_top_250():
    imdb_url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"
    top_250_page = fetch_movie_page(imdb_url)
    movies = top_250_page.find_all('li', class_='ipc-metadata-list-summary-item sc-10233bc-0 iherUv cli-parent')
    movie_details = []
    for movie in movies:
        title_element = movie.find('h3', class_='ipc-title__text')
        if title_element:
            movie_name = title_element.text.strip()
            cleaned_text = re.sub(r'^\d+\.\s*', '', movie_name)
            link_element = movie.find('a', class_='ipc-title-link-wrapper')
            movie_link = f"https://www.imdb.com{link_element['href']}" if link_element else "No link found"
            movie_details.append((cleaned_text, movie_link))
    return movie_details

top_250 = get_top_250()
print(top_250)

[('The Shawshank Redemption', 'https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1'), ('The Godfather', 'https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2'), ('The Dark Knight', 'https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3'), ('The Godfather Part II', 'https://www.imdb.com/title/tt0071562/?ref_=chttp_t_4'), ('12 Angry Men', 'https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5'), ("Schindler's List", 'https://www.imdb.com/title/tt0108052/?ref_=chttp_t_6'), ('The Lord of the Rings: The Return of the King', 'https://www.imdb.com/title/tt0167260/?ref_=chttp_t_7'), ('Pulp Fiction', 'https://www.imdb.com/title/tt0110912/?ref_=chttp_t_8'), ('The Lord of the Rings: The Fellowship of the Ring', 'https://www.imdb.com/title/tt0120737/?ref_=chttp_t_9'), ('The Good, the Bad and the Ugly', 'https://www.imdb.com/title/tt0060196/?ref_=chttp_t_10'), ('Forrest Gump', 'https://www.imdb.com/title/tt0109830/?ref_=chttp_t_11'), ('The Lord of the Rings: The Two Towers', 'https://www.imdb.com/ti

In [10]:
def budget_scale(amount):
    if amount < 1000000:
        return "under 1 million USD"
    elif amount >= 1000000 and amount < 10000000:
        return "1-10 million USD"
    elif amount >= 10000000 and amount < 25000000:
        return "10-25 million USD"
    elif amount >= 25000000 and amount < 50000000:
        return "25-50 million USD"
    else:
        return "50 million USD or more"

In [14]:
def get_budget_and_genre(top_250):
    movies = []
    for title, imdb_url in top_250:
        movie_page = fetch_movie_page(imdb_url)
        budget = None
        if movie_page:
            budget_item = movie_page.find('li', {'data-testid': 'title-boxoffice-budget'})
            if budget_item:
                budget_content = budget_item.find('span', {'class': 'ipc-metadata-list-item__list-content-item'})
                if budget_content:
                    amount = budget_content.text.strip()
                    number = re.findall(r'\d+[\d,\.]*', amount)
                    if number:
                        budget = int(''.join(number).replace(',', ''))
                        budget = budget_scale(budget)
            scroller = movie_page.find('div', class_='ipc-chip-list__scroller')
            genres = []
            if scroller:
                genre_spans = scroller.find_all('span', class_='ipc-chip__text')
                genres = [span.text for span in genre_spans]
            
            video_tag = movie_page.find('video', {'class': ['jw-video', 'jw-reset']})
            video_url = video_tag['src'] if video_tag else 'No video found'

            movies.append({
                'title': title,
                'imdb_url': imdb_url,
                'budget': budget,
                'genres': genres,
                'trailer_url': video_url
            })      
    return movies
movies = get_budget_and_genre(top_250)
for movie in movies:
    print(movie)

{'title': 'The Shawshank Redemption', 'imdb_url': 'https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1', 'budget': '25-50 million USD', 'genres': ['Drama'], 'trailer_url': 'No video found'}
{'title': 'The Godfather', 'imdb_url': 'https://www.imdb.com/title/tt0068646/?ref_=chttp_t_2', 'budget': '1-10 million USD', 'genres': ['Crime', 'Drama'], 'trailer_url': 'No video found'}
{'title': 'The Dark Knight', 'imdb_url': 'https://www.imdb.com/title/tt0468569/?ref_=chttp_t_3', 'budget': '50 million USD or more', 'genres': ['Action', 'Crime', 'Drama'], 'trailer_url': 'No video found'}
{'title': 'The Godfather Part II', 'imdb_url': 'https://www.imdb.com/title/tt0071562/?ref_=chttp_t_4', 'budget': '10-25 million USD', 'genres': ['Crime', 'Drama'], 'trailer_url': 'No video found'}
{'title': '12 Angry Men', 'imdb_url': 'https://www.imdb.com/title/tt0050083/?ref_=chttp_t_5', 'budget': 'under 1 million USD', 'genres': ['Crime', 'Drama'], 'trailer_url': 'No video found'}
{'title': "Schindler's List"

In [12]:
def write_movies_to_csv(movies, filename='movies.csv'):
    fieldnames = ['title', 'imdb_url', 'budget', 'genres', 'trailer_url']
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for movie in movies:
            writer.writerow(movie)
write_movies_to_csv(movies)

# 2. Generate Shots

In [46]:
import cv2
import numpy as np
import youtube_dl
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import os
import datetime

In [24]:
def frame_diff(prev_frame, cur_frame):
    # Convert frames to grayscale
    prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    cur_gray = cv2.cvtColor(cur_frame, cv2.COLOR_BGR2GRAY)
    
    # Compute difference and use threshold to find significant changes
    diff = cv2.absdiff(prev_gray, cur_gray)
    _, diff = cv2.threshold(diff, 50, 255, cv2.THRESH_BINARY)
    return np.sum(diff)

def gen_shorts(video_url):
    cap = cv2.VideoCapture(video_url)
    if not cap.isOpened():
        print("Error opening video stream")
        return []

    prev_frame = None
    shots = []

    while True:
        ret, curr_frame = cap.read()
        if not ret:
            break
        if prev_frame is not None:
            difference = frame_diff(prev_frame, curr_frame)
            if difference > 1000000:
                shots.append(curr_frame)
        prev_frame = curr_frame

    cap.release()
    return shots

def get_trailer_url(imdb_url):
    response = requests.get(imdb_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find the trailer video link, typically embedded within an iframe or a specific HTML structure
    # This might need to be adjusted depending on the actual structure of the IMDB page
    video_section = soup.find('div', {'class': 'ipc-shoveler'})  # Often trailers are inside a carousel shoveler in IMDb
    if video_section:
        video_link = video_section.find('a', href=True)
        if video_link and 'video' in video_link['href']:
            return 'https://www.imdb.com' + video_link['href']
    return None

def download_trailer(video_url, output_filename):
    ydl_opts = {
        'format': 'bestvideo',
        'outtmpl': output_filename
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

In [44]:

# def get_video_src_from_imdb(imdb_url):
#     service = Service(ChromeDriverManager().install())
#     driver = webdriver.Chrome(service=service)

#     try:
#         driver.get(imdb_url)
#         video_element = WebDriverWait(driver, 10).until(
#             EC.presence_of_element_located((By.TAG_NAME, 'video'))
#         )
#         video_src = video_element.get_attribute('src')

#         return video_src
#     except Exception as e:
#         print(f"An error occurred: {e}")
#         return None
#     finally:
#         driver.quit()

imdb_url = 'https://www.imdb.com/title/tt0047478/'  # Example URL
video_url = get_video_src_from_imdb(imdb_url)
print(f"Video URL: {video_url}")



Video URL: https://imdb-video.media-imdb.com/vi2923348761/1434659607842-pgv4ql-1641601248908.mp4?Expires=1713532987&Signature=EdYab-rgHYliY~gFVrR3PyTFSr2Aw10fiO~RLBiOvILxQqbkk62MlkUqCFuOGEV024BQ6WZfyKYmh9Vt79p9YMn~589dK2rr-m1RNeoP1610C1jD0uvlWM6zBGdAHPcKvbbmR~O99TRMHkLmoyalaisUcbkJ5w7ALrca5W7ZBYq-96hZ8gf9SS0gTawuss3~KgYxw5MX-CZvJiEgHI7SAO6~XbbkCxNlylEnsczakymf0qer8hP3SCSDhPdeuGgN5VrKVdsQyCfOWKesOUNaM9BtrI5upLNhqDFCn5Fc1e0GKZrrVzdYq5r3eA-t5mh8gXjETqOmcXks7sGetoAzXA__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA


In [58]:
def download_video(video_url, output_path='/Users/zhongqihe/utm/year4/year4_winter/csc420/CinemaScopeAI/video/trailer.mp4'):
    if os.path.exists(output_path):
        os.remove(output_path)
    response = requests.get(video_url, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    file.write(chunk)
        return output_path
    else:
        print("Error downloading the video.")
        return None

def detect_shots(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)

    shots = [0]  # Start with the first frame as the first shot
    # Initialize histogram for the first frame
    ret, prev_frame = cap.read()
    if not ret:
        print("Failed to read the video")
        return []
    prev_hist = cv2.calcHist([prev_frame], [0], None, [256], [0,256])

    # Initialize variables for adaptive thresholding
    frame_diffs = []
    mean_diff = 0
    std_diff = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Calculate histogram of the current frame and calculate the difference
        curr_hist = cv2.calcHist([frame], [0], None, [256], [0,256])
        diff = cv2.compareHist(prev_hist, curr_hist, cv2.HISTCMP_CHISQR)
        frame_diffs.append(diff)

        # Update mean and standard deviation
        mean_diff = np.mean(frame_diffs[-30:])  # Consider the last 30 frames
        std_diff = np.std(frame_diffs[-30:]) if len(frame_diffs) > 30 else 0

        # Dynamic threshold
        dynamic_threshold = mean_diff + 4 * std_diff

        # If the difference is significant, mark it as a shot boundary
        if diff > dynamic_threshold:
            frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            shots.append(frame_number)
            # print(f"Shot change at frame {frame_number} (time: {frame_number / frame_rate}s)")

        prev_hist = curr_hist

    cap.release()
    return shots

def extract_random_frames(video_path, shots):
    cap = cv2.VideoCapture(video_path)
    random_frames = []
    
    for i in range(len(shots) - 1):
        # Generate a random frame number between two shot boundaries
        frame_number = np.random.randint(shots[i], shots[i + 1])
        
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
        if ret:
            random_frames.append((frame_number,frame))
        else:
            print("Error: Couldn't read frame number ", frame_number)
    
    cap.release()
    return random_frames

video_path = download_video(video_url)
video_path = "https://imdb-video.media-imdb.com/vi696162841/1434659607842-pgv4ql-1564159556047.mp4?Expires=1713540729&Signature=hG9qlK4FCoIYdZzExdL-NN~p5sQwRdyFQDJRwUYTqBESiKCkxkYIgOAHshwl7EhXDvoj-SLt-e6VEuX~74LYkYq-WWPx0bKkQlDJRb2ZhJFHHTjW8k6MRSpowzMnsPzsVveXmeBNR1PDx995j0rp8aTgEMui0xQrAxp-7Ad857lgqKh8pOsdGocC5AwACbSlaAenD8R06Wxcr7oP9fe4fDoIGudoyrt3ckgqf6NUJmk-1zun5HePLKZ2ROucxamDRBy0X59cduIDjfRgJOOJdKI8f1M~Gvh~uQjD~JDQEbUPe0Dv-LrFQhVIbp9ifhhDcGvTTGx6XzBC8ZlId1-FBA__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA"

shots = detect_shots(video_path)
random_frames_info = extract_random_frames(video_path, shots)

for i, (frame_number, frame) in enumerate(random_frames_info):
    frame_path = f'frame_{i}_{frame_number}.jpg'
    cv2.imwrite(frame_path, frame)
    print(f'Saved frame {frame_number} as {frame_path}')


Saved frame 6 as frame_0_6.jpg
Saved frame 18 as frame_1_18.jpg
Saved frame 22 as frame_2_22.jpg
Saved frame 23 as frame_3_23.jpg
Saved frame 24 as frame_4_24.jpg
Saved frame 26 as frame_5_26.jpg
Saved frame 27 as frame_6_27.jpg
Saved frame 77 as frame_7_77.jpg
Saved frame 188 as frame_8_188.jpg
Saved frame 189 as frame_9_189.jpg
Saved frame 278 as frame_10_278.jpg
Saved frame 298 as frame_11_298.jpg
Saved frame 346 as frame_12_346.jpg
Saved frame 347 as frame_13_347.jpg
Saved frame 374 as frame_14_374.jpg
Saved frame 415 as frame_15_415.jpg
Saved frame 479 as frame_16_479.jpg
Saved frame 533 as frame_17_533.jpg
Saved frame 607 as frame_18_607.jpg
Saved frame 621 as frame_19_621.jpg
Saved frame 654 as frame_20_654.jpg
Saved frame 758 as frame_21_758.jpg
Saved frame 966 as frame_22_966.jpg
Saved frame 1022 as frame_23_1022.jpg
Saved frame 1032 as frame_24_1032.jpg
Saved frame 1152 as frame_25_1152.jpg
Saved frame 1212 as frame_26_1212.jpg
Saved frame 1240 as frame_27_1240.jpg
Saved fram

In [59]:
download_video("https://imdb-video.media-imdb.com/vi696162841/1434659607842-pgv4ql-1564159556047.mp4?Expires=1713540729&Signature=hG9qlK4FCoIYdZzExdL-NN~p5sQwRdyFQDJRwUYTqBESiKCkxkYIgOAHshwl7EhXDvoj-SLt-e6VEuX~74LYkYq-WWPx0bKkQlDJRb2ZhJFHHTjW8k6MRSpowzMnsPzsVveXmeBNR1PDx995j0rp8aTgEMui0xQrAxp-7Ad857lgqKh8pOsdGocC5AwACbSlaAenD8R06Wxcr7oP9fe4fDoIGudoyrt3ckgqf6NUJmk-1zun5HePLKZ2ROucxamDRBy0X59cduIDjfRgJOOJdKI8f1M~Gvh~uQjD~JDQEbUPe0Dv-LrFQhVIbp9ifhhDcGvTTGx6XzBC8ZlId1-FBA__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA", "/Users/zhongqihe/utm/year4/year4_winter/csc420/CinemaScopeAI/video/god_father2.mp4")

'/Users/zhongqihe/utm/year4/year4_winter/csc420/CinemaScopeAI/video/god_father2.mp4'

In [64]:
genres_counters = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
genres = ['action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
          'documentary', 'drama', 'family', 'fantasy', 'history', 'Horror',
          'music', 'musical', 'mystery', 'romance', 'sci-fi', 'short film',
          'sport', 'superhero', 'thriller', 'war', 'western']
genres_counters = [0] * len(genres)
print(len(genres))
print(genres_counters)

# data processing and storing sudo code
# 1. for each row of the csv file:
#       1.1 extract title, budget, geners, and trailer_url
#       1.2 download the movie with trailer_url
#       1.3 use detect_shots() to find shot and randomly pick 1 frame per shot
#       1.4 save the selected shot locally with the following naming convension

#           genre + budget + genres_counters[genres.index[genre]]
#           GENRES_BUDGET_genres_counters
#       1.5 genres_counters[genres.index[genre]]++



23
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
