# 1. Generate Movie Table

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import time


In [2]:
def setup_webdriver():
    service = Service(ChromeDriverManager().install())
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    options = webdriver.ChromeOptions()
    options.add_argument(f"user-agent={user_agent}")
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920x1080")
    options.add_argument("--disable-extensions")
    options.add_experimental_option("prefs", {
        "profile.default_content_setting_values.notifications": 2
    })
    driver = webdriver.Chrome(service=service, options=options)
    return driver

webdriver = setup_webdriver()

In [None]:
def fetch_movie_page(driver, imdb_url):
    # Use the existing driver to open the page
    driver.get(imdb_url)
    html_source = driver.page_source
    soup = BeautifulSoup(html_source, 'html.parser')
    return soup

In [None]:
def get_top_250():
    imdb_url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"
    top_250_page = fetch_movie_page(webdriver, imdb_url)
    print(top_250_page)
    movies = top_250_page.find_all('li', class_='ipc-metadata-list-summary-item sc-10233bc-0 iherUv cli-parent')
    movie_details = []
    for movie in movies:
        title_element = movie.find('h3', class_='ipc-title__text')
        if title_element:
            movie_name = title_element.text.strip()
            cleaned_text = re.sub(r'^\d+\.\s*', '', movie_name)
            link_element = movie.find('a', class_='ipc-title-link-wrapper')
            movie_link = f"https://www.imdb.com{link_element['href']}" if link_element else "No link found"
            movie_details.append((cleaned_text, movie_link))
    return movie_details

top_250 = get_top_250()
print(top_250)

<html class="scriptsOn" lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/"><head><script async="" crossorigin="anonymous" src="https://images-na.ssl-images-amazon.com/images/I/31bJewCvY-L.js"></script><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><script>if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }</script><title>IMDb Top 250 Movies</title><meta content="As rated by regular IMDb voters." data-id="main" name="description"/><meta content="IMDb" property="og:site_name"/><meta content="IMDb Top 250 Movies" property="og:title"/><meta content="As rated by regular IMDb voters." property="og:description"/><meta content="website" property="og:type"/><meta content="https://m.media-amazon.com/images/G/01/imdb/images/social/imdb_logo.png" property="og:image"/><meta content="1000" property="og:image:height"/><meta content="1000" property="og:image:width"/><meta content="en_US" property="og:local

In [None]:
def budget_scale(amount):
    if amount < 1000000:
        return "under 1 million USD"
    elif amount >= 1000000 and amount < 10000000:
        return "1-10 million USD"
    elif amount >= 10000000 and amount < 25000000:
        return "10-25 million USD"
    elif amount >= 25000000 and amount < 50000000:
        return "25-50 million USD"
    else:
        return "50 million USD or more"

In [None]:
def read_processed_movies(filename='movies.csv'):
    processed_urls = []
    try:
        with open(filename, mode='r', newline='', encoding='utf-8') as csvfile:
            reader = csv.DictReader(csvfile)
            for row in reader:
                processed_urls.append(row['imdb_url'])
    except FileNotFoundError:
        pass  # File does not exist; will be created on first write.
    return processed_urls

processed_movies = read_processed_movies()


In [None]:
def get_budget_and_genre(top_250, processed_movies, filename='movies.csv'):
    fieldnames = ['title', 'imdb_url', 'budget', 'genres', 'trailer_url']
    with open(filename, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Check if the file is empty and write the header if needed
        csvfile.seek(0, 2)  # Move the cursor to the end of the file
        if csvfile.tell() == 0:  # If file is empty, write a header
            writer.writeheader()
        
        for title, imdb_url in top_250:
            if imdb_url in processed_movies:
                continue  # Skip already processed movies
            
            movie_page = fetch_movie_page(webdriver, imdb_url)
            budget = None
            genres = []
            video_url = 'No video found'
            
            if movie_page:
                budget_item = movie_page.find('li', {'data-testid': 'title-boxoffice-budget'})
                if budget_item:
                    budget_content = budget_item.find('span', {'class': 'ipc-metadata-list-item__list-content-item'})
                    if budget_content:
                        amount = budget_content.text.strip()
                        number = re.findall(r'\d+[\d,\.]*', amount)
                        if number:
                            budget = int(''.join(number).replace(',', ''))
                            budget = budget_scale(budget)
                
                scroller = movie_page.find('div', class_='ipc-chip-list__scroller')
                if scroller:
                    genre_spans = scroller.find_all('span', class_='ipc-chip__text')
                    genres = [span.text for span in genre_spans]
                
                video_tag = movie_page.find('video', {'class': ['jw-video', 'jw-reset']})
                if video_tag:
                    video_url = video_tag['src']
                
                movie_data = {
                    'title': title,
                    'imdb_url': imdb_url,
                    'budget': budget,
                    'genres': genres,
                    'trailer_url': video_url
                }
                writer.writerow(movie_data)  # Write movie data immediately to the CSV file


processed_movies = read_processed_movies()
get_budget_and_genre(top_250, processed_movies, 'movies.csv')


In [None]:
def valid_check(movie_data):
    if not movie_data['budget'] or not movie_data['genres'] or movie_data['trailer_url'] == 'No video found':
        return False
    return True


# 2. Generate Shots

In [None]:
import cv2
import numpy as np
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import os
import datetime
import ast

In [None]:
def convert_string_to_list(genre_string):
    try:
        # Safely evaluate the string to a list
        return ast.literal_eval(genre_string)
    except ValueError:
        # In case the string is not a well-formed list string, return an empty list or some default value
        return []
        
def read_movie_data(filename):
    movie_data = []
    with open(filename, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            row['genres'] = convert_string_to_list(row['genres'])
            row['budget'] = row['budget'].strip()
            movie_data.append(row)
    return movie_data

# Usage
filename = 'movies.csv'
movie_data = read_movie_data(filename)
url_movie_data = [movie for movie in movie_data if valid_check(movie)]
print(len(url_movie_data))
print(url_movie_data[0]['budget'])

178
25-50 million USD


In [None]:
def get_10_genres(url_movie_data):
    genre_count = {}
    for movie in url_movie_data:
        for genre in movie['genres']:
            genre_count[genre] = genre_count.get(genre, 0) + 1
    sorted_genres = sorted(genre_count.items(), key=lambda x: x[1], reverse=True)
    return sorted_genres[:10]

top_10_genres = get_10_genres(url_movie_data)
print(top_10_genres)
genres_lst = [genre[0] for genre in top_10_genres]
print(genres_lst)

[('Drama', 122), ('Adventure', 54), ('Action', 41), ('Crime', 37), ('Comedy', 31), ('Thriller', 20), ('Animation', 20), ('Mystery', 20), ('Romance', 18), ('Biography', 17)]
['Drama', 'Adventure', 'Action', 'Crime', 'Comedy', 'Thriller', 'Animation', 'Mystery', 'Romance', 'Biography']


In [None]:
budget_lst = [
        "under 1 million USD",
        "1-10 million USD",
        "10-25 million USD",
        "25-50 million USD",
        "50 million USD or more"
    ]

In [None]:
def remove_no_genres(movie_data, genres):
    # This function returns movies that have at least one genre in the given list of genres
    return [movie for movie in movie_data if any(genre in genres for genre in movie['genres'])]

# Usage
url_movie_data = remove_no_genres(url_movie_data, genres_lst)
print(len(url_movie_data))

175


In [None]:
def download_video(video_url, output_path='./video/trailer.mp4'):
    if os.path.exists(output_path):
        os.remove(output_path)
    response = requests.get(video_url, stream=True)
    if response.status_code == 200:
        with open(output_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    file.write(chunk)
        return output_path
    else:
        print("Error downloading the video.")
        return None

def detect_shots(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)

    shots = [0]  # Start with the first frame as the first shot
    # Initialize histogram for the first frame
    ret, prev_frame = cap.read()
    if not ret:
        print("Failed to read the video")
        return []
    prev_hist = cv2.calcHist([prev_frame], [0], None, [256], [0,256])

    # Initialize variables for adaptive thresholding
    frame_diffs = []
    mean_diff = 0
    std_diff = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Calculate histogram of the current frame and calculate the difference
        curr_hist = cv2.calcHist([frame], [0], None, [256], [0,256])
        diff = cv2.compareHist(prev_hist, curr_hist, cv2.HISTCMP_CHISQR)
        frame_diffs.append(diff)

        # Update mean and standard deviation
        mean_diff = np.mean(frame_diffs[-30:])  # Consider the last 30 frames
        std_diff = np.std(frame_diffs[-30:]) if len(frame_diffs) > 30 else 0

        # Dynamic threshold
        dynamic_threshold = mean_diff + 4 * std_diff

        # If the difference is significant, mark it as a shot boundary
        if diff > dynamic_threshold:
            frame_number = int(cap.get(cv2.CAP_PROP_POS_FRAMES))
            shots.append(frame_number)
            # print(f"Shot change at frame {frame_number} (time: {frame_number / frame_rate}s)")

        prev_hist = curr_hist

    cap.release()
    return shots

def extract_random_frames(video_path, shots):
    cap = cv2.VideoCapture(video_path)
    random_frames = []
    
    for i in range(len(shots) - 1):
        # Generate a random frame number between two shot boundaries
        frame_number = np.random.randint(shots[i], shots[i + 1])
        
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_number)
        ret, frame = cap.read()
        if ret:
            random_frames.append((frame_number,frame))
        else:
            print("Error: Couldn't read frame number ", frame_number)
    
    cap.release()
    return random_frames
# video_path = download_video(video_url)
# video_path = "https://imdb-video.media-imdb.com/vi696162841/1434659607842-pgv4ql-1564159556047.mp4?Expires=1713540729&Signature=hG9qlK4FCoIYdZzExdL-NN~p5sQwRdyFQDJRwUYTqBESiKCkxkYIgOAHshwl7EhXDvoj-SLt-e6VEuX~74LYkYq-WWPx0bKkQlDJRb2ZhJFHHTjW8k6MRSpowzMnsPzsVveXmeBNR1PDx995j0rp8aTgEMui0xQrAxp-7Ad857lgqKh8pOsdGocC5AwACbSlaAenD8R06Wxcr7oP9fe4fDoIGudoyrt3ckgqf6NUJmk-1zun5HePLKZ2ROucxamDRBy0X59cduIDjfRgJOOJdKI8f1M~Gvh~uQjD~JDQEbUPe0Dv-LrFQhVIbp9ifhhDcGvTTGx6XzBC8ZlId1-FBA__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA"

# shots = detect_shots(video_path)
# random_frames_info = extract_random_frames(video_path, shots)

# for i, (frame_number, frame) in enumerate(random_frames_info):
#     frame_path = f'frame_{i}_{frame_number}.jpg'
#     cv2.imwrite(frame_path, frame)
#     print(f'Saved frame {frame_number} as {frame_path}')

In [None]:
def assign_label(movie, count):
    # Initialize the genre part of the label with zeros
    genre_labels = ['0'] * 10
    budget_labels = ['0'] * 5
    # Check each genre in the movie and set the corresponding index if it's in the top 10
    for genre in movie['genres']:
        if genre in genres_lst:
            index = genres_lst.index(genre)
            genre_labels[index] = '1'

    # Convert genre labels list to a string
    genre_part = ''.join(genre_labels)
    
    budget = movie['budget']
    # Assign a budget label based on categories
    if budget in budget_lst:
        index = budget_lst.index(budget)
        budget_labels[index] = '1'
    budget_part = ''.join(budget_labels)
    
    # Form the final label as path or any structured string
    final_name = f"{genre_part} {budget_part} {str(count).zfill(5)}.jpg"
    
    return final_name

#Usage
example = url_movie_data[0]
print(example)
print(assign_label(example, 1))


{'title': 'The Shawshank Redemption', 'imdb_url': 'https://www.imdb.com/title/tt0111161/?ref_=chttp_t_1', 'budget': '25-50 million USD', 'genres': ['Drama'], 'trailer_url': 'https://imdb-video.media-imdb.com/vi3877612057/1434659607842-pgv4ql-1616202333253.mp4?Expires=1713547241&Signature=p7BmE3uA3YsP3WdIL-mS9z6lByxnkCgoQBViDEyZnp9ig2ySxKN4anLbkBxoys1Gv0tEGbRUOx5ID7n7MWrtKnuoEfxw1W-DiP8lGcgutsNF4YLjirpPd6HIHZ4Sm~ZYFaarC3DT8wumpOw4fV1C81jlL~8jfobkWtI8ehECdkKIpZNIeqT~o3jQjuZguPT1dxqy3oD3LIicY9eayhWlhvhV~lZHTtH~Fj1E9IoduEfaZHRB~WqG0u4kxVy1f4fcSvzQYyuAms0e6oJUD0gMVImgoGt2SvM8-syxpc5vsmm5a6j3GEPeXISXktT3ZjjuLyrPF-coi7H-nzi7-zf~Ng__&Key-Pair-Id=APKAIFLZBVQZ24NQH3KA'}
1000000000 00010 00001.jpg


In [None]:
def create_db(url_movie_data, databae_dir='./database'):
    if not os.path.exists(databae_dir):
        os.makedirs(databae_dir)
    count = 0
    for i, movie in enumerate(url_movie_data):
        video_url = movie['trailer_url']
        video_path = download_video(video_url)
        shots = detect_shots(video_path)
        random_frames_info = extract_random_frames(video_path, shots)
        for j, (frame_number, frame) in enumerate(random_frames_info):
            label = assign_label(movie, count)
            frame_path = os.path.join(databae_dir, label)
            cv2.imwrite(frame_path, frame)
            print(f'Saved frame {frame_number} as {frame_path}')
            count += 1
        os.remove(video_path)
        print(f"Deleted video {video_path}")

create_db(url_movie_data)

Saved frame 43 as ./database/1000000000 3 00000.jpg
Saved frame 44 as ./database/1000000000 3 00001.jpg
Saved frame 88 as ./database/1000000000 3 00002.jpg
Saved frame 153 as ./database/1000000000 3 00003.jpg
Saved frame 210 as ./database/1000000000 3 00004.jpg
Saved frame 221 as ./database/1000000000 3 00005.jpg
Saved frame 260 as ./database/1000000000 3 00006.jpg
Saved frame 312 as ./database/1000000000 3 00007.jpg
Saved frame 348 as ./database/1000000000 3 00008.jpg
Saved frame 391 as ./database/1000000000 3 00009.jpg
Saved frame 436 as ./database/1000000000 3 00010.jpg
Saved frame 481 as ./database/1000000000 3 00011.jpg
Saved frame 501 as ./database/1000000000 3 00012.jpg
Saved frame 541 as ./database/1000000000 3 00013.jpg
Saved frame 573 as ./database/1000000000 3 00014.jpg
Saved frame 575 as ./database/1000000000 3 00015.jpg
Saved frame 621 as ./database/1000000000 3 00016.jpg
Saved frame 680 as ./database/1000000000 3 00017.jpg
Saved frame 748 as ./database/1000000000 3 00018.