# Preprocess movie data and ratings

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime
import threading
import re
import os
import time

from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from difflib import SequenceMatcher
from dotenv import load_dotenv

In [2]:
# Load the raw movie table keyed by its first CSV column, discard the genre
# labels that ship with the file, and leave an empty-string `genres` column
# in their place.
raw_movie_df = (
    pd.read_csv("../data/raw/movies.csv", index_col=0)
    .drop(columns=['genres'])
    .assign(genres="")
)
raw_movie_df

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),
2,Jumanji (1995),
3,Grumpier Old Men (1995),
4,Waiting to Exhale (1995),
5,Father of the Bride Part II (1995),
...,...,...
209157,We (2018),
209159,Window of the Soul (2001),
209163,Bad Poems (2018),
209169,A Girl Thing (2001),


In [3]:
# Print the .info() report for the whole frame and then for each column,
# with one blank line separating consecutive reports.
info_subjects = (raw_movie_df, raw_movie_df['title'], raw_movie_df['genres'])
for position, subject in enumerate(info_subjects):
    if position:
        print()
    subject.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62423 entries, 1 to 209171
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   62423 non-null  object
 1   genres  62423 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB

<class 'pandas.core.series.Series'>
Index: 62423 entries, 1 to 209171
Series name: title
Non-Null Count  Dtype 
--------------  ----- 
62423 non-null  object
dtypes: object(1)
memory usage: 975.4+ KB

<class 'pandas.core.series.Series'>
Index: 62423 entries, 1 to 209171
Series name: genres
Non-Null Count  Dtype 
--------------  ----- 
62423 non-null  object
dtypes: object(1)
memory usage: 975.4+ KB


In [4]:
# Show the last five rows of the table
raw_movie_df.tail(n=5)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
209157,We (2018),
209159,Window of the Soul (2001),
209163,Bad Poems (2018),
209169,A Girl Thing (2001),
209171,Women of Devil's Island (1962),


In [5]:
# Load variables from a local .env file into the process environment, then
# read the TMDb credential from it (None when API_KEY is not set).
load_dotenv()
api_key = os.environ.get('API_KEY')

In [6]:
# url = "https://api.themoviedb.org/3/search/movie"
# headers = {
#     "Authorization": api_key,
#     "Accept": "application/json"
# }
# params = {
#     "query": 'Potter'
# }
# response = requests.get(url, headers=headers, params=params)
# response.json()

## Create a links.csv file that maps each movieId to its TMDb movie id

In [7]:
# Shared throttle state. The timestamp lives in a one-element list so that
# rate_limited() can update it in place without a `global` statement.
lock = threading.Lock()
last_request_time = [0]

# rate_limited() rate limiter lock on thread
def rate_limited(min_interval=1 / 35):
    """Block until at least `min_interval` seconds have passed since the previous call.

    The check, the sleep and the timestamp update all happen while holding
    `lock`, so concurrent callers are released one at a time, spaced at least
    `min_interval` apart.

    Args:
        min_interval: Minimum spacing between calls in seconds. The default,
            1/35, allows ~35 requests/sec.
    """
    with lock:
        # Use the monotonic clock: time.time() follows the wall clock, so a
        # backwards clock adjustment would produce a huge "remaining" sleep
        # while every other worker is stuck waiting on the lock.
        elapsed = time.monotonic() - last_request_time[0]

        if elapsed < min_interval:
            time.sleep(min_interval - elapsed)

        last_request_time[0] = time.monotonic()

In [8]:
# parse_title_year(title) gets the title and year
def parse_title_year(title):
    """Split a "Name (YYYY)" string into its name and release year.

    Args:
        title: Raw title string, e.g. "Toy Story (1995)".

    Returns:
        (name, year) where `year` is an int taken from the final
        parenthesised four-digit group, or (title, None) when the string does
        not end with such a group. Surrounding whitespace is removed from the
        returned name in both cases.
    """
    # Strip first: the pattern is anchored with `$`, so a single trailing
    # space after "(1995)" would otherwise make the year undetectable.
    cleaned = title.strip()
    match = re.match(r"^(.*)\((\d{4})\)$", cleaned)
    if match:
        name = match.group(1).strip()
        year = int(match.group(2))
        return name, year
    return cleaned, None

# search_tmdb(title) searches TMDb API for title
def search_tmdb(title, retries=3):
    """Query the TMDb movie-search endpoint for `title`, retrying on transient failures.

    Args:
        title: Text sent as the `query` parameter.
        retries: Maximum number of HTTP attempts.

    Returns:
        The decoded JSON body of the first 200 response, or None when a
        non-429 4xx response is received or every attempt fails.
    """
    url = "https://api.themoviedb.org/3/search/movie"
    headers = {
        # NOTE(review): the key is sent verbatim, so API_KEY presumably already
        # holds the complete header value (e.g. "Bearer <token>") - confirm.
        "Authorization": api_key,
        "Accept": "application/json"
    }
    params = {
        "query": title
    }

    for attempt in range(retries):
        # Throttle every attempt, not just the first: retries are real
        # requests too and would otherwise bypass the shared rate limiter.
        rate_limited()

        try:
            response = requests.get(url, headers=headers, params=params, timeout=5)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                print(f"Rate limited. Sleeping... ({title})")
                time.sleep(2)
            elif response.status_code >= 500:
                print(f"Server error {response.status_code}. Retrying...")
            else:
                # Other 4xx responses will not succeed on retry; give up now.
                print(f"Client error {response.status_code} for {title}")
                return None
        except requests.exceptions.Timeout:
            print(f"Timeout for {title}")
        except requests.exceptions.ConnectionError:
            print(f"Connection error for {title}")
        except requests.exceptions.RequestException as e:
            print(f"Unexpected error: {e}")

        # Pause between attempts only; sleeping after the final attempt would
        # just delay the None result.
        if attempt < retries - 1:
            time.sleep(1)
    return None

# is_in_range(release_date, target_year, tolerance=1) checks if release date is in some range
def is_in_range(release_date, target_year, tolerance=1):
    """Report whether a release date falls within `tolerance` years of `target_year`.

    Args:
        release_date: Date string in "YYYY-MM-DD" form.
        target_year: Reference year; anything accepted by int(), so both
            1995 and "1995" work.
        tolerance: Allowed distance in years on either side (inclusive).

    Returns:
        True when the release year lies in
        [target_year - tolerance, target_year + tolerance]. False when
        `release_date` is empty/None or cannot be parsed as "YYYY-MM-DD".
    """
    if not release_date:
        return False

    try:
        release_year = datetime.datetime.strptime(release_date, "%Y-%m-%d").year
    except (TypeError, ValueError):
        # A malformed or partial date counts as "no year match" rather than
        # raising and aborting the caller's whole scoring pass.
        return False

    target = int(target_year)
    return (target - tolerance) <= release_year <= (target + tolerance)

# similar(a, b) checks the similarity of two strings
def similar(a, b):
    """Return the case-insensitive SequenceMatcher ratio of `a` and `b`.

    Both strings are lower-cased before comparison; the result is a float
    between 0.0 (nothing in common) and 1.0 (identical).
    """
    left = a.lower()
    right = b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

# get_best_match(results, title, year) gets the best matching movie id from API results
def get_best_match(results, title, year, min_score=0.85):
    """Pick the search result that best matches `title` / `year` and return its id.

    Each result that has a release date is scored as:
      * title similarity * 0.7, using the better of the result's `title` and
        `original_title` fields,
      * + 0.2 when `year` is given and the release date is within range,
      * + popularity / 1000, capped at 0.1.

    Args:
        results: Iterable of result dicts (None is treated as empty).
        title: Title to match against.
        year: Expected release year, or None/0 to skip the year bonus.
        min_score: Minimum score the best candidate must reach.

    Returns:
        The "id" of the highest-scoring result, or None when there is no
        candidate or the best score is below `min_score`.
    """
    best = None
    best_score = 0

    for r in results or []:
        release_date = r.get("release_date")

        if not release_date:
            continue

        # Compare against both names: `original_title` alone misses films
        # whose original title is in another language than the query.
        # `or ""` also covers keys that are present but null.
        names = (r.get("title") or "", r.get("original_title") or "")
        title_score = max((similar(name, title) for name in names if name), default=0.0)

        # Title similarity
        score = title_score * 0.7

        # Year match
        if year and is_in_range(release_date, year):
            score += 0.2

        # Popularity (a null value counts as 0)
        score += min((r.get("popularity") or 0) / 1000, 0.1)

        if score > best_score:
            best = r
            best_score = score

    # NOTE(review): without a year the maximum reachable score is
    # 0.7 + 0.1 = 0.8, so with the default threshold such titles never match.
    if best is None or best_score < min_score:
        return None

    return best.get("id")

# get_tmdb_id(title, year) searches TMDb API then returns best matching movie id
def get_tmdb_id(title, year):
    """Search TMDb for `title` and return the id of the best-matching result.

    Args:
        title: Movie title to search for.
        year: Expected release year, passed through to the matcher.

    Returns:
        The matched movie id, or None when the search failed, the response
        has no "results" entry, or no result was a good enough match.
    """
    data = search_tmdb(title)

    # search_tmdb returns None once its attempts are exhausted or on a client
    # error; testing `"results" not in None` would raise TypeError.
    if not isinstance(data, dict) or "results" not in data:
        return None

    return get_best_match(data["results"], title, year)

# process_row(row) process each row in the movie table
def process_row(row):
    """Resolve one movie row to its TMDb id.

    Args:
        row: A mapping or pandas Series with a "title" entry. The movie id is
            read from a "movieId" entry when present, otherwise from the
            row's `name` attribute (the index label for rows produced by
            DataFrame.iterrows()).

    Returns:
        dict with keys "movieId" and "tmdbId" (None when no match was found).
    """
    raw_title = row["title"]

    # The movie table is loaded with index_col=0, so movieId is the index
    # rather than a column; row["movieId"] alone raises KeyError for rows
    # coming from iterrows().
    if "movieId" in row:
        movie_id = row["movieId"]
    else:
        movie_id = getattr(row, "name", None)

    title, year = parse_title_year(raw_title)
    tmdb_id = get_tmdb_id(title, year)

    return {
        "movieId": movie_id,
        "tmdbId": tmdb_id
    }

In [9]:
# Smoke test: look up a single well-known title
get_tmdb_id(title="Toy Story", year="1995")

862

In [None]:
# links = []

# # Convert to list
# rows = [row for _, row in raw_movie_df.iterrows()]
# with ThreadPoolExecutor(max_workers=15) as executor:
#     futures = [executor.submit(process_row, row) for row in rows]

#     for future in tqdm(as_completed(futures), total=len(futures)):
#         links.append(future.result())

# links_df = pd.DataFrame(links)
# links_df.to_csv("../data/processed/links.csv", index=False)

[title     Toy Story (1995)
 genres                    
 Name: 1, dtype: object,
 title     Jumanji (1995)
 genres                  
 Name: 2, dtype: object,
 title     Grumpier Old Men (1995)
 genres                           
 Name: 3, dtype: object,
 title     Waiting to Exhale (1995)
 genres                            
 Name: 4, dtype: object,
 title     Father of the Bride Part II (1995)
 genres                                      
 Name: 5, dtype: object,
 title     Heat (1995)
 genres               
 Name: 6, dtype: object,
 title     Sabrina (1995)
 genres                  
 Name: 7, dtype: object,
 title     Tom and Huck (1995)
 genres                       
 Name: 8, dtype: object,
 title     Sudden Death (1995)
 genres                       
 Name: 9, dtype: object,
 title     GoldenEye (1995)
 genres                    
 Name: 10, dtype: object,
 title     American President, The (1995)
 genres                                  
 Name: 11, dtype: object,
 title     Dracula: