# Preprocess movie data and ratings

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import datetime
import threading
import re
import os
import time
import aiohttp
import asyncio

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from difflib import SequenceMatcher
from dotenv import load_dotenv

In [None]:
# Load the raw movie table, then build a working copy whose genres column is
# blanked out (dropped and re-added as empty strings at the end of the frame).
raw_movie_df = pd.read_csv("../data/raw/movies.csv")
movie_df = raw_movie_df.drop(columns=["genres"]).assign(genres="")
movie_df

In [None]:
# Summarise the whole frame, then the title and genres columns individually,
# with a blank line separating each report.
movie_df.info()
for column_name in ("title", "genres"):
    print()
    movie_df[column_name].info()

In [None]:
# Peek at the last five rows of the working frame
movie_df.tail(n=5)

In [None]:
# Load environment variables via python-dotenv, then read the credential that
# is later sent in the Authorization header of the TMDb requests.
load_dotenv()
api_key = os.environ.get("API_KEY")

In [None]:
# # Test code
# url = "https://api.themoviedb.org/3/search/movie"
# headers = {
#     "Authorization": api_key,
#     "Accept": "application/json"
# }
# params = {
#     "query": 'Potter'
# }
# response = requests.get(url, headers=headers, params=params)
# response.json()

## Create a links.csv file that maps each movieId to its TMDb movie id

In [None]:
# Shared semaphore: allows at most 39 holders at once, used to cap how many
# API requests are in flight at the same time.
semaphore = asyncio.Semaphore(value=39)

In [None]:
# parse_title_year(title) splits "Name (YYYY)" into the name and the release year
def parse_title_year(title):
    """Split a title of the form ``"Name (YYYY)"`` into its name and year.

    Leading and trailing whitespace is ignored, so a title with stray spaces
    after the closing parenthesis still yields its year.

    Args:
        title: Raw title string.

    Returns:
        ``(name, year)`` with ``year`` as an ``int`` when the title ends in a
        four-digit year in parentheses; otherwise ``(stripped_title, None)``.
    """
    cleaned = title.strip()
    # Greedy ``.*`` backtracks to the LAST parenthesised four-digit group, so
    # earlier parentheses in the name are kept as part of the name.
    match = re.fullmatch(r"(.*)\((\d{4})\)", cleaned)
    if match:
        return match.group(1).strip(), int(match.group(2))
    return cleaned, None

# search_tmdb(session, title, retries=3) searches TMDb API for title
async def search_tmdb(session, title, retries=3):
    """Query TMDb's movie search endpoint and return the decoded JSON body.

    Each attempt runs while holding the module-level ``semaphore``. Rate-limit
    (429) and server (5xx) responses, timeouts and connection errors are
    retried; any other non-200 status gives up immediately.

    Args:
        session: aiohttp client session used to issue the GET request.
        title: Movie title sent as the ``query`` parameter.
        retries: Maximum number of attempts.

    Returns:
        The parsed JSON payload on HTTP 200, or ``None`` if the request was
        rejected or every attempt failed.
    """
    url = "https://api.themoviedb.org/3/search/movie"
    # NOTE(review): TMDb bearer auth expects "Bearer <token>"; presumably
    # API_KEY already contains that prefix - confirm.
    headers = {
        "Authorization": api_key,
        "Accept": "application/json"
    }
    params = {
        "query": title
    }
    # aiohttp expects a ClientTimeout object; a bare number is deprecated.
    timeout = aiohttp.ClientTimeout(total=5)

    for attempt in range(retries):
        async with semaphore:
            try:
                async with session.get(url, headers=headers, params=params, timeout=timeout) as response:
                    # aiohttp exposes the HTTP code as ``status`` (not ``status_code``).
                    status = response.status
                    if status == 200:
                        # ``json()`` is a coroutine in aiohttp and must be awaited.
                        return await response.json()
                    elif status == 429:
                        tqdm.write(f"Rate limited. Sleeping... ({title})")
                        # Back off while still holding the semaphore slot so
                        # this task does not free capacity for more requests.
                        await asyncio.sleep(2)
                    elif status >= 500:
                        tqdm.write(f"Server error {status} while querying {title}. Retrying...")
                    else:
                        tqdm.write(f"Client error {status} for {title}")
                        return None
            except asyncio.TimeoutError:
                tqdm.write(f"Timeout for {title}")
            except aiohttp.ClientError as e:
                tqdm.write(f"Connection error: {title} - {e}")
        # Pause between attempts, but not after the final one.
        if attempt < retries - 1:
            await asyncio.sleep(1)
    return None

# is_in_range(release_date, target_year, tolerance=1) checks if release date is in some range
def is_in_range(release_date, target_year, tolerance=1):
    """Report whether a release date falls within ``tolerance`` years of a target.

    Args:
        release_date: Date string in ``YYYY-MM-DD`` form; may be empty or None.
        target_year: Year to compare against (anything ``int()`` accepts).
        tolerance: Maximum allowed distance in years, inclusive.

    Returns:
        ``True`` when the release year is within the tolerance; ``False`` when
        it is not, or when ``release_date`` is missing or cannot be parsed.
    """
    if not release_date:
        return False

    try:
        release_year = datetime.datetime.strptime(release_date, "%Y-%m-%d").year
    except (TypeError, ValueError):
        # A malformed date cannot confirm the year; treat it as "no match"
        # instead of letting one bad record abort the caller.
        return False

    return abs(release_year - int(target_year)) <= tolerance

# similar(a, b) scores how alike two strings are, ignoring case
def similar(a, b):
    """Return the case-insensitive SequenceMatcher ratio of ``a`` and ``b``."""
    matcher = SequenceMatcher(isjunk=None, a=a.lower(), b=b.lower())
    return matcher.ratio()

# get_best_match(results, title, year) picks the id of the highest-scoring API result
def get_best_match(results, title, year):
    """Score every search result and return the id of the best one.

    A result's score is the sum of: title similarity (weighted 0.7), a 0.2
    bonus when ``year`` is truthy and the release date is within range of it,
    and popularity / 1000 capped at 0.1. Results with no release date are
    skipped. Ties keep the earliest result.

    Returns:
        The ``"id"`` of the top result, or ``None`` when nothing scores at
        least 0.85.
    """
    top_result, top_score = None, 0

    for candidate in results:
        candidate_title = candidate.get("original_title", "")
        released = candidate.get("release_date")
        if not released:
            continue

        # Title similarity
        title_part = similar(candidate_title, title) * 0.7
        # Year match
        year_part = 0.2 if (year and is_in_range(released, year)) else 0
        # Popularity
        popularity_part = min(candidate.get("popularity", 0) / 1000, 0.1)

        total = title_part + year_part + popularity_part
        if total > top_score:
            top_result, top_score = candidate, total

    if top_score < 0.85 or not top_result:
        return None
    return top_result["id"]

# get_tmdb_id(session, title, year) searches TMDb API then returns best matching movie id
async def get_tmdb_id(session, title, year):
    """Search for ``title`` and return the best-matching movie id, or ``None``.

    ``None`` is returned when the search yields nothing usable (falsy payload
    or no ``"results"`` key) or when no result is a good enough match.
    """
    payload = await search_tmdb(session, title)

    if payload and "results" in payload:
        return get_best_match(payload["results"], title, year)
    return None

# process_row(session, row) resolves one movie-table row to its TMDb id
async def process_row(session, row):
    """Look up the TMDb id for one row and pair it with the row's movieId.

    Reads ``row["title"]`` and ``row["movieId"]``; returns a dict with keys
    ``"movieId"`` and ``"tmdbId"`` (the latter is ``None`` when no match was
    found).
    """
    raw_title, movie_id = row["title"], row["movieId"]

    name, release_year = parse_title_year(raw_title)
    matched_id = await get_tmdb_id(session, name, release_year)

    return {"movieId": movie_id, "tmdbId": matched_id}

In [None]:
# Test: resolve one well-known title end to end.
# get_tmdb_id is a coroutine that needs an aiohttp session as its first
# argument, so the lookup is wrapped in a helper that opens (and closes) one.
async def _test_lookup():
    async with aiohttp.ClientSession() as session:
        return await get_tmdb_id(session, "Toy Story", 1995)

# Schedule the lookup on the event loop (assumes the notebook kernel already
# runs one - confirm); evaluate `test_task.result()` in a later cell to see
# the returned id.
test_task = asyncio.ensure_future(_test_lookup())

In [None]:
# create_link(df) uses the dataframe to create the list of movieId -> tmdbId records
async def create_link(df=None):
    """Look up the TMDb id for every row of a movie table.

    Args:
        df: DataFrame providing ``title`` and ``movieId`` for each row.
            Defaults to the module-level ``raw_movie_df`` so existing
            zero-argument calls keep working.

    Returns:
        A list of the dicts produced by ``process_row``, in the order the
        lookups finish (not in row order).
    """
    if df is None:
        df = raw_movie_df

    async with aiohttp.ClientSession() as session:
        tasks = [process_row(session, row) for _, row in df.iterrows()]

        results = []
        # as_completed yields awaitables as lookups finish, which lets the
        # progress bar advance per completed row.
        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
            results.append(await coro)

        return results

In [None]:
# results = asyncio.run(create_link(raw_movie_df))

# links_df = pd.DataFrame(results)
# links_df.to_csv("../data/processed/links.csv", index=False)