# Movie Recommendation System

## Loading movie data

In [None]:
data_dir: str = "data"

In [None]:
import os

Define the CSV file of the movie table

In [None]:
movies_csv: str = os.path.join(data_dir, "movies.csv")

In [None]:
import numpy as np
from typing import Type

Define the column dtypes so that the dataframe uses as little memory as possible

In [None]:
# Column-name -> dtype mapping handed to `pd.read_csv` when the movie table is
# loaded; `movieId` is stored as a 32-bit integer.
movies_dtype: dict[str, Type] = dict(
    movieId=np.int32,
    title=str,
    genres=str,
)

In [None]:
import pandas as pd

Load the CSV into a dataframe

In [None]:
movies_df: pd.DataFrame = pd.read_csv(movies_csv, dtype=movies_dtype)

In [None]:
movies_df.head()

## Cleaning movie data

Drop the rows with missing values

In [None]:
movies_clean: pd.DataFrame = movies_df.dropna()

In [None]:
movies_df.shape[0], movies_clean.shape[0]

## Genre one-hot encoding

Get a dataframe of just the `movieId` and `genres`

In [None]:
# Feature frame without the `title` column; the explicit copy keeps it
# independent of `movies_clean`, which is still needed for title lookups.
movies_feat: pd.DataFrame = movies_clean.drop(columns=["title"]).copy()

Turn the `genres` column into a list of strings

In [None]:
movies_feat["genres"] = movies_feat["genres"].str.split("|")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

Fit the binarizer on the `genres` column and transform it into one-hot encodings

In [None]:
# Fit the binarizer on the per-movie genre lists so that `mlb.classes_` is
# populated (it is written to the JSON cache further down). The matrix returned
# by `fit_transform` is not stored; it is only shown as the cell output.
mlb: MultiLabelBinarizer = MultiLabelBinarizer()
mlb.fit_transform(movies_feat["genres"])

In [None]:
cache_dir: str = "cache"

Define the JSON file to save the MLB classes into

In [None]:
mlb_classes_json: str = os.path.join(cache_dir, "mlb_classes.json")

In [None]:
import json

Cache the MLB classes into a JSON file for re-use

In [None]:
# `open(..., "w")` does not create missing parent directories, so make sure
# the cache directory exists before the first write.
os.makedirs(os.path.dirname(mlb_classes_json) or ".", exist_ok=True)

# `.tolist()` converts the fitted classes into a plain list that `json` can
# serialise.
with open(mlb_classes_json, "w", encoding="utf-8") as f:
    json.dump(mlb.classes_.tolist(), f)

Load the MLB with the cached classes to check if it works

In [None]:
# Rebuild the binarizer from the class list stored in the JSON cache rather
# than from the instance fitted above.
with open(mlb_classes_json, mode="r") as f:
    mlb = MultiLabelBinarizer(classes=json.loads(f.read()))

In [None]:
from scipy.sparse import spmatrix
from typing import Union
from numpy.typing import NDArray

Get the one-hot encodings

In [None]:
encodings: Union[NDArray[np.int32], spmatrix] = mlb.fit_transform(
    movies_feat["genres"])

Pack each movie's one-hot encoding into a single 32-bit integer bitmask, which is possible because there are fewer than 32 genres in the dataframe

In [None]:
# Pack every movie's one-hot genre row into one int32 bitmask: bit `i` of the
# mask is set when column `i` of `encodings` is 1 for that movie.
n_genres: int = encodings.shape[1]

# Bit 31 is the sign bit of an int32, so at most 31 genres fit without
# producing negative masks or shifting past the integer width.
if n_genres > 31:
    raise ValueError(
        f"Cannot pack {n_genres} genres into a 32-bit mask (maximum is 31)")

genre_bits: NDArray[np.int32] = np.zeros(len(movies_feat), dtype=np.int32)

for i in range(n_genres):
    # Move column i (0/1 per movie) to bit position i and merge it in
    genre_bits |= encodings[:, i].astype(np.int32) << i

In [None]:
movies_feat["genres"] = genre_bits

In [None]:
movies_feat.head()

## User Input

Define the CSV file containing title and rating

In [None]:
input_csv: str = os.path.join(data_dir, "input.csv")

Load the inputs into a dataframe

In [None]:
input_df: pd.DataFrame = pd.read_csv(input_csv)

In [None]:
input_df.head()

## Input fuzzy matching

In [None]:
from rapidfuzz import process
from typing import Any

Get the `movieId` by doing a fuzzy search on the title

In [None]:
# Resolve every title in the user input to a row of `movies_clean` by fuzzy
# matching, and record its `movieId` together with the user's rating once the
# user has confirmed the match interactively.
input_feat_list: list[dict[str, Any]] = []

for _, row in input_df.iterrows():
    # With a pandas Series as choices, `extractOne` yields
    # (matched title, score, index label of the match), or None when nothing
    # could be matched.
    matched: tuple[str, float, Any] | None = process.extractOne(
        row["title"], movies_clean["title"])
    if not matched or not matched[0]:
        # `matched` may be None here, so it must not be indexed
        raise ValueError(f"Invalid match for {row['title']}: {matched!r}")

    # The index label points straight at the matched row, so there is no need
    # to scan the whole title column again.
    movie_row = movies_clean.loc[matched[2]]

    confirmation_prompt = (f"Got '{movie_row['title']}' for '{row['title']}'. "
                           f"Add? (Y/n)?")
    # Pressing Enter without typing anything counts as "yes"
    confirmation = input(confirmation_prompt) or "y"

    # Anything other than an explicit yes skips the movie
    if confirmation.strip().lower() in {"y", "yes"}:
        input_feat_list.append({
            "movieId": movie_row["movieId"],
            "rating": row["rating"]
        })

In [None]:
# Name the columns explicitly so the frame keeps its `movieId`/`rating` layout
# even when `input_feat_list` is empty (every match rejected); a column-less
# frame would be written out as an empty CSV that cannot be read back.
input_feat = pd.DataFrame(input_feat_list, columns=["movieId", "rating"])

In [None]:
input_feat.head()

Define the CSV to save the input features

In [None]:
input_feat_csv: str = os.path.join(cache_dir, "input_feat.csv")

Save the input features to CSV

In [None]:
input_feat.to_csv(input_feat_csv, index=False)

Load the input features from cache to check if it works

In [None]:
input_feat = pd.read_csv(input_feat_csv)

In [None]:
input_feat.head()

## Content-based filtering

In [None]:
import numba as nb

Use `numba` for fast performance

In [None]:
@nb.njit
def popcount(x):
    """Return the number of set bits in the non-negative integer ``x``.

    Raises:
        ValueError: If ``x`` is negative. Right-shifting a negative integer
            settles at -1 and never reaches zero, so the counting loop would
            not terminate.
    """
    if x < 0:
        raise ValueError("popcount() requires a non-negative integer")

    count = 0
    while x:
        count += x & 1  # add the lowest bit
        x >>= 1  # then discard it
    return count

Pre-compute the movie-genre mapping

In [None]:
movie_genres = dict(zip(movies_feat["movieId"], movies_feat["genres"]))

Calculate the total similarity score of every movie, summed across all input movies

In [None]:
# For every movie in `movie_genres`, accumulate its genre similarity to each
# movie in the user input: +1 for each genre shared with the input movie and
# -1 for each genre the candidate has that the input movie lacks.
similarity_scores = {}

# Iterate the `movieId` column directly: `iterrows()` builds a Series per row
# and does not preserve dtypes across columns, which can turn the integer ids
# into floats before they are used as dictionary keys.
for source_id in input_feat["movieId"]:
    source_genres = movie_genres[source_id]
    # Invariant for this input movie, so compute it once outside the inner loop
    not_in_source = ~source_genres

    for mov_id, target_genres in movie_genres.items():
        common = source_genres & target_genres
        extra = target_genres & not_in_source

        sim_score = popcount(common) - popcount(extra)
        similarity_scores[mov_id] = similarity_scores.get(mov_id, 0) + sim_score

In [None]:
dict(list(similarity_scores.items())[:10])

## Define the CSV loader

In [None]:
from pandas.io.parsers import TextFileReader
from typing import Generator, Type

Read the CSV in chunks to optimize memory utilization

In [None]:
def read_csv_in_chunks(
    file_path: str,
    dtype: dict[str, Type],
    chunk_size: int = 100_000
) -> Generator[pd.DataFrame, None, None]:
    """Lazily read a CSV file, yielding one cleaned chunk at a time.

    Args:
        file_path: Path of the CSV file to read.
        dtype: Column-name to dtype mapping forwarded to ``pd.read_csv``.
        chunk_size: Maximum number of rows parsed per chunk.

    Yields:
        Each chunk as a DataFrame with the rows that contain missing values
        dropped. A chunk may therefore hold fewer than ``chunk_size`` rows.
    """
    # Using the reader as a context manager closes the underlying file handle
    # even when the consumer stops iterating early or an exception propagates.
    with pd.read_csv(file_path, chunksize=chunk_size, dtype=dtype) as reader:
        for chunk in reader:
            yield chunk.dropna()

## Collaborative filtering