# Movie Recommendation System

## Loading movie data

In [None]:
import os
import numpy as np
import pandas as pd
from typing import Type

Define the CSV file of the movie table

In [None]:
# Directory that holds the input CSV files.
data_dir: str = "data"
# Path of the movie table inside ``data_dir``.
movies_csv: str = os.path.join(data_dir, "movies.csv")

Define the dtype for minimum required space

In [None]:
# Column dtypes handed to ``pd.read_csv``. A 32-bit integer is requested for
# the id column instead of the default 64-bit one to keep the frame small.
movies_dtype: dict[str, Type] = {
    "movieId": np.int32,
    "title": str,
    "genres": str
}

Load the CSV into a dataframe

In [None]:
# Parse the movie table, forcing the column types given in ``movies_dtype``.
movies_df: pd.DataFrame = pd.read_csv(movies_csv, dtype=movies_dtype)

In [None]:
# Preview the first rows to check that the file parsed as expected.
movies_df.head()

## Cleaning movie data

Drop the rows with missing values

In [None]:
# ``dropna()`` with its defaults removes every row that has at least one
# missing value and returns a new frame; ``movies_df`` itself is not modified.
movies_clean: pd.DataFrame = movies_df.dropna()

In [None]:
# Row counts before and after cleaning, to see how many rows were dropped.
len(movies_df), len(movies_clean)

## Genre one-hot encoding

In [None]:
import json
from scipy.sparse import spmatrix
from typing import Union
from numpy.typing import NDArray
from sklearn.preprocessing import MultiLabelBinarizer

Get a dataframe of just the `movieId` and `genres`

In [None]:
# Work on an independent copy so the cleaned table keeps its titles; only
# ``movieId`` and ``genres`` are needed for the feature frame.
movies_feat: pd.DataFrame = movies_clean.copy().drop(columns=["title"])

Turn the `genres` column into a list of strings

In [None]:
# Split each genre string on "|" so every cell becomes a list of genre names.
movies_feat["genres"] = movies_feat["genres"].str.split("|")

Fit transform the `genres` column into one-hot encodings

In [None]:
# Fitting learns the set of genre labels (available afterwards as
# ``mlb.classes_``); the matrix returned here is not stored.
mlb: MultiLabelBinarizer = MultiLabelBinarizer()
mlb.fit_transform(movies_feat["genres"])

Define the JSON file to save the MLB classes into

In [None]:
# Directory used for generated artefacts.
cache_dir: str = "cache"
# JSON file that will hold the binarizer's class list.
mlb_classes_json: str = os.path.join(cache_dir, "mlb_classes.json")

Cache the MLB classes into a JSON file for re-use

In [None]:
# The cache directory may not exist yet (e.g. on a fresh checkout), in which
# case ``open(..., "w")`` would raise FileNotFoundError; create it first.
os.makedirs(cache_dir, exist_ok=True)

# ``classes_`` is a NumPy array, which ``json`` cannot serialise directly,
# so it is converted to a plain list before being written.
with open(mlb_classes_json, "w") as f:
    json.dump(mlb.classes_.tolist(), f)

Load the MLB with the cached classes to check if it works

In [None]:
# Read the cached label list back, then build a fresh binarizer whose classes
# are fixed to exactly that list.
with open(mlb_classes_json, "r") as classes_file:
    cached_classes = json.load(classes_file)

mlb = MultiLabelBinarizer(classes=cached_classes)

Get the one-hot encodings

In [None]:
# ``mlb`` was built with an explicit ``classes`` list, so the columns of the
# result are expected to follow that cached order.
# NOTE(review): the annotation says int32, but the actual integer width is
# whatever scikit-learn returns — confirm if it matters downstream.
encodings: Union[NDArray[np.int32], spmatrix] = mlb.fit_transform(
    movies_feat["genres"])

Pack each movie's one-hot encoding into a single 32-bit integer bitmask, which is possible because there are fewer than 32 genres in the dataframe

In [None]:
# One 32-bit mask per movie, starting with no bits set.
genre_bits: NDArray[np.int32] = np.zeros(len(movies_feat), dtype=np.int32)

# Walk the encoding matrix column by column: column k holds the 0/1 flag of
# genre k for every movie, and is moved into bit k of the mask.
for bit_pos, column in enumerate(encodings.T):
    genre_bits |= column.astype(np.int32) << bit_pos

# Replace the list-valued genre column with the packed integer mask.
movies_feat["genres"] = genre_bits

In [None]:
# Preview the feature frame; ``genres`` now holds the integer bitmask.
movies_feat.head()

## User Input

Define the CSV file containing title and rating

In [None]:
# Path of the user-input file inside ``data_dir``.
input_csv: str = os.path.join(data_dir, "input.csv")

Load the inputs into a dataframe

In [None]:
# No dtypes are given here, so pandas infers the column types.
# Later cells read the ``movieId`` and ``rating`` columns of this frame.
input_df: pd.DataFrame = pd.read_csv(input_csv)

In [None]:
# Preview the first rows of the user input.
input_df.head()

## Content-based filtering

In [None]:
import numba as nb
from collections import defaultdict

Use `numba` for fast performance

In [None]:
@nb.njit
def popcount(x):
    """Return the number of set bits in the integer ``x``.

    The value is consumed one bit at a time from the least-significant end
    until nothing is left.

    NOTE(review): a negative ``x`` may never shift down to zero; callers are
    assumed to pass non-negative bitmasks.
    """
    ones = 0
    while x != 0:
        # Add the lowest bit, then discard it.
        ones = ones + (x & 1)
        x = x >> 1
    return ones

Pre-compute the movie-genre mapping

In [None]:
# Lookup table from movie id to its genre bitmask. If an id occurred twice,
# the later row would win, exactly as with ``dict(zip(...))``.
movie_genres: dict[int, int] = {
    movie_id: genre_mask
    for movie_id, genre_mask in zip(movies_feat["movieId"],
                                    movies_feat["genres"])
}

Calculate the summed similarity score across all input movies

In [None]:
# Accumulated content score per candidate movie, summed over all input movies.
similarity_scores: defaultdict[int, int] = defaultdict(int)

for source_id in input_df["movieId"]:
    # Raises KeyError if an input movie is missing from the movie table.
    source_genres = movie_genres[source_id]
    # Mask of every genre the input movie does NOT have; constant for the
    # whole inner loop.
    not_in_source = ~source_genres

    for mov_id, target_genres in movie_genres.items():
        # Reward genres shared with the input movie, penalise genres the
        # candidate has on top of them.
        shared = popcount(source_genres & target_genres)
        surplus = popcount(target_genres & not_in_source)
        similarity_scores[mov_id] += shared - surplus

In [None]:
# Show the first ten entries (in insertion order) as a sanity check.
{mov_id: similarity_scores[mov_id] for mov_id in list(similarity_scores)[:10]}

## Defining CSV loader

In [None]:
from pandas.io.parsers import TextFileReader
from typing import Generator, Type

Read the CSV in chunks to optimize memory utilization

In [None]:
def read_csv_in_chunks(
    file_path: str,
    dtype: dict[str, Type],
    chunk_size: int = 100_000
) -> Generator[pd.DataFrame, None, None]:
    """Yield a CSV file piece by piece, with incomplete rows removed.

    Args:
        file_path: Path of the CSV file to read.
        dtype: Column name to dtype mapping forwarded to ``pd.read_csv``.
        chunk_size: Maximum number of CSV rows parsed per chunk.

    Yields:
        One dataframe per chunk, after dropping every row that contains a
        missing value. A yielded frame can therefore hold fewer than
        ``chunk_size`` rows, or none at all.
    """
    # With ``chunksize`` set, ``read_csv`` returns a TextFileReader instead of
    # a dataframe. Using it as a context manager guarantees the underlying
    # file handle is closed even when the consumer stops iterating early or
    # an exception interrupts the loop.
    with pd.read_csv(file_path, chunksize=chunk_size, dtype=dtype) as reader:
        for chunk in reader:
            chunk: pd.DataFrame
            yield chunk.dropna()

## Collaborative filtering

Precompute the movie-ratings mappings

In [None]:
# Lookup table from movie id to the rating supplied in the input file.
input_ratings: dict[int, float] = {
    movie_id: rating
    for movie_id, rating in zip(input_df["movieId"], input_df["rating"])
}

Define the CSV file for ratings data

In [None]:
# Path of the ratings table inside ``data_dir``.
ratings_csv: str = os.path.join(data_dir, "ratings.csv")

Define the dtype for minimum required space

In [None]:
# Column dtypes handed to the chunked CSV reader; narrower types than pandas'
# defaults are requested to reduce the memory used per chunk.
# NOTE(review): float16 and int32 limit the representable precision/range —
# confirm they cover every value in the ratings file.
ratings_dtype: dict[str, Type] = {
    "userId": np.int32,
    "movieId": np.int32,
    "rating": np.float16,
    "timestamp": np.int32
}

Calculate the user similarity

In [None]:
# Per-user accumulators: the summed closeness score and the number of ratings
# that contributed to it.
user_errors: defaultdict[int, float] = defaultdict(float)
user_counts: defaultdict[int, int] = defaultdict(int)

for chunk in read_csv_in_chunks(ratings_csv, ratings_dtype):
    # Keep only ratings of movies that also appear in the input.
    chunk_fil: pd.DataFrame = (chunk[chunk["movieId"].isin(input_ratings)]
                               .copy())
    # Absolute difference between each rating and the input rating of the
    # same movie.
    mae = abs(chunk_fil["rating"] - chunk_fil["movieId"].map(
        input_ratings))
    # Map the difference into (0, 1]: identical ratings score 1, larger
    # differences score less.
    chunk_fil["error"] = 1 / (1 + mae)

    # Iterate the two needed columns directly. ``iterrows()`` would build a
    # Series per row and coerce the integer ids to floats, which is slow and
    # leaves float keys in the dictionaries.
    for user_id, error in zip(chunk_fil["userId"], chunk_fil["error"]):
        user_errors[user_id] += error
        user_counts[user_id] += 1

In [None]:
# Final score per user, keyed by a plain ``int`` id. The count filter keeps
# only users that contributed at least one rating.
user_scores: dict[int, float] = {
    int(user): user_errors[user]
    for user in user_errors
    if user_counts[user] > 0
}

# Ascending by score, so the most similar users sit at the end of the list.
ranked_users: list[tuple[int, float]] = sorted(user_scores.items(),
                                               key=lambda x: x[1])
# Flip to best-first and keep ten users. A ``[:-10:-1]`` slice would stop one
# element early and return only nine.
top_n_users: dict[int, float] = dict(ranked_users[::-1][:10])

In [None]:
# Display the selected users together with their scores.
top_n_users

Get the list of movies these users rate highest

In [None]:
# Per-movie accumulators over the selected users: the summed rating and the
# number of ratings received.
movie_scores: defaultdict[int, float] = defaultdict(float)
movie_counts: defaultdict[int, int] = defaultdict(int)

for chunk in read_csv_in_chunks(ratings_csv, ratings_dtype):
    # Keep only the ratings made by the selected users.
    chunk_fil: pd.DataFrame = chunk[chunk["userId"].isin(top_n_users)].copy()

    # Iterate the two needed columns directly. ``iterrows()`` would build a
    # Series per row and coerce the integer ids to floats, which is slow and
    # leaves float keys in the dictionaries.
    for movie_id, rating in zip(chunk_fil["movieId"], chunk_fil["rating"]):
        movie_scores[movie_id] += rating
        movie_counts[movie_id] += 1

In [None]:
# Mean rating each movie received from the selected users.
movie_similarity_scores: defaultdict[int, float] = defaultdict(float)

for movie, rating_total in movie_scores.items():
    n_ratings = movie_counts[movie]
    # Guard against dividing by zero for a movie without any counted rating.
    if n_ratings <= 0:
        continue
    movie_similarity_scores[movie] = rating_total / n_ratings

## Final Result

In [None]:
from typing import Any

Get the final score by weighting content-based and collaborative filtering equally

In [None]:
# Combine both signals per movie by plain addition. Movies without a
# collaborative score contribute the defaultdict's 0.0.
final_similarity_scores: dict[int, float] = {}

for movie, content_score in similarity_scores.items():
    final_similarity_scores[int(movie)] = (movie_similarity_scores[movie]
                                           + content_score)

Get the top 50 movies

In [None]:
# Sort ascending, flip to best-first, then keep exactly 50 entries. A
# ``[:-50:-1]`` slice would stop one element early and return only 49.
top_n_movies: list[tuple[int, float]] = sorted(
    final_similarity_scores.items(), key=lambda x: x[1])[::-1][:50]
top_n_movie_scores: list[dict[str, Any]] = []

for movie_id, score in top_n_movies:
    # Resolve the id back to its title; the first matching row is used.
    row = movies_clean[movies_clean["movieId"] == movie_id].iloc[0]
    top_n_movie_scores.append({
        "title": row["title"],
        "score": score
    })

Define the output CSV file

In [None]:
# The recommendations are written next to the other cached artefacts.
output_csv: str = os.path.join(cache_dir, "output.csv")

Write the scores to the output CSV file

In [None]:
# One row per recommended movie with its title and score; ``index=False``
# keeps the dataframe's row index out of the file.
pd.DataFrame(top_n_movie_scores).to_csv(output_csv, index=False)