# Movie Recommendation System

## 1 Data Collection

### 1.1 Read the CSV files of the movie database

In [None]:
import os

In [None]:
# Directory that holds the input CSV files read by this notebook.
data_dir: str = "data"

In [None]:
# Paths of the ratings and movies CSV files inside data_dir.
ratings_csv: str = os.path.join(data_dir, "ratings.csv")
movies_csv: str = os.path.join(data_dir, "movies.csv")

In [None]:
import pandas as pd
import numpy as np
from typing import Type

In [None]:
# Explicit per-column dtypes handed to pd.read_csv when loading movies.csv,
# so pandas does not have to infer them.
movies_dtype: dict[str, Type] = {
    "movieId": np.int32,
    "title": str,
    "genres": str
}

In [None]:
# Load the whole movies table into memory using the declared column dtypes.
movies_df: pd.DataFrame = pd.read_csv(movies_csv, dtype=movies_dtype)

In [None]:
# Number of rows per chunk when the ratings CSV is read with `chunksize`.
DF_CHUNK_SIZE: int = 100_000

In [None]:
from pandas.io.parsers.readers import TextFileReader

In [None]:
# Explicit per-column dtypes handed to pd.read_csv when loading ratings.csv.
# NOTE(review): the narrow types presumably keep memory down; confirm that
# ratings are exactly representable in float16 and that timestamps fit in
# a signed 32-bit integer for the data set in use.
ratings_dtype: dict[str, Type] = {
    "userId": np.int32,
    "movieId": np.int32,
    "rating": np.float16,
    "timestamp": np.int32
}

In [None]:
# Because `chunksize` is given, read_csv does not load the file here; it
# returns a TextFileReader that yields DataFrames of up to DF_CHUNK_SIZE rows
# when iterated.
ratings_df_chunks: TextFileReader = pd.read_csv(ratings_csv, 
                                                chunksize=DF_CHUNK_SIZE, 
                                                dtype=ratings_dtype)

In [None]:
# Preview the first rows of the movies table as the cell output.
movies_df.head()

## 2 Data Preparation

### 2.1 Clean the data

In [None]:
# Row count before cleaning, kept so the number of dropped rows can be reported.
original_movie_rows: int = movies_df.shape[0]

In [None]:
# Drop every row that has a missing value in any column; movies_df itself is
# left untouched because dropna returns a new DataFrame.
movies_clean: pd.DataFrame = movies_df.dropna()

In [None]:
# Number of rows that dropna removed from the movies table.
removed_movie_rows: int = original_movie_rows - movies_clean.shape[0]

In [None]:
# Show the number of removed rows as the cell output.
removed_movie_rows

### 2.2 Content-based Filtering

#### 2.2.1 One-hot encode the movie genres

In [None]:
# Feature table for content-based filtering: the cleaned movies without the
# "title" column, with the pipe-separated "genres" string split into a list of
# genre names. `assign` works on its own copy, so movies_clean is not modified.
movies_feat: pd.DataFrame = (
    movies_clean
    .drop(columns=["title"])
    .assign(genres=lambda frame: frame["genres"].str.split("|"))
)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Fit a binarizer on the per-movie genre lists so that `mlb.classes_` holds the
# genre labels it found. The encoded matrix returned by fit_transform is only
# shown as the cell output; it is not stored.
mlb: MultiLabelBinarizer = MultiLabelBinarizer()
mlb.fit_transform(movies_feat["genres"])

In [None]:
# Directory for files this notebook writes and later reads back.
cache_dir: str = "cache"

In [None]:
# JSON file in cache_dir that stores the binarizer's genre class list.
mlb_classes_json: str = os.path.join(cache_dir, "mlb_classes.json")

In [None]:
import json

In [None]:
# Persist the fitted genre classes as a JSON list so the binarizer can be
# rebuilt later from the file.
# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists first; otherwise a fresh checkout without the cache
# directory fails with FileNotFoundError.
mlb_classes_dir: str = os.path.dirname(mlb_classes_json)
if mlb_classes_dir:
    os.makedirs(mlb_classes_dir, exist_ok=True)

with open(mlb_classes_json, "w", encoding="utf-8") as f:
    # classes_ is a NumPy array; tolist() turns it into a JSON-serializable list.
    json.dump(mlb.classes_.tolist(), f)

In [None]:
# Rebuild the binarizer from the class list stored in the JSON file, replacing
# the instance fitted earlier. Passing `classes` explicitly pins which labels
# are encoded and in which order.
with open(mlb_classes_json, "r") as f:
    mlb = MultiLabelBinarizer(classes=json.load(f))

In [None]:
from scipy.sparse import spmatrix
from typing import Union
from numpy.typing import NDArray

In [None]:
# One-hot encode the genre lists: one row per movie, one 0/1 column per class.
encodings: Union[np.ndarray, spmatrix] = mlb.fit_transform(
    movies_feat["genres"])

# Column i of the encoding is packed into bit i of a signed 32-bit mask.
# Only bits 0..30 can be used without reaching the sign bit, and NumPy does not
# trap on shift overflow, so fail loudly instead of producing corrupt masks.
n_genre_classes: int = encodings.shape[1]
if n_genre_classes > 31:
    raise ValueError(
        f"Cannot pack {n_genre_classes} genre classes into an int32 bit mask "
        "(at most 31 are supported)")

genre_bits: NDArray[np.int32] = np.zeros(len(movies_feat), dtype=np.int32)

# OR each class column, shifted to its own bit position, into the mask.
for i in range(n_genre_classes):
    genre_bits = genre_bits | (encodings[:, i].astype(np.int32) << i)

In [None]:
# Replace the list-valued "genres" column with the packed integer bit masks.
movies_feat["genres"] = genre_bits

In [None]:
# Preview the feature table with the bit-mask genres as the cell output.
movies_feat.head()

#### 2.2.2 Create a genre similarity matrix

In [None]:
# Genre bit masks as a plain unsigned 32-bit NumPy array, one entry per movie.
# The annotation matches the dtype actually requested (uint32); unsigned masks
# also keep the shift-based popcount loop finite.
movies_genres: NDArray[np.uint32] = np.array(movies_feat["genres"],
                                             dtype=np.uint32)
# Number of movies, i.e. the side length of the similarity matrix.
movies_count: int = len(movies_genres)

In [None]:
import numba as nb

In [None]:
@nb.njit
def popcount(x):
    """Return the number of set bits in the integer ``x``.

    The value is consumed one bit at a time from the least significant end
    until nothing is left.

    NOTE(review): this relies on ``x`` being non-negative. A negative signed
    value keeps its sign under ``>>`` and would never reach zero. The masks
    built in this notebook are uint32, so the visible caller is safe.
    """
    set_bits = 0
    while x != 0:
        # Add the lowest bit, then shift it out.
        set_bits += x & 1
        x = x >> 1
    return set_bits

In [None]:
@nb.njit(parallel=True, fastmath=True)
def create_similarity_matrix(array: np.ndarray, arr_len: int):
    """Score every ordered pair of bit masks in ``array``.

    Entry ``[i, j]`` is the number of bits set in both ``array[i]`` and
    ``array[j]`` minus the number of bits set in ``array[j]`` but not in
    ``array[i]``. The score is therefore not symmetric in ``i`` and ``j``.

    Args:
        array: Indexable sequence of integer bit masks.
        arr_len: Number of leading entries of ``array`` to compare.

    Returns:
        An ``(arr_len, arr_len)`` int32 matrix of pair scores.
    """
    scores = np.zeros((arr_len, arr_len), dtype=np.int32)

    # Each row is computed independently, so the outer loop uses prange.
    for i in nb.prange(arr_len):
        row_mask = array[i]
        for j in range(arr_len):
            col_mask = array[j]
            shared = popcount(row_mask & col_mask)
            only_in_col = popcount(col_mask & ~row_mask)
            scores[i, j] = shared - only_in_col

    return scores

In [None]:
# Score every ordered pair of movies by their genre bit masks.
# NOTE(review): the result is a dense movies_count x movies_count int32 matrix,
# i.e. 4 * movies_count**2 bytes — confirm this fits in memory for the data set.
genre_similarity_matrix: NDArray[np.int32] = create_similarity_matrix(
    movies_genres, movies_count)

In [None]:
# Show the similarity matrix as the cell output.
genre_similarity_matrix

## 5 Model Testing

### 5.1 Collect user input

In [None]:
# Load the user input from input.csv in data_dir; dtypes are inferred by pandas.
input_df: pd.DataFrame = pd.read_csv(os.path.join(data_dir, "input.csv"))

In [None]:
# Preview the first rows of the user input as the cell output.
input_df.head()