# Movie Recommendation System

#### Load the CSV files into DataFrames

In [None]:
import os

In [None]:
# Directory holding the input CSV files; a relative path, so it is resolved
# against the current working directory.
data_dir: str = "data"

In [None]:
# Paths of the ratings and movies CSV files inside data_dir.
ratings_csv: str = os.path.join(data_dir, "ratings.csv")
movies_csv: str = os.path.join(data_dir, "movies.csv")

In [None]:
import pandas as pd
import numpy as np
from typing import Any

In [None]:
# Column dtypes handed to pd.read_csv when loading movies.csv.
movies_dtype: dict[str, Any] = {
    "movieId": np.int32,  # 32-bit id rather than pandas' default int64
    "title": str,
    "genres": str,  # "|"-delimited labels; split into lists in a later cell
}

In [None]:
# Read the whole movies table into memory, using the dtypes in movies_dtype.
movies_df: pd.DataFrame = pd.read_csv(movies_csv, dtype=movies_dtype)

In [None]:
# Number of rows per chunk when ratings.csv is read in pieces.
DF_CHUNK_SIZE: int = 100_000

In [None]:
from pandas.io.parsers.readers import TextFileReader

In [None]:
# Column dtypes handed to pd.read_csv when loading ratings.csv; every column
# is narrower than pandas' 64-bit defaults (presumably to reduce memory use).
ratings_dtype: dict[str, Any] = {
    "userId": np.int32,
    "movieId": np.int32,
    # NOTE(review): float16 has a maximum of 65504 and roughly three
    # significant digits; confirm downstream sums/means upcast first.
    "rating": np.float16,
    # NOTE(review): presumably Unix seconds, which fit in int32 only until
    # 2038; confirm against the data.
    "timestamp": np.int32,
}

In [None]:
# Open ratings.csv lazily: because chunksize is given, read_csv returns a
# reader that yields DataFrames of up to DF_CHUNK_SIZE rows instead of loading
# the whole file. The reader is consumed as it is iterated.
ratings_df_chunks: TextFileReader = pd.read_csv(
    ratings_csv,
    chunksize=DF_CHUNK_SIZE,
    dtype=ratings_dtype,
)

In [None]:
# Preview the first rows of the freshly loaded movies table.
movies_df.head()

#### One-hot encode the genres

In [None]:
# Turn each "|"-delimited genre string into a list of genre labels.
# The column must still contain strings when this runs, so execute it only
# once per load of movies_df.
movies_df["genres"] = movies_df["genres"].str.split("|")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Fit a binarizer on the genre lists to discover the set of genre labels
# (read back later through mlb.classes_). The indicator matrix returned here
# is not stored; it only appears as the cell's output.
mlb: MultiLabelBinarizer = MultiLabelBinarizer()
mlb.fit_transform(movies_df["genres"])

In [None]:
# Directory where the notebook writes cached artifacts; a relative path.
cache_dir: str = "cache"

In [None]:
# Path of the cached CSV that stores the binarizer's class labels.
mlb_classes_csv: str = os.path.join(cache_dir, "mlb_classes.csv")

In [None]:
# Tabulate every genre label alongside its index in mlb.classes_ so that the
# ordering can be written to disk.
mlb_classes_df: pd.DataFrame = pd.DataFrame(
    [
        {"pos": position, "class": label}
        for position, label in enumerate(mlb.classes_)
    ]
)

In [None]:
# Persist the class table. to_csv does not create missing directories, so make
# sure the target directory exists first; a bare filename has an empty dirname
# and needs no directory.
mlb_classes_parent: str = os.path.dirname(mlb_classes_csv)
if mlb_classes_parent:
    os.makedirs(mlb_classes_parent, exist_ok=True)
mlb_classes_df.to_csv(mlb_classes_csv)

In [None]:
# Rebuild the binarizer from the cached class list so the label order matches
# the cache file. pd.read_csv opens the path itself, so no separate file
# handle is needed; the labels are passed as a plain list.
mlb = MultiLabelBinarizer(
    classes=pd.read_csv(mlb_classes_csv)["class"].tolist()
)

In [None]:
# Pack each movie's genre indicator row into one int32 bitmask: bit k is set
# when the movie has the genre in column k of the encoding. An int32 has room
# for at most 32 such flags.
encodings = mlb.fit_transform(movies_df["genres"])
bit_positions = np.arange(encodings.shape[1], dtype=np.int32)
genre_bits = np.bitwise_or.reduce(
    np.left_shift(encodings.astype(np.int32), bit_positions),
    axis=1,
)

In [None]:
# Overwrite the genres column with the packed integer bitmask per movie.
movies_df["genres"] = genre_bits

In [None]:
# Preview the movies table now that genres holds the bitmask values.
movies_df.head()

#### Collect user input

In [None]:
# Load the user's input from input.csv in data_dir.
# NOTE(review): the file's column schema is not visible here; confirm it.
input_df: pd.DataFrame = pd.read_csv(os.path.join(data_dir, "input.csv"))

In [None]:
# Preview the first rows of the user input table.
input_df.head()