# Movie Recommendation System

#### Load the CSV files into dataframes

In [None]:
import os

In [None]:
# Directory that holds the raw input CSV files.
data_dir: str = "data"

# Locations of the ratings and movies tables inside ``data_dir``.
ratings_csv: str
movies_csv: str
ratings_csv, movies_csv = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("ratings.csv", "movies.csv")
)

In [None]:
import pandas as pd
import numpy as np
from typing import Any

In [None]:
# Column -> dtype mapping used when parsing the movies CSV.
movies_dtype: dict[str, Any] = dict(movieId=np.int32, title=str, genres=str)

# Load the whole movies table in one go.
movies_df: pd.DataFrame = pd.read_csv(movies_csv, dtype=movies_dtype)

In [None]:
# Rows per chunk; passed as ``chunksize`` when reading the ratings CSV.
DF_CHUNK_SIZE: int = 100_000

In [None]:
from pandas.io.parsers.readers import TextFileReader

In [None]:
# Column -> dtype mapping used when parsing the ratings CSV.
ratings_dtype: dict[str, Any] = dict(
    userId=np.int32,
    movieId=np.int32,
    rating=np.float16,
    timestamp=np.int32,
)

# Passing ``chunksize`` makes read_csv return a reader that yields the
# ratings table piece by piece instead of one DataFrame.
ratings_df_chunks: TextFileReader = pd.read_csv(
    ratings_csv,
    dtype=ratings_dtype,
    chunksize=DF_CHUNK_SIZE,
)

In [None]:
# Preview the first rows of the movies table as loaded from the CSV.
movies_df.head()

#### One-hot encode the genres

In [None]:
# Turn each pipe-delimited genre string into a list of genre names.
movies_df["genres"] = movies_df["genres"].str.split("|")

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Fit a binarizer on the genre lists so the set of genre labels becomes
# available as ``mlb.classes_``. The returned indicator matrix is not stored;
# as the last expression of the cell it is only shown as the cell output.
mlb: MultiLabelBinarizer = MultiLabelBinarizer()
mlb.fit_transform(movies_df["genres"])

In [None]:
# Directory under which this notebook stores its cached artifacts.
cache_dir: str = "cache"

In [None]:
# Path of the JSON file that stores the binarizer's genre labels.
mlb_classes_json: str = os.path.join(cache_dir, "mlb_classes.json")

In [None]:
import json

In [None]:
# Persist the fitted genre labels so the binarizer can later be rebuilt with
# the same label order without refitting on the data.
#
# open(..., "w") does not create missing parent directories, and nothing has
# created ``cache_dir`` at this point, so make sure it exists first.
os.makedirs(cache_dir, exist_ok=True)

# json.dump escapes non-ASCII by default, so the explicit encoding only pins
# down what is already written and keeps the file platform-independent.
with open(mlb_classes_json, "w", encoding="utf-8") as f:
    json.dump(mlb.classes_.tolist(), f, indent=2)

In [None]:
# Rebuild the binarizer from the cached label list so the label order is
# fixed by the JSON file rather than by whatever data it is fitted on.
with open(mlb_classes_json, "r") as classes_file:
    cached_classes = json.load(classes_file)

mlb = MultiLabelBinarizer(classes=cached_classes)

In [None]:
# One 0/1 indicator column per genre label for every movie.
encodings = mlb.fit_transform(movies_df["genres"])

# Pack each row of flags into a single int32: column ``k`` becomes bit ``k``.
# Shift every column to its bit position, then OR the columns together.
bit_positions = np.arange(encodings.shape[1], dtype=np.int32)
genre_bits = np.bitwise_or.reduce(
    encodings.astype(np.int32) << bit_positions,
    axis=1,
    dtype=np.int32,
)

In [None]:
# Replace the per-movie genre lists with the packed integer bitmask.
movies_df["genres"] = genre_bits

In [None]:
# Preview the movies table again, now with ``genres`` stored as a bitmask.
movies_df.head()

#### Merge ratings and movies into a single dataframe

In [None]:
# Attach the movie columns to every ratings chunk via ``movieId``. The left
# join keeps each rating row even when no movie row matches it.
merged_df_chunks: list[pd.DataFrame] = [
    ratings_chunk.merge(movies_df, on="movieId", how="left")
    for ratings_chunk in ratings_df_chunks
]

In [None]:
# Directory that receives one CSV file per merged chunk; created on demand
# (intermediate directories included) and left untouched if it already exists.
merged_df_dir: str = os.path.join(cache_dir, "merged_df")
os.makedirs(merged_df_dir, exist_ok=True)

In [None]:
# Zero-pad the chunk index to the number of digits in the chunk count, so
# that a plain lexicographic sort of the file names yields chunk order.
index_width = len(str(len(merged_df_chunks)))

for chunk_index, merged_chunk in enumerate(merged_df_chunks):
    csv_name = f"merged_df_chunk_{chunk_index:0{index_width}}.csv"
    csv_path = os.path.join(merged_df_dir, csv_name)
    merged_chunk.to_csv(csv_path, index=False)

In [None]:
# Column -> dtype mapping used when reading the cached merged chunks back:
# the four ratings columns followed by the two columns joined from movies.
merged_dtype: dict[str, Any] = dict(
    userId=np.int32,
    movieId=np.int32,
    rating=np.float16,
    timestamp=np.int32,
    title=str,
    genres=np.int32,
)

In [None]:
# Reload the cached merged chunks from disk, in chunk order (the zero-padded
# index in each file name makes the lexicographic sort match chunk order).
#
# os.listdir returns every entry in the directory, so only the files this
# notebook writes are kept; stray entries such as hidden OS/editor files or
# sub-directories would otherwise be handed to read_csv.
merged_df_chunks: list[pd.DataFrame] = []

for file in sorted(os.listdir(merged_df_dir)):
    if not (file.startswith("merged_df_chunk_") and file.endswith(".csv")):
        continue
    file_path = os.path.join(merged_df_dir, file)
    merged_df_chunks.append(pd.read_csv(file_path, dtype=merged_dtype))

In [None]:
# Preview the first rows of the first reloaded merged chunk.
merged_df_chunks[0].head()