# Movie Recommendation System

In [None]:
import os
import numpy as np
from typing import Type

In [None]:
# Directory holding the CSV inputs, and the path of each file inside it.
DATA_DIRECTORY: str = "data"
MOVIES_CSV: str = os.path.join(DATA_DIRECTORY, "movies.csv")
INPUT_CSV: str = os.path.join(DATA_DIRECTORY, "input.csv")
RATINGS_CSV: str = os.path.join(DATA_DIRECTORY, "ratings.csv")

# Column name -> dtype used when parsing movies.csv.
MOVIES_DTYPE: dict[str, Type] = dict(movieId=np.int32, title=str, genres=str)
# Column name -> dtype used when parsing ratings.csv.
RATINGS_DTYPE: dict[str, Type] = dict(
    userId=np.int32,
    movieId=np.int32,
    rating=np.float16,
    timestamp=np.int32,
)

# Number of rows per chunk when a CSV is read piecewise.
CHUNK_SIZE: int = 100_000

## 1. Preparing the movie data

In [None]:
import pandas as pd
from scipy.sparse import spmatrix
from typing import Union
from numpy.typing import NDArray
from sklearn.preprocessing import MultiLabelBinarizer

1. Load the movie data from the file "movies.csv" consisting of the columns `movieId`, `title`, and `genres` into the dataframe `movies_df`

In [None]:
# Parse movies.csv with the explicit column dtypes declared in MOVIES_DTYPE.
movies_df: pd.DataFrame = pd.read_csv(filepath_or_buffer=MOVIES_CSV,
                                      dtype=MOVIES_DTYPE)

2. Drop the rows with missing values

In [None]:
# Print the row count on both sides of the drop so any loss is visible.
print(f"Before: {len(movies_df.index)}")
movies_df = movies_df.dropna(axis="index", how="any")
print(f"After: {len(movies_df.index)}")

3. One-hot encode the `genres` column by splitting with "|"

In [None]:
# Each movie's genres arrive as one "|"-separated string; split them into
# lists so the binarizer can emit one indicator column per distinct genre.
genre_lists: pd.Series = movies_df["genres"].str.split("|")
mlb: MultiLabelBinarizer = MultiLabelBinarizer()
encodings: Union[NDArray[np.int32], spmatrix] = mlb.fit_transform(genre_lists)

4. Store the one-hot encodings back into the `genres` column by packing them into a 32-bit integer bitmask, one bit per genre (this works because we have fewer than 32 genres)

In [None]:
# Pack each movie's one-hot genre row into a single int32 bitmask: bit `i` is
# set when column `i` of `encodings` is 1 for that movie.
n_genres: int = encodings.shape[1]
if n_genres > 32:
    # An int32 only has 32 bits; shifting a flag further would silently
    # produce a wrong mask, so fail loudly instead.
    raise ValueError(
        f"Cannot pack {n_genres} genres into a 32-bit mask (maximum is 32)")

# Shift every column to its own bit position (broadcast across rows), then
# OR the columns of each row together.
bit_positions: NDArray[np.int32] = np.arange(n_genres, dtype=np.int32)
genre_bits: NDArray[np.int32] = np.bitwise_or.reduce(
    encodings.astype(np.int32) << bit_positions, axis=1, dtype=np.int32)

movies_df["genres"] = genre_bits

In [None]:
# Display the first rows of `movies_df` as the cell output.
movies_df.head()

## 2. Preparing the input data

1. Load the movie data from the file "input.csv" consisting of the columns `movieId`, `title` and `rating` into the dataframe `input_df`

In [None]:
# No dtype mapping is supplied here, so pandas infers the column types.
input_df: pd.DataFrame = pd.read_csv(filepath_or_buffer=INPUT_CSV)

2. Drop the rows with missing values

In [None]:
# Print the row count on both sides of the drop so any loss is visible.
print(f"Before: {len(input_df.index)}")
input_df = input_df.dropna(axis="index", how="any")
print(f"After: {len(input_df.index)}")

3. Drop the rows whose `movieId` is not present in `movies_df`

In [None]:
print(f"Before: {len(input_df.index)}")
# Keep only the input rows whose movieId also occurs in `movies_df`.
is_known_movie: pd.Series = input_df["movieId"].isin(movies_df["movieId"])
input_df = input_df.loc[is_known_movie]
print(f"After: {len(input_df.index)}")

## 3. Content-based filtering

In [None]:
from collections import defaultdict

1. Pre-compute the `genre_mappings`, a lookup from each `movieId` to its genre bitmask

In [None]:
# movieId -> value of that movie's `genres` column, for direct lookups.
genre_mappings: dict[int, int] = {
    movie_key: genre_mask
    for movie_key, genre_mask in zip(movies_df["movieId"], movies_df["genres"])
}

2. Pre-compute the `genre_strengths`: for each genre bit, the number of input movies that have that genre

In [None]:
# genre_strengths[b] = number of input movies whose genre mask has bit b set.
genre_strengths: defaultdict[int, int] = defaultdict(int)

for movie_id in input_df["movieId"]:
    movie_mask = genre_mappings[movie_id]
    for bit in range(32):
        genre_strengths[bit] += (movie_mask >> bit) & 1

In [None]:
# Display the per-bit counts as the cell output.
genre_strengths

3. Compute the genre similarity scores

In [None]:
# genre_similarities[movieId] = for every input movie, the summed strength of
# the genre bits that movie shares with it.
genre_similarities: defaultdict[int, float] = defaultdict(float)

# Both of these are the same for every catalogue movie, so build them once
# instead of re-iterating the dataframe column inside the outer loop.
# `.get` is used so that reading a strength never inserts a key.
input_genre_masks: list[int] = [
    genre_mappings[input_movie_id] for input_movie_id in input_df["movieId"]
]
bit_weights: list[int] = [genre_strengths.get(bit, 0) for bit in range(32)]

for movie_id, movie_genres in genre_mappings.items():
    for input_mask in input_genre_masks:
        common = input_mask & movie_genres
        genre_similarities[movie_id] += sum(
            weight for bit, weight in enumerate(bit_weights)
            if (common >> bit) & 1
        )

4. Normalize the `genre_similarities` to the range 0 to 1

In [None]:
# Rescale so the best-matching movie scores 1. With no scores at all, or when
# every score is 0, dividing would fail (empty `max`, division by zero), so
# the values are then kept unchanged.
max_score = max(genre_similarities.values(), default=0.0)
if max_score > 0:
    genre_similarities = {
        k: v / max_score for k, v in genre_similarities.items()
    }
else:
    # Still rebind to a plain dict so later lookups never auto-insert keys.
    genre_similarities = dict(genre_similarities)

In [None]:
# Display the normalized genre scores as the cell output.
genre_similarities

## 4. Preparing the rating data

In [None]:
from pandas.io.parsers import TextFileReader
from typing import Generator, Type

In [None]:
def read_csv_in_chunks(
    file_path: str,
    dtype: dict[str, Type],
    chunk_size: int
) -> Generator[pd.DataFrame, None, None]:
    """Lazily read a CSV file in pieces, dropping incomplete rows.

    Args:
        file_path: Path of the CSV file to read.
        dtype: Column-name to dtype mapping forwarded to ``pd.read_csv``.
        chunk_size: Maximum number of rows parsed per chunk.

    Yields:
        One dataframe per chunk, with every row that contains a missing value
        removed; a chunk may therefore hold fewer than ``chunk_size`` rows.
    """
    # Use the reader as a context manager so the underlying file handle is
    # closed when iteration finishes or the generator is closed early.
    with pd.read_csv(file_path, chunksize=chunk_size, dtype=dtype) as reader:
        for chunk in reader:
            yield chunk.dropna()

1. Define `get_ratings_df()`, which streams the rating data from the file "ratings.csv" (columns `userId`, `movieId`, `rating`, `timestamp`) in chunks instead of loading it into a single dataframe

In [None]:
def get_ratings_df() -> Generator[pd.DataFrame, None, None]:
    """Return a fresh chunk generator over the ratings CSV.

    Every call starts a new pass over ``RATINGS_CSV``, parsed with
    ``RATINGS_DTYPE`` in chunks of at most ``CHUNK_SIZE`` rows.
    """
    return read_csv_in_chunks(
        file_path=RATINGS_CSV,
        dtype=RATINGS_DTYPE,
        chunk_size=CHUNK_SIZE,
    )

2. Pre-compute the `rating_mappings`, a lookup from each input `movieId` to its rating in "input.csv"

In [None]:
# movieId -> the rating recorded for that movie in `input_df`. This is a
# plain dict (not a defaultdict), so membership tests and `Series.map` only
# match movies that actually appear in the input.
rating_mappings: dict[int, float] = dict(
    zip(input_df["movieId"], input_df["rating"]))

## 5. Collaborative filtering

1. Compute the user similarity scores

In [None]:
# Per-user accumulators: the summed closeness score, and how many of the
# input movies that user has rated.
user_scores: defaultdict[int, float] = defaultdict(float)
user_counts: defaultdict[int, int] = defaultdict(int)

for chunk in get_ratings_df():
    # Only ratings of movies that also appear in the input are comparable.
    chunk_fil: pd.DataFrame = chunk[chunk["movieId"].isin(rating_mappings)]
    # Closeness of a rating to the input rating of the same movie: 1 for an
    # exact match, shrinking towards 0 as the absolute gap grows.
    rating_gap = abs(chunk_fil["rating"]
                     - chunk_fil["movieId"].map(rating_mappings))
    closeness = 1 / (1 + rating_gap)

    # Walk the two columns directly: `iterrows` builds a Series per row and
    # turns the integer user ids into floats.
    for user_id, score in zip(chunk_fil["userId"], closeness):
        user_scores[user_id] += score
        user_counts[user_id] += 1

In [None]:
# Flatten the accumulators into (userId, summed score) pairs, keeping only
# users with at least one counted rating.
user_similarities: list[tuple[int, float]] = [
    (int(user_key), total)
    for user_key, total in user_scores.items()
    if user_counts[user_key] > 0
]

2. Get the top 10 users from the `user_similarities` and put them in `top_users`

In [None]:
# Order a copy by score, highest first, and keep the leading ten entries.
ranked_users: list[tuple[int, float]] = list(user_similarities)
ranked_users.sort(key=lambda pair: pair[1], reverse=True)
top_users: list[tuple[int, float]] = ranked_users[:10]

In [None]:
# Display the selected (userId, score) pairs as the cell output.
top_users

3. Compute the scores for movies rated by users in `top_users`

In [None]:
# Per-movie accumulators over the ratings given by the top users: the summed
# rating and the number of ratings.
movie_scores: defaultdict[int, float] = defaultdict(float)
movie_counts: defaultdict[int, int] = defaultdict(int)

# `top_users` holds (userId, score) tuples; `isin` must be given the bare
# ids, otherwise the rows are not filtered by userId as intended.
top_user_ids: set[int] = {user_id for user_id, _ in top_users}

for chunk in get_ratings_df():
    chunk_fil: pd.DataFrame = chunk[chunk["userId"].isin(top_user_ids)]

    # Walk the two columns directly instead of building a Series per row.
    for movie_id, rating in zip(chunk_fil["movieId"], chunk_fil["rating"]):
        movie_scores[movie_id] += rating
        movie_counts[movie_id] += 1

In [None]:
# Mean rating per movie: summed rating divided by the number of ratings.
similar_movie_scores: dict[int, float] = {
    int(movie_key): total / movie_counts[movie_key]
    for movie_key, total in movie_scores.items()
    if movie_counts[movie_key] > 0
}

4. Normalize the `similar_movie_scores` to the range 0 to 1

In [None]:
# Rescale so the highest mean rating becomes 1. With no scores at all, or a
# non-positive maximum, dividing would fail (empty `max`, division by zero),
# so the scores are then left as they are.
max_scores = max(similar_movie_scores.values(), default=0.0)
if max_scores > 0:
    similar_movie_scores = {
        k: v / max_scores for k, v in similar_movie_scores.items()
    }

## Final Result

1. Compute the final result

In [None]:
# Weighted blend per movie: 0.7 of the genre score plus 0.3 of the
# collaborative score, which counts as 0 when the movie has none.
final_scores: list[tuple[int, float]] = [
    (int(movie_key),
     0.7 * genre_score + 0.3 * similar_movie_scores.get(movie_key, 0))
    for movie_key, genre_score in genre_similarities.items()
]

2. Get the top 10 movies from the `final_scores` and store them in `top_movies`

In [None]:
# Order a copy by blended score, highest first, and keep the leading ten.
ranked_movies: list[tuple[int, float]] = list(final_scores)
ranked_movies.sort(key=lambda pair: pair[1], reverse=True)
top_movies: list[tuple[int, float]] = ranked_movies[:10]

In [None]:
# Print the title and blended score of each selected movie. The loop variable
# is `movie_id` rather than `id` so the built-in `id()` is not shadowed.
for movie_id, score in top_movies:
    matching_titles = movies_df.loc[movies_df["movieId"] == movie_id, "title"]
    print(matching_titles.iloc[0], score)