# 1 - RNN Based Ratings Model

## Dependencies

In [None]:
%load_ext autoreload
%load_ext nb_black
%autoreload 2

import sys; sys.path.append('../')

from pathlib import Path

import pandas as pd
import plotly.express as px
import plotly.io as pio
import torch
from sklearn.preprocessing import LabelEncoder
from torch.nn import MSELoss

from src.models.rnn.data import get_dataset
from src.models.rnn.model import RNNRatings
from src.models.rnn.trainer import Trainer
from src.util import metrics
from src.util.data import get_interactions, get_sparsity_factor, get_train_test_ratings
from src.util.discretizer import RatingDiscretizer

# Render plotly figures inline in the notebook.
pio.renderers.default = "notebook"

# Seed torch's global RNG so that a "Restart & Run All" repeats the same run.
# NOTE(review): this seeds torch only; if the train/test split helper draws from
# another RNG (e.g. NumPy) it has to be seeded separately — confirm.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

RATINGS_PATH = Path("../data/ratings_small.csv")
# NOTE(review): OUTPUT_PATH is not referenced by any visible cell, and its file
# name says "embedded_regression" although this notebook trains the RNN model —
# confirm whether it is still needed / correctly named.
OUTPUT_PATH = Path("../models/embedded_regression.pt")

## Data

In [None]:
# Read the raw ratings table from disk.
ratings = pd.read_csv(RATINGS_PATH)

# One label encoder per id column, fitted on the full table (i.e. before the
# train/test split) so that both splits are encoded with the same mapping.
user_encoder = LabelEncoder().fit(ratings["userId"].values)
movie_encoder = LabelEncoder().fit(ratings["movieId"].values)

In [None]:
# Rescale the ratings by the largest rating present in the table.
ratings["rating"] = ratings["rating"] / ratings["rating"].values.max()

train_ratings, test_ratings = get_train_test_ratings(ratings)

# Order each split by ascending timestamp.
train_ratings = train_ratings.sort_values("timestamp")
test_ratings = test_ratings.sort_values("timestamp")

In [None]:
# Build the interactions for each split (train first, then test) with the
# shared user/movie encoders.
train_interactions, test_interactions = (
    get_interactions(split_ratings, user_encoder, movie_encoder)
    for split_ratings in (train_ratings, test_ratings)
)

# Sparsity factor of each set of interactions (train first, then test).
train_sparsity, test_sparsity = map(
    get_sparsity_factor, (train_interactions, test_interactions)
)

In [None]:
# Report both sparsity factors as percentages with three decimals.
print(
    f"Train sparsity: {(train_sparsity * 100):.3f}%",
    f"Test sparsity: {(test_sparsity * 100):.3f}%",
    sep="\n",
)

In [None]:
# Binarization is used only to validate the ranking metrics further down; the
# discretized frames are not passed to the trainer in this notebook.
# The discretizer is fitted on the training split and then applied to the test split.
# NOTE(review): this cell runs before the id columns are label-encoded below, so
# if the discretizer returns new frames they keep the raw ids — confirm that is
# what the metrics helpers expect.

rating_discretizer = RatingDiscretizer()
train_discretized_ratings = rating_discretizer.fit_transform(train_ratings)
test_discretized_ratings = rating_discretizer.transform(test_ratings)

In [None]:
# Replace the raw user and movie ids with their label-encoded values, in place,
# for both splits (user ids first, then movie ids; train before test).
# NOTE: the columns are overwritten, so run this cell only once per kernel
# session — a second run would feed already-encoded ids back to the encoders.
for id_column, id_encoder in (("userId", user_encoder), ("movieId", movie_encoder)):
    for split_ratings in (train_ratings, test_ratings):
        split_ratings[id_column] = id_encoder.transform(split_ratings[id_column].values)

## Model

In [None]:
# Ratings model built from the training interactions and the fitted encoders.
model = RNNRatings(
    train_interactions,
    n_factors=10,
    user_encoder=user_encoder,
    movie_encoder=movie_encoder,
)
model.to(DEVICE)

# Trainer settings gathered in one mapping, then expanded into the constructor.
trainer_settings = dict(
    loss=MSELoss(),
    regularizers=[],
    lr=1e-3,
    weight_decay=1e-7,
    epochs=5,
    batch_size=1_000,
)
trainer = Trainer(**trainer_settings)

In [None]:
# Build one dataset per split (train first, then test); DEVICE is handed to
# get_dataset for both.
train_dataset, test_dataset = (
    get_dataset(split_ratings, DEVICE)
    for split_ratings in (train_ratings, test_ratings)
)

In [None]:
# Train the model; the test dataset is passed alongside the training one
# (presumably for per-epoch evaluation — confirm in Trainer.fit).
trainer.fit(model, train_dataset, test_dataset)

In [None]:
# Loss records collected by the trainer.
loss_history = trainer.get_loss_history()

# "value" against "epoch", one coloured line per entry of the "loss" column.
fig = px.line(
    loss_history,
    x="epoch",
    y="value",
    color="loss",
    title="Convergence",
    labels={"epoch": "Epochs", "loss": "Loss", "value": "MSE"},
)
fig.show()

In [None]:
# Put the model in eval mode before computing the ranking metrics.
model.eval()

# Gradient tracking is disabled while the metric is computed.
with torch.no_grad():
    mean_reciprocal_rank, reciprocal_ranks = metrics.mean_reciprocal_rank(
        test_discretized_ratings, model
    )

In [None]:
# NDCG on the discretized test ratings, again without gradient tracking.
with torch.no_grad():
    mean_ndcg, ndcg_ranks = metrics.mean_ndcg(test_discretized_ratings, model)

In [None]:
# Report both mean ranking metrics as percentages with two decimals.
print(
    f"Mean Reciprocal Rank: {(mean_reciprocal_rank * 100):.2f}%",
    f"Mean NDCG: {(mean_ndcg * 100):.2f}%",
    sep="\n",
)

In [None]:
# Histogram of the reciprocal ranks returned by metrics.mean_reciprocal_rank,
# with a box plot in the margin.
fig = px.histogram(
    x=reciprocal_ranks,
    marginal="box",
    title="Reciprocal Rank Distribution",
    labels={"x": "Reciprocal Rank"},
)
fig.show()

In [None]:
# Histogram of the NDCG values returned by metrics.mean_ndcg, with a box plot
# in the margin. The title and axis label name NDCG (they previously repeated
# the reciprocal-rank wording, which mislabelled this figure).
fig = px.histogram(
    x=ndcg_ranks,
    marginal="box",
    title="NDCG Distribution",
    labels={"x": "NDCG"},
)

fig.show()