# RecSys

In [None]:
! pip install torch==1.8.0 catalyst[ml]==21.03

In [None]:
import numpy as np
import torch
from torch import nn, optim
from torch.nn import functional as F
from catalyst import dl, metrics, utils

### AUC

In [None]:
# AUC on a toy ranking: ten scores from 0.9 down to 0.0 with one binary label
# each. Both tensors are reshaped to column vectors of shape (10, 1).
outputs = torch.tensor(
    [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.0]
).unsqueeze(1)
targets = torch.tensor([0, 1, 1, 1, 1, 1, 1, 0, 0, 0]).unsqueeze(1)
metrics.auc(outputs=outputs, targets=targets)

### Metrics

Consider the following example. We have 6 documents, and our model predicts an ordering of them. We also asked some users to rate how relevant each document is. The model's prediction is `order`, and the human relevance score is `rel_score`.

In [None]:
# Positions (1 = first) at which the model placed the six documents.
order = np.array([1, 2, 3, 4, 5, 6])
# Turn positions into scores: a higher score means a higher rating.
our_score = 1 / order
# Relevance grades for the same six documents.
rel_score = np.array([3, 2, 3, 0, 1, 2])

# Batched torch copies with a leading axis of size 1, i.e. shape (1, 6).
t_our_score = torch.tensor(our_score[np.newaxis, :])
t_rel_score = torch.tensor(rel_score[np.newaxis, :])

In [None]:
# Binarise the grades by integer division by 3: with the grades defined
# earlier, only the two documents graded 3 stay relevant for this user.
user_rel_score = np.floor_divide(rel_score, 3)
# Same values as a batched tensor with a leading axis of size 1.
user_t_rel_score = torch.tensor(user_rel_score[np.newaxis, :])
print(f"New rel_score: {user_rel_score}")

In [None]:
# Show the shapes of the score and relevance tensors side by side; each was
# built from a single six-element array wrapped in one extra dimension.
t_our_score.shape, user_t_rel_score.shape

### MRR

[![logo](https://raw.githubusercontent.com/Scitator/sirius-ml/main/mrr.png)]()

In [None]:
# Mean reciprocal rank of the toy ranking at cut-offs 1 and 3.
# `outputs` must carry the model scores and `targets` the ground-truth
# relevance; the two arguments were previously swapped, which ranked the
# documents by their labels and treated the model scores as relevance.
metrics.mrr(
    outputs=t_our_score,
    targets=user_t_rel_score,
    topk=[1, 3],
)

In [None]:
# MRR for a batch of two score rows that share the same relevance labels.
metrics.mrr(
    outputs=torch.tensor(
        [
            [4.0, 2.0, 3.0, 1.0],
            [1.0, 2.0, 3.0, 4.0],
        ]
    ),
    targets=torch.tensor(
        [
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
        ]
    ),
    topk=[1, 3],
)

### HitRate

In [None]:
# Manual reference value: the mean of the binarised relevance vector.
hitrate = user_rel_score.mean()
# Compare against the exact fraction. The truncated literal 0.33333 differs
# from 1/3 by ~3.33e-6, which np.isclose's default tolerance
# (atol=1e-8 + rtol=1e-5 * 0.33333 ~= 3.34e-6) accepted only marginally.
assert np.isclose(hitrate, 1 / 3)
print(f"HitRate: {hitrate}")

In [None]:
# Library hit rate of the model scores against the binarised relevance,
# evaluated at cut-offs 1, 3, 5 and 6.
hitrate_at_k = metrics.hitrate(t_our_score, user_t_rel_score, topk=(1, 3, 5, 6))
print(f"HitRate: {hitrate_at_k}")

### MAP

[![logo](https://raw.githubusercontent.com/Scitator/sirius-ml/main/map.png)]()

In [None]:
# Mean average precision at 10 for two users who receive the same descending
# scores 9..0 but have different relevant items.
metrics.mean_average_precision(
    outputs=torch.arange(9, -1, -1).repeat(2, 1),
    targets=torch.tensor(
        [
            [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0],
            [0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0],
        ]
    ),
    topk=[10],
)

### DCG / NDCG

[![logo](https://raw.githubusercontent.com/Scitator/sirius-ml/main/dcg.png)]()

In [None]:
# NDCG@4 with the "linear_rank" gain: the scores increase together with the
# relevance grades.
metrics.ndcg(
    outputs=torch.tensor([[0.0, 1.0, 2.0, 3.0]]),
    targets=torch.tensor([[0.0, 1.0, 2.0, 2.0]]),
    topk=[4],
    gain_function="linear_rank",
)

In [None]:
# Same relevance grades as the previous NDCG@4 example, but the scores now
# put the last document (grade 2) below the second one (grade 1).
metrics.ndcg(
    outputs=torch.tensor([[0.0, 2.0, 3.0, 1.0]]),
    targets=torch.tensor([[0.0, 1.0, 2.0, 2.0]]),
    topk=[4],
    gain_function="linear_rank",
)

---

### Minimal Example

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from catalyst import dl

# Synthetic data: uniform random user features and random binary item labels
# (1.0 where a uniform draw exceeds 0.5).
num_users, num_features, num_items = 10_000, 10, 10
X = torch.rand(num_users, num_features)
y = torch.rand(num_users, num_items).gt(0.5).float()

# One loader object serves as both the "train" and the "valid" stage.
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=32, num_workers=1)
loaders = dict(train=loader, valid=loader)

# Linear scorer producing one logit per item, trained with BCE on logits;
# the learning-rate schedule has a single milestone at epoch 2.
model = torch.nn.Linear(in_features=num_features, out_features=num_items)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2])

class CustomRunner(dl.Runner):
    """Runner that stores features, logits, sigmoid scores and targets per batch."""

    def handle_batch(self, batch):
        """Run the model on one ``(features, targets)`` batch.

        Replaces ``self.batch`` with a dict holding the inputs, the raw
        logits, their sigmoid ("scores") and the targets, keyed by the names
        the callbacks are configured with.
        """
        features, targets = batch
        logits = self.model(features)
        scores = torch.sigmoid(logits)
        self.batch = dict(
            features=features, logits=logits, scores=scores, targets=targets
        )

# Train for three epochs; the loss is computed on the raw logits, every
# ranking metric on the sigmoid scores at cut-offs 1, 3 and 5.
runner = CustomRunner()
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    num_epochs=3,
    verbose=True,
    callbacks=[
        dl.CriterionCallback(
            input_key="logits", target_key="targets", metric_key="loss"
        ),
        dl.AUCCallback(input_key="scores", target_key="targets"),
        dl.HitrateCallback(
            input_key="scores", target_key="targets", topk_args=(1, 3, 5)
        ),
        dl.MRRCallback(
            input_key="scores", target_key="targets", topk_args=(1, 3, 5)
        ),
        dl.MAPCallback(
            input_key="scores", target_key="targets", topk_args=(1, 3, 5)
        ),
        dl.NDCGCallback(
            input_key="scores", target_key="targets", topk_args=(1, 3, 5)
        ),
        dl.OptimizerCallback(metric_key="loss"),
        dl.SchedulerCallback(),
        # Checkpoints go to ./logs, selected on the "map01" metric of the
        # "valid" loader with minimize=False (higher is better).
        dl.CheckpointCallback(
            logdir="./logs",
            loader_key="valid",
            metric_key="map01",
            minimize=False,
        ),
    ],
)

### Movie Lens Dataset

In [None]:
from catalyst.contrib.datasets import MovieLens

# Build the train split first, then the test split, both rooted in the
# current directory with download=True.
train_dataset, test_dataset = (
    MovieLens(root=".", train=is_train, download=True)
    for is_train in (True, False)
)

In [None]:
from typing import Dict, Any, Sequence
from catalyst.utils import get_loader


def dist_transform(row: Dict[str, Any]) -> Dict[str, Any]:
    """Expand one user's rating vector into per-movie training triples.

    Args:
        row: mapping with ``"user_id"`` and ``"raitings"``; the ratings are
            indexed along their first dimension (assumed to be a 1-D tensor
            with one entry per movie — TODO confirm against the dataset).

    Returns:
        Dict with ``"user_ids"`` (the user id repeated once per rated movie),
        ``"movie_ids"`` (positions whose rating is greater than zero) and
        ``"targets"`` (those ratings divided by 5.0, as float32).
    """
    all_ratings = row["raitings"]
    # Entries that are not strictly positive are dropped.
    is_rated = all_ratings > 0

    positions = torch.arange(all_ratings.size(0))
    movie_ids = positions[is_rated]

    # Broadcast the user id over a zero tensor shaped like movie_ids.
    id_template = torch.zeros_like(movie_ids).type(torch.LongTensor)
    user_ids = id_template + row["user_id"]

    scaled = all_ratings[is_rated] / 5.0
    targets = scaled.type(torch.FloatTensor)

    return dict(user_ids=user_ids, movie_ids=movie_ids, targets=targets)


def collate_fn(
    batch: Sequence[Dict[str, torch.Tensor]]
) -> Dict[str, torch.Tensor]:
    """Merge per-user samples into one flat batch.

    For each of ``"user_ids"``, ``"movie_ids"`` and ``"targets"`` the tensors
    of all samples are concatenated with ``torch.cat`` (along dimension 0),
    so the result holds one entry per (user, movie) pair in the batch.
    """
    fields = ("user_ids", "movie_ids", "targets")
    return {
        field: torch.cat([sample[field] for sample in batch])
        for field in fields
    }

In [None]:
# One index per row of the train split; each index is treated as a user id.
user_indexes = torch.arange(len(train_dataset))

# Both loaders walk the same user indexes and differ only in the split the
# ratings are read from. With batch_size=1 every batch holds one user's
# ratings, flattened by collate_fn.
train_dataloader = get_loader(
    user_indexes,
    open_fn=lambda x: {"user_id": x, "raitings": train_dataset[x]},
    dict_transform=dist_transform,
    collate_fn=collate_fn,
    batch_size=1,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

valid_dataloader = get_loader(
    user_indexes,
    open_fn=lambda x: {"user_id": x, "raitings": test_dataset[x]},
    dict_transform=dist_transform,
    collate_fn=collate_fn,
    batch_size=1,
    shuffle=True,
    drop_last=True,
    num_workers=4,
)

### Funk SVD

In [None]:
import torch
import torch.nn as nn


class FunkSVD(nn.Module):
    """Matrix-factorisation scorer with user, item and global bias terms.

    The score of a (user, item) pair is the dot product of the two embedding
    vectors plus the user bias, the item bias and one shared bias parameter.

    Args:
        user_num: number of rows in the user embedding and user bias tables.
        item_num: number of rows in the item embedding and item bias tables.
        embedding_dim: size of each user / item embedding vector.
    """

    def __init__(self, user_num: int, item_num: int, embedding_dim: int):
        super().__init__()

        self.user_embeddings = nn.Embedding(user_num, embedding_dim)
        self.item_embeddings = nn.Embedding(item_num, embedding_dim)

        # Scalar bias per user and per item, stored as 1-dimensional embeddings.
        self.user_bias = nn.Embedding(user_num, 1)
        self.item_bias = nn.Embedding(item_num, 1)

        # Global offset shared by every pair, initialised to zero.
        self.bias = torch.nn.Parameter(torch.zeros(1))
        self.embedding_dim = embedding_dim

    def forward(
        self, user_ids: torch.Tensor, movie_ids: torch.Tensor
    ) -> torch.Tensor:
        """Score each (user, movie) pair.

        Args:
            user_ids: indices into the user tables (assumed 1-D, one entry
                per pair — the einsum below requires 2-D embeddings).
            movie_ids: indices into the item tables, aligned with ``user_ids``.

        Returns:
            1-D tensor with one raw score per pair.
        """
        user_embedding = self.user_embeddings(user_ids)
        user_bias = self.user_bias(user_ids).reshape(-1)
        item_embedding = self.item_embeddings(movie_ids)
        item_bias = self.item_bias(movie_ids).reshape(-1)
        # Row-wise dot product: both operands must share the feature index.
        # The former "oi,oj->o" subscripts summed each vector on its own and
        # multiplied the two sums, which is not a dot product.
        dot = torch.einsum("oi,oi->o", user_embedding, item_embedding)
        output = dot + user_bias + item_bias + self.bias
        return output

In [None]:
# One embedding row per entry of the train split (users) and per entry of a
# single row of it (movies); 16-dimensional factors.
model = FunkSVD(
    user_num=len(train_dataset),
    item_num=len(train_dataset[0]),
    embedding_dim=16,
)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.1)

In [None]:
def custom_ndcg(logits, targets):
    """Return NDCG@5 of one batch as a Python number.

    Both tensors are reordered by descending logit, reshaped to a single row
    and passed to ``metrics.ndcg`` with ``topk=(5,)``; the first (only
    requested) cut-off is returned via ``.item()``. ``torch.take`` indexes
    the flattened input, so the tensors are assumed to be 1-D — TODO confirm.
    """
    ranking = torch.argsort(logits, descending=True)
    ranked_logits = torch.take(logits, ranking).reshape(1, -1)
    ranked_targets = torch.take(targets, ranking).reshape(1, -1)
    per_cutoff = metrics.ndcg(ranked_logits, ranked_targets, topk=(5, ))
    return per_cutoff[0].item()

In [None]:
from pathlib import Path
from datetime import datetime

# input_key names the batch entries handed to the model (presumably as the
# arguments of its forward method).
runner = dl.SupervisedRunner(input_key=["user_ids", "movie_ids"])

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders={"train": train_dataloader, "valid": valid_dataloader},
    # Every run logs into its own timestamped sub-directory of "logs".
    logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S"),
    num_epochs=3,
    valid_loader="valid",
    valid_metric="loss",
    verbose=True,
    callbacks=[
        # Report custom_ndcg on the logits under the name "custom_ndcg05".
        dl.FunctionalMetricCallback(
            metric_function=custom_ndcg,
            metric_name="custom_ndcg05",
            input_key="logits",
            target_key="targets",
        ),
        # NOTE(review): accumulation_steps=64 presumably accumulates gradients
        # over 64 batches before each optimizer step — confirm.
        dl.OptimizerCallback(metric_key="loss", accumulation_steps=64),
    ],
)

---

### Neural Collaborative Filtering
The second method also computes user and item embeddings. To score the relevance of a user-item pair, we concatenate the two embedding vectors and pass them through a neural network.

In [None]:
class NCF(nn.Module):
    """Scorer that feeds concatenated user and item embeddings through an MLP.

    Args:
        user_num: number of rows in the user embedding table.
        item_num: number of rows in the item embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: width of the two hidden linear layers.
    """

    def __init__(
        self, user_num: int, item_num: int, embedding_dim: int, hidden_dim: int
    ):
        super().__init__()

        self.user_embeddings = nn.Embedding(user_num, embedding_dim)
        self.item_embeddings = nn.Embedding(item_num, embedding_dim)

        # The MLP input is the user and item vectors side by side, hence the
        # doubled width; the last layer emits a single value per pair.
        mlp = [
            nn.Linear(2 * embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        ]
        self.layers = nn.Sequential(*mlp)

    def forward(
        self, user_ids: torch.Tensor, movie_ids: torch.Tensor
    ) -> torch.Tensor:
        """Return the MLP output for each (user, movie) pair, flattened to 1-D."""
        pair_features = torch.cat(
            [self.user_embeddings(user_ids), self.item_embeddings(movie_ids)],
            dim=-1,
        )
        scores = self.layers(pair_features)
        return scores.view(-1)

In [None]:
# Same table sizes as the dataset: one user row per entry of the train split,
# one item row per entry of a single row of it; 64-d embeddings and hidden
# layers.
model = NCF(
    user_num=len(train_dataset),
    item_num=len(train_dataset[0]),
    embedding_dim=64,
    hidden_dim=64,
)
optimizer = optim.Adam(params=model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [None]:
# input_key names the batch entries handed to the model (presumably as the
# arguments of its forward method).
runner = dl.SupervisedRunner(input_key=["user_ids", "movie_ids"])

runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    loaders={"train": train_dataloader, "valid": valid_dataloader},
    # Every run logs into its own timestamped sub-directory of "logs".
    logdir=Path("logs") / datetime.now().strftime("%Y%m%d-%H%M%S"),
    valid_loader="valid",
    valid_metric="loss",
    num_epochs=3,
    verbose=True,
)