# Collaborative filtering

> Recsys algorithm from explicit reviews

In [1]:
#| default_exp collab

In [1]:
#| hide
from nbdev.showdoc import *
from fastcore.all import *
from fastprogress import fastprogress
from fastdownload import FastDownload
from fastai.tabular.all import *
from fastai.tabular.all import *
from fastai.collab import *

In [2]:
#|export
import torch, torch.nn.functional as F
from torch import tensor
from fastai.learner import to_device

Load the data from a URL

In [250]:
# Base URL of the dataset folder and the names of the files to fetch from it.
url = 'https://raw.githubusercontent.com/MenshikovDmitry/TSU_AI_Course/main/module_1.%20Recommender%2BDevOps/dataset/'
files = ['ratings_train.dat', 'ratings_test.dat', 'movies.dat', 'users.dat']
# Downloader used by the next cell to fetch every file.
d = FastDownload()

In [251]:
# Fetch every file and display the resulting local paths (same order as `files`).
paths = [d.download(f'{url}{name}') for name in files]
paths

[Path('/home/slakter/.fastdownload/archive/ratings_train.dat'),
 Path('/home/slakter/.fastdownload/archive/ratings_test.dat'),
 Path('/home/slakter/.fastdownload/archive/movies.dat'),
 Path('/home/slakter/.fastdownload/archive/users.dat')]

In [253]:
# Both rating files share one layout: '::'-separated, of which only the first
# three columns are kept. The multi-character separator requires the python engine.
kwargs = {
    'sep': '::',
    'names': ['userId', 'movieId', 'rating'],
    'usecols': (0, 1, 2),
    'engine': 'python',
}
ratings = pd.read_csv(paths[0], **kwargs)
ratings_test = pd.read_csv(paths[1], **kwargs)

# Only the id and title columns of the movie file are read.
movies = pd.read_csv(
    paths[2],
    sep='::',
    names=['movieId', 'title'],
    usecols=(0, 1),
    engine='python',
    encoding='ISO-8859-1',
)

In [254]:
# Attach movie titles to the train and test ratings; `movieId` is the only
# column the frames have in common, so it is the join key.
df = ratings.merge(movies, on='movieId')
df_test = ratings_test.merge(movies, on='movieId')

# Keep a single training rating per (user, title) pair.
df = df.drop_duplicates(subset=['userId', 'title'])
df.head()

Unnamed: 0,userId,movieId,rating,title
0,3539,2478,5,Three Amigos! (1986)
1,1358,2478,1,Three Amigos! (1986)
2,2565,2478,1,Three Amigos! (1986)
3,4819,2478,2,Three Amigos! (1986)
4,5763,2478,1,Three Amigos! (1986)


### Ratings normalization

In [19]:
# Centre every rating on its user's mean rating. The per-user mean is computed
# once with the built-in 'mean' aggregation and reused, rather than running
# two separate groupby passes through Python lambdas.
# NOTE: this cell is not idempotent - re-running it replaces `means` with the
# mean of the already-centred ratings, so run it exactly once per load.
df['means'] = df.groupby('userId')['rating'].transform('mean')
df['rating'] = df['rating'] - df['means']

In [20]:
# Build the collaborative-filtering DataLoaders with movie titles as the items.
# valid_pct=0.0 requests no validation split; `df_test` is evaluated separately below.
dls = CollabDataLoaders.from_df(
    df,
    item_name='title',
    bs=64,
    valid_pct=0.0,
)

In [21]:
# Number of distinct users and distinct titles in the training frame.
df.loc[:, ['userId', 'title']].nunique()

userId    6040
title     3700
dtype: int64

## Baseline model

As a baseline we can use the basic user-item rating matrix, which we first convert to a dense matrix. <br>
The numbers of users and items are small, so a dense matrix is manageable.

In [22]:
# Dense rating matrix indexed as A[user, item]: the (user, item) index pairs
# from the DataLoaders are the coordinates and the ratings are the values.
# `to_device` moves it to fastai's default device instead of a hard-coded
# 'cuda', so the cell also runs on machines without a GPU.
A = to_device(
    torch.sparse_coo_tensor(
        tensor(dls.xs).T,
        tensor(dls.ys).squeeze(),
        dtype=torch.float32,
    ).to_dense()
)

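To predict a user's score for a movie with the user-based model, we first compute <br>
$v = A_u A^T$, the similarity of user $u$ to every user (dot products of their rating rows), and then <br>
$r = A_m^T v / \text{sum}(v)$, the similarity-weighted average of the ratings in movie $m$'s column $A_m$.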

In [268]:
#|export
class CollabUserBased:
    """User-based collaborative filter over a dense user-item rating matrix.

    `A` is indexed as `A[user, item]`. The similarity between two users is the
    dot product of their rating rows; a predicted rating is the
    similarity-weighted average of all users' ratings for the item.
    """
    def __init__(self, A, device=None):
        """Store `A` on `device` (`device=None` is passed through to `to_device`)."""
        # `to_device` returns the moved tensor instead of moving it in place,
        # so its result has to be kept; an explicit `device` is honoured too.
        self.A = to_device(A, device)

    def predict(self, xb, yb=None, loss=F.mse_loss):
        """Predict ratings for a batch of (user, item) index pairs.

        xb:   integer tensor of shape (batch, 2); column 0 holds user indices,
              column 1 holds item indices.
        yb:   optional targets of shape (batch, 1). When given, the result is
              the tuple `(ratings, loss(ratings, yb[:, 0]))`.
        loss: callable applied to `(ratings, targets)`.

        Returns a tensor of shape (batch,), or the `(ratings, loss)` tuple.
        """
        u, m = xb.T
        # Similarity of every batch user to every user, computed once: (batch, n_users).
        sims = self.A[u] @ self.A.T
        # Ratings every user gave to each batch item: (batch, n_users).
        item_ratings = self.A[:, m].T
        # Row-wise dot product divided by the total similarity. Summing over
        # dim=1 always yields shape (batch,), including for a batch of one.
        # NOTE(review): with mean-centred ratings the similarities can be
        # negative, so the normaliser may be close to zero - confirm this is intended.
        ratings = (sims * item_ratings).sum(dim=1) / sims.sum(dim=1)
        if yb is not None: return (ratings, loss(ratings, yb[:, 0]))
        return ratings

    def recommend(self, user:int, topk=3):
        """Return `torch.topk` (values, indices) of the `topk` best-scoring items for `user`.

        An item's score is the sum of all users' ratings for it, weighted by
        each user's similarity to `user`. Items `user` has already rated are
        not filtered out.
        """
        sims = self.A @ self.A[user]  # similarity of `user` to every user: (n_users,)
        return (sims @ self.A).topk(topk)

In [269]:
# Baseline user-based model built directly on the dense rating matrix.
model = CollabUserBased(A=A)

In [270]:
# Take one training batch, move it to the device, and show the predicted
# ratings together with the loss against the batch targets.
xb, yb = to_device(dls.one_batch())
model.predict(xb, yb=yb)

(tensor([ 0.1274, -0.1226,  0.1162,  0.2500,  0.1718,  0.0423, -0.3676, -0.0969,
          0.0009,  0.0088,  0.6426,  0.0373,  0.1635,  0.0742,  0.0211,  0.1517,
          0.0369,  0.1984, -0.0377, -0.0227,  0.1845,  0.0500,  0.0429, -0.3484,
         -0.0223,  0.0136, -0.1631, -0.1036,  0.2529,  0.0361, -0.0212,  0.3728,
          0.0634,  0.8883, -0.2202, -0.1393,  0.0068,  0.2256, -0.1129, -0.0577,
          0.1371,  0.1907,  0.1258,  0.0430, -0.0528,  0.1193,  0.0505, -0.0168,
         -0.0134,  0.4561,  0.0048, -0.0752, -0.1541, -0.0949, -0.1543,  0.2716,
          0.8590, -0.0770, -0.0194,  0.1488, -0.1002,  0.1175, -0.0505, -0.0448],
        device='cuda:0'),
 tensor(0.9882, device='cuda:0'))

In [276]:
# Top-10 recommendations for the user in the first row of the batch. Only the
# item indices of the `topk` result are needed; reading `.indices` avoids
# assigning to `_`, which would shadow IPython's last-output variable.
recs = model.recommend(xb[0][0], topk=10).indices
# Map the item indices back to movie titles, one per line.
print('\n'.join(dls.classes['title'].map_ids(recs)))

American Beauty (1999)
Fargo (1996)
Star Wars: Episode IV - A New Hope (1977)
Being John Malkovich (1999)
Usual Suspects, The (1995)
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)
L.A. Confidential (1997)
Pulp Fiction (1994)
Raiders of the Lost Ark (1981)
Star Wars: Episode V - The Empire Strikes Back (1980)


In [244]:
# Centre the test ratings on each user's *training* mean rating.
# `df_test` is rebuilt from the raw test ratings so that re-running this cell
# always yields the same frame; updating `df_test` in place would subtract the
# means a second time on a re-run.
user_means = df.drop_duplicates(subset='userId', keep='last').set_index('userId')['means']
df_test = ratings_test.merge(movies, on='movieId')
# Users that do not occur in the training data get NaN for both columns.
df_test['means'] = df_test['userId'].map(user_means)
df_test['rating'] = df_test['rating'] - df_test['means']

In [246]:
# Wrap the normalised test frame in a DataLoader derived from `dls`.
test_dls = dls.test_dl(test_items=df_test)

In [247]:
# Loss of the baseline model on every test batch; `predict` returns
# (ratings, loss) when targets are supplied, so element 1 is the loss.
losses = [
    model.predict(*to_device(batch))[1]
    for batch in test_dls
]

In [248]:
# Unweighted mean of the per-batch losses (each batch counts equally,
# whatever its size).
torch.mean(torch.stack(losses))

tensor(1.6256, device='cuda:0')

## Something smarter