# Collaborative filtering

> Recsys algorithom from explicit reviews

In [2]:
#| default_exp collab

In [3]:
#| hide
from nbdev.showdoc import *
from fastcore.all import *
from fastprogress.fastprogress import progress_bar
from fastdownload import FastDownload
from fastai.tabular.all import *
from fastai.tabular.all import *
from fastai.collab import *

In [4]:
#|export
from fastprogress.fastprogress import progress_bar
import pandas as pd
import torch, torch.nn.functional as F
from torch import tensor
from fastai.collab import to_device, default_device, CollabDataLoaders
from fastcore.all import *

Load data from url

In [5]:
url = 'https://raw.githubusercontent.com/MenshikovDmitry/TSU_AI_Course/main/module_1.%20Recommender%2BDevOps/dataset/'
files = ('ratings_train.dat ratings_test.dat movies.dat users.dat').split()
d = FastDownload()

In [6]:
paths = L(d.download(url+f) for f in files); paths

(#4) [Path('/home/slakter/.fastdownload/archive/ratings_train.dat'),Path('/home/slakter/.fastdownload/archive/ratings_test.dat'),Path('/home/slakter/.fastdownload/archive/movies.dat'),Path('/home/slakter/.fastdownload/archive/users.dat')]

In [7]:
#|export
def read_movielens(ratings_path, movies_path):
    kw1 = dict(sep='::', names = ['userId','movieId','rating'], usecols=(0,1,2), engine='python')
    kw2 = kw1 | dict(names = ['movieId','title'], usecols=(0,1), encoding='ISO-8859-1')
    r, m = pd.read_csv(ratings_path, **kw1), pd.read_csv(movies_path, **kw2)
    return r.merge(m)

In [8]:
df, df_test = read_movielens(paths[0],paths[2]), read_movielens(paths[1],paths[2])
df.head()

Unnamed: 0,userId,movieId,rating,title
0,3539,2478,5,Three Amigos! (1986)
1,1358,2478,1,Three Amigos! (1986)
2,2565,2478,1,Three Amigos! (1986)
3,4819,2478,2,Three Amigos! (1986)
4,5763,2478,1,Three Amigos! (1986)


In [35]:
df[['userId','title']].nunique()

userId    6040
title     3700
dtype: int64

### Dataloaders

In [57]:
movie_map = CategoryMap(df.title)
user_map =  CategoryMap(df.userId)

In [93]:
xs = tensor([ user_map.map_objs(df.userId), movie_map.map_objs(df.title)]).T
ys = tensor(df.rating, dtype=torch.float32)

## Baseline model

For baseline we can use basic user-item matrix. First convert it to dense matrix. <br>
Since number of items and users is small so we can manage it.

In [94]:
A = to_device(torch.sparse_coo_tensor(xs.T,ys,dtype=torch.float32).to_dense())

For user-based model to predict user score for movie, we compute <br>
$v = A_uA^T$ to get user similarities by taking dot products, and <br>
$r = A^T_mv/\text{sum}(v)$

In [95]:
#|export
class SavePkl:
    def save(self,fname):
        Path(fname).parent.mkdir(parents=True,exist_ok=True)
        with open(fname,'wb') as f:
            save_pickle(f, self)
    def load(self,fname):
        with open(fname,'rb') as f:
            self.__dict__.update(load_pickle(f).__dict__)

In [74]:
#|export

class CollabUserBased(SavePkl):
    '''Basic model for collaborative filtering'''
    def __init__(self, device=None): 
        self.device = ifnone(device, default_device())
    
    def fit(self, xs, ys):
        A = to_device(torch.sparse_coo_tensor(xs.T,ys,dtype=torch.float32).to_dense())
        self.means = A.sum(dim=1)/A.count_nonzero(dim=1)
        self.A = A - self.means[:,None] # normalization

    def predict(self, xb, yb=None,loss=F.mse_loss):
        means = self.means[xb[:,0]]
        u, m = xb.T
        ratings = torch.bmm((self.A[u] @ self.A.T)[:,None,:], self.A[:,m].T[...,None]).squeeze()/(self.A[u] @ self.A.T).sum(dim=1)
        if yb is not None: return (ratings, loss(ratings+means,yb[:,0]))
        return ratings+means

    def recommend(self, movies: list, ratings: list, topk=5, filter_seen=True):
        user_emb = self.user_embed(movies,ratings)
        m = self.A[:,user_emb==0] if filter_seen else self.A
        return ((self.A @ user_emb) @ m).topk(topk)

    def user_embed(self, movies: list, ratings: list):
        ratings = tensor(ratings, dtype=torch.float32, device=self.device)
        emb = torch.zeros(self.A.shape[-1], device=self.device)
        emb[movies] = ratings - ratings.mean()
        return emb

    def similar_movies(self,movie_id: int, topk=5):
        return (self.A[:,movie_id] @ self.A).topk(topk+1).indices[1:]

In [75]:
model = CollabUserBased()

In [76]:
model.fit(dls)
model.save('./models/model.pkl')

In [77]:
dls.decode(model.similar_movies(5))

(#5) ['Raiders of the Lost Ark (1981)','Silence of the Lambs, The (1991)','2001: A Space Odyssey (1968)','Psycho (1960)','Basic Instinct (1992)']

In [78]:
model.A[1]

tensor([0., 0., 0.,  ..., 0., 0., 0.], device='cuda:0')

In [79]:
model.load('./models/model.pkl')

In [80]:
xb, yb = to_device(dls.one_batch())
model.predict(xb,yb)[1]

tensor(0.8640, device='cuda:0', dtype=torch.float64)

In [81]:
movies = [1,2,4,5]
ratings = [0,1,0,1]
_, recs = model.recommend(movies,ratings)
dls.classes['title'].map_ids(recs)

(#5) ['Radio Days (1987)','Alvarez Kelly (1966)','Shanghai Noon (2000)','Shower (Xizhao) (1999)','Scarlet Letter, The (1926)']

In [82]:
#|export
def train(model, dls, fname=None):
    model.fit(dls)
    model.save(fname)

def pred(model, dls):
    preds = [model.predict(*to_device(b)) for b in progress_bar(dls)]
    return torch.cat(preds)

def eval(model, dls):
    preds = [model.predict(*to_device(b)) for b in progress_bar(dls)]
    loss = torch.stack(list(L(preds).itemgot(1))).mean()
    return loss

## Test

In [83]:
test_dls = dls.test_dl(df_test,bs=100000)

In [84]:
loss = eval(model,test_dls)
loss

tensor(1.6255, device='cuda:0', dtype=torch.float64)

## Something smarter