# Collaborative filtering

> Recsys algorithm from explicit reviews

In [None]:
#| default_exp collab

In [None]:
#| hide
from nbdev.showdoc import *
from fastdownload import FastDownload
from fastai.tabular.all import *
from fastai.collab import *

In [None]:
#|export
from fastprogress.fastprogress import progress_bar
import pandas as pd
import numpy as np
import torch, torch.nn.functional as F
from torch import tensor
from fastai.collab import to_device, to_cpu, default_device, CategoryMap, DataLoader
from fastcore.all import *

In [None]:
#|export
@patch
def pprint(l: L):
    '''Print every item of the list on its own line, converted with `str`.'''
    lines = [str(item) for item in l]
    print('\n'.join(lines))

class SavePT:
    '''Class to save and load PyTorch models or objects'''
    def save(self, fname: str|Path):
        '''Save the model to a file.'''
        Path(fname).parent.mkdir(parents=True,exist_ok=True)
        torch.save(self, fname)
    def load(self, fname: str|Path):
        '''Load the model from a file.'''
        obj = torch.load(fname, map_location=default_device())
        assert self.__class__ == obj.__class__, f"Class missmatch, wanted {self.__class__}, but file has {obj.__class__}"
        self.__dict__.update(obj.__dict__)

Load data from url

In [None]:
# Base URL of the dataset folder and the names of the four files to fetch from it
url = 'https://raw.githubusercontent.com/MenshikovDmitry/TSU_AI_Course/main/module_1.%20Recommender%2BDevOps/dataset/'
files = ('ratings_train.dat ratings_test.dat movies.dat users.dat').split()
# downloader used by the next cell
d = FastDownload()

In [None]:
# Download every file and keep the local paths in the same order as `files`
paths = L(d.download(url+f) for f in files); paths

(#4) [Path('/home/slakter/.fastdownload/archive/ratings_train.dat'),Path('/home/slakter/.fastdownload/archive/ratings_test.dat'),Path('/home/slakter/.fastdownload/archive/movies.dat'),Path('/home/slakter/.fastdownload/archive/users.dat')]

In [None]:
#|export
def read_movielens(ratings_path: str, movies_path: str) -> pd.DataFrame:
    """
    Load MovieLens ratings and movie titles from `::`-separated files and join them.

    Only the first three columns of the ratings file (userId, movieId, rating) and the
    first two columns of the movies file (movieId, title) are read. The movies file is
    decoded as ISO-8859-1. The frames are merged on their shared column, `movieId`.
    """
    # the multi-character separator needs the python parsing engine
    common = dict(sep='::', engine='python')
    ratings = pd.read_csv(ratings_path, names=['userId', 'movieId', 'rating'], usecols=(0, 1, 2), **common)
    titles = pd.read_csv(movies_path, names=['movieId', 'title'], usecols=(0, 1), encoding='ISO-8859-1', **common)
    return ratings.merge(titles)

In [None]:
# Train and test ratings are both joined against the same movies file (paths[2])
df, df_test = read_movielens(paths[0],paths[2]), read_movielens(paths[1],paths[2])
df.head()

Unnamed: 0,userId,movieId,rating,title
0,3539,2478,5,Three Amigos! (1986)
1,1358,2478,1,Three Amigos! (1986)
2,2565,2478,1,Three Amigos! (1986)
3,4819,2478,2,Three Amigos! (1986)
4,5763,2478,1,Three Amigos! (1986)


In [None]:
# Count distinct users and distinct movie titles in the training frame
df[['userId','title']].nunique()

userId    6040
title     3700
dtype: int64

### Dataloaders

In [None]:
#|export
class TfmdDataset(SavePT):
    '''Dataset of (user, movie) index pairs, with ratings when the frame provides them'''
    def __init__(self, df, movie_map = None, user_map=None):
        # category maps are built from `df` unless the caller passes existing ones
        self.movie_map = ifnone(movie_map, CategoryMap(df.title))
        self.user_map = ifnone(user_map, CategoryMap(df.userId))
        user_idx = self.user_map.map_objs(df.userId)
        movie_idx = self.movie_map.map_objs(df.title)
        # one row per rating: column 0 is the user index, column 1 the movie index
        self.xs = tensor([user_idx, movie_idx]).T
        # targets exist only when the frame has a `rating` column
        if hasattr(df, 'rating'):
            self.ys = tensor(df.rating, dtype=torch.float32)

    def encode(self, movies):
        '''Map movie titles to their indices in `movie_map`'''
        return self.movie_map.map_objs(movies)

    def decode(self, movie_ids):
        '''Map movie indices back to titles'''
        return self.movie_map.map_ids(movie_ids)

    def __getitem__(self, i):
        '''Return `(xs[i], ys[i])`, or the 1-tuple `(xs[i],)` when there are no ratings'''
        if hasattr(self, 'ys'): return (self.xs[i], self.ys[i])
        return (self.xs[i],)

    def __len__(self):
        return len(self.xs)

    @delegates(DataLoader.__init__)
    def dls(self, bs=64, **kwargs):
        '''Wrap this dataset in a `DataLoader` with batch size `bs`; extra kwargs are passed through'''
        return DataLoader(self, bs=bs, **kwargs)

    def test_ds(self, test_df):
        '''Build a dataset from `test_df` that reuses this dataset's movie and user maps'''
        return type(self)(test_df, self.movie_map, self.user_map)

In [None]:
# Training dataset builds the user/movie maps; the test dataset reuses them
ds = TfmdDataset(df)
ds_test = ds.test_ds(df_test)

## Baseline model

For a baseline we can use a basic user-item matrix. First we convert it to a dense matrix. <br>
Since the number of items and users is small, we can manage it.

In [None]:
# Dense matrix indexed by (user index, movie index); entries without a rating are 0
A = to_device(torch.sparse_coo_tensor(ds.xs.T,ds.ys,dtype=torch.float32).to_dense())

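To predict a user's score for a movie with the user-based model, we compute <br>
$v = A_uA^T$, the similarities between user $u$ and every user (dot products of their rating rows), and <br>
$r = A^T_mv/\text{sum}(v)$, the similarity-weighted average of the ratings given to movie $m$.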

In [None]:
#|export
class CollabUserBased(SavePT):
    '''User-based collaborative filtering on a dense, per-user normalized rating matrix'''
    def __init__(self, device=None): 
        '''Remember the device to work on; `default_device()` is used when `device` is None'''
        self.device = ifnone(device, default_device())
    
    def norm(self, x, m, std=None): 
        '''Center `x` on `m` and scale by `std`, or by `m` itself when no `std` is given'''
        return (x-m)/std if std is not None else (x-m)/m
    def denorm(self, x, m, std=None):
        '''Inverse of `norm` for the same `m` and `std`'''
        return x*std+m if std is not None else x*m+m
    
    def fit(self, ds):
        '''Fit the model to the given dataset.

        Builds the dense matrix from `ds.xs` (index pairs) and `ds.ys` (ratings), stores
        the per-row mean and std of the rated entries in `self.means` / `self.std`, and
        stores the normalized matrix (unrated entries set to 0) in `self.A`.
        '''
        # keep the matrix on `self.device`, the same device `self.std` is created on below
        A = to_device(torch.sparse_coo_tensor(ds.xs.T, ds.ys, dtype=torch.float32).to_dense(), self.device)
        
        # mark unrated entries as NaN so nanmean / nanstd only see real ratings
        A[A==0] = torch.nan
        self.means = A.nanmean(dim=1)
        self.std = tensor(np.nanstd(to_cpu(A), 1), device=self.device)
        A = self.norm(A, self.means[:,None], self.std[:,None])
        # NaNs (unrated entries and any 0/0 from the normalization) go back to zero
        A[A.isnan()] = 0
        self.A = A

    def predict(self, xb, yb=None):
        '''Predict the ratings for a batch and calculate the loss if `yb` is given.

        `xb` holds one (user index, movie index) pair per row.
        Returns the predicted ratings, or `(ratings, mse_loss)` when `yb` is passed.
        '''
        u, m = xb.T
        # rows of A for the batch users, columns of A for the batch movies
        u, m = self.A[u], self.A[:,m].T
        # cosine similarity: unit-normalize the batch rows and every row of A
        u /= u.norm(dim=1)[:,None]
        normed = (self.A/self.A.norm(dim=1)[:,None]).T
        # per sample: similarities to all users dotted with those users' normalized ratings
        # of the movie, scaled by the square root of the number of non-zero ratings
        ratings = torch.bmm((u @ normed)[:,None,:], m[...,None]).squeeze()/torch.count_nonzero(m, dim=1)**0.5
        # back to the rating scale using each batch user's own mean and std
        ratings = self.denorm(ratings,  self.means[xb.T[0]], self.std[xb.T[0]])
        if yb is not None: return (ratings, F.mse_loss(ratings,yb))
        return ratings

    def recommend(self, movies: tensor, ratings: tensor, topk: int=5, filter_seen=True):
        '''Recommend topk movies based on the given ratings. \n
        If filter_seen is True, the movies that are already rated will be filtered out.

        Returns the `topk` result when `filter_seen` is False, otherwise a
        `(scores, movie indices)` tuple of at most `topk` entries.
        '''
        u = self.user_embed(movies, ratings)
        # dividing the scores by the per-movie rating count, (self.A!=0).sum(0), works for
        # rating prediction but not for recommendations, so the raw scores are kept here
        res = self.denorm(((self.A @ u) @ self.A), ratings.mean())
        if not filter_seen: return res.topk(topk)
        # over-fetch so that `topk` items can remain after dropping the rated movies;
        # clamp to the number of movies so topk never asks for more than exist
        res = res.topk(min(topk + len(movies), res.shape[-1]))
        mask = ~torch.isin(res.indices,movies)
        return (res[0][mask][:topk], res[1][mask][:topk])

    def user_embed(self, movies: tensor, ratings: tensor):
        '''Build a rating vector over all movies: zeros, except the given movies,
        which get their ratings normalized by the mean of `ratings`'''
        emb = torch.zeros(self.A.shape[-1], device=self.device)
        emb[movies] = self.norm(ratings, ratings.mean())
        return emb

    def similar_movies(self, movie_id: int, topk=5):
        '''Return topk similar movies to the given movie_id'''
        # fetch one extra and drop the first hit, which is expected to be the movie itself
        return (self.A[:,movie_id].squeeze(-1) @ self.A).topk(topk+1).indices[1:]

In [None]:
# Fit the baseline on the training dataset
model = CollabUserBased()
model.fit(ds)

In [None]:
# Persist the fitted model and the dataset (with its category maps) under ./models
model.save('./models/model.pt')
ds.save('./models/ds.pt')

In [None]:
# Take a single batch (default batch size) and move it to the default device
dls = ds.dls()
xb, yb = to_device(dls.one_batch())

In [None]:
# Element 1 of the returned tuple is the MSE loss for this batch
model.predict(xb,yb)[1]

tensor(1.3795, device='cuda:0')

In [None]:
# Toy profile: ids of titles containing 'star wars' followed by movie ids 1, 2, 3;
# the trailing [2:] drops the first two entries of that list
movies = tensor([ds.encode([s])[0] for s in ds.movie_map if 'star wars' in s.lower()] + [1, 2, 3], device=model.device)[2:]
# rate everything 5 except the last three entries, which get 1
ratings = tensor([5] * (len(movies) - 3) + [1] * 3, device=model.device, dtype=torch.float)

In [None]:
# Show the titles behind the encoded movie ids
ds.decode(movies).pprint()

Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode VI - Return of the Jedi (1983)
'Night Mother (1986)
'Til There Was You (1997)
'burbs, The (1989)


In [None]:
# Element 1 of the result holds the recommended movie indices; decode them to titles
ds.decode(model.recommend(movies,ratings,10)[1]).pprint()

Star Wars: Episode IV - A New Hope (1977)
Raiders of the Lost Ark (1981)
Matrix, The (1999)
Saving Private Ryan (1998)
Shawshank Redemption, The (1994)
Princess Bride, The (1987)
Sixth Sense, The (1999)
Braveheart (1995)
Indiana Jones and the Last Crusade (1989)
Godfather, The (1972)


In [None]:
# Encoded movie id to inspect; decode shows which title it maps to
m_id = 3149
ds.decode([m_id])

(#1) ['Star Wars: Episode V - The Empire Strikes Back (1980)']

In [None]:
# Titles of the movies most similar to `m_id`
ds.decode(model.similar_movies(m_id)).pprint()

Star Wars: Episode IV - A New Hope (1977)
Star Wars: Episode VI - Return of the Jedi (1983)
Raiders of the Lost Ark (1981)
Matrix, The (1999)
Godfather, The (1972)


## Main service

In [None]:
#|export
class ModelService:
    '''Service class for model training, evaluation and predictions. It also provides methods for saving and loading the model.'''
    def __init__(self, model: CollabUserBased=None, ds=None):
        '''Keep the model and the dataset (whose maps encode/decode movie titles)'''
        self.model = model
        self.ds = ds
    def _movie_enc(self, movies): 
        '''Return movie indices as a tensor on the model device.

        Titles (decided by the type of the first element) are encoded through the
        dataset; anything else is assumed to be indices already.
        '''
        return tensor(self.ds.encode(movies) if isinstance(movies[0],str) else movies, device=self.model.device)
    
    def save(self, dir):
        '''Save the dataset and the model as `ds.pt` and `model.pt` inside `dir`'''
        dir = Path(dir)
        self.ds.save(dir/'ds.pt')
        self.model.save(dir/'model.pt')
    
    @classmethod
    def load(cls, dir, model):
        '''Create a service from `dir`: `model` is filled in place from `model.pt`,
        the dataset is read from `ds.pt`'''
        dir = Path(dir)
        model.load(dir/'model.pt')
        # NOTE(review): torch.load unpickles arbitrary objects - only load files you trust
        ds = torch.load(dir/'ds.pt')
        return cls(model, ds)
        
    def train(self, ds=None, model = None):
        '''Train model from scratch on dataset.

        `model` replaces the stored model when given; `ds` falls back to the stored dataset.
        '''
        self.model = ifnone(model, self.model)
        ds = ifnone(ds,self.ds)
        self.model.fit(ds)
    
    def pred(self, ds=None, bs=8192):
        '''Get rating predictions for dataset (the stored one when `ds` is None) as a list'''
        dls = ifnone(ds,self.ds).dls(bs)
        # pass only the inputs (b[0]): predict then always returns the rating tensor,
        # so datasets without a rating column work as well
        preds = torch.cat([self.model.predict(to_device(b[0], self.model.device)) for b in progress_bar(dls)])
        return preds.tolist()

    def eval(self, ds=None, bs=8192):
        '''Evaluate RMSE for dataset (the stored one when `ds` is None)'''
        ds = ifnone(ds,self.ds)
        dls = ds.dls(bs)
        # per-batch MSE times batch size is that batch's sum of squared errors;
        # these are summed (not averaged) before dividing by the number of samples
        sse = torch.stack([self.model.predict(*to_device(b, self.model.device))[1]*len(b[0]) for b in progress_bar(dls)]).sum()
        return torch.sqrt(sse/len(ds)).item()

    def recommend(self, movies: list, ratings: list, topk=5, filter_seen=True):
        '''Recommend top k movies for a user described by a list of movies and ratings.

        `movies` may hold titles or movie indices; the recommendations are returned
        decoded through the dataset.
        '''
        movies = self._movie_enc(movies)
        ratings = tensor(ratings, device=self.model.device, dtype=torch.float)
        return self.ds.decode(self.model.recommend(movies, ratings, topk, filter_seen)[1])

    def similar_movies(self, movie:str, topk=5):
        '''Find top k similar movies'''
        movie = self._movie_enc([movie])
        ms = self.model.similar_movies(movie, topk)
        return self.ds.decode(ms)

## Measure metrics

In [None]:
# Restore the dataset and model files from ./models into a fresh model instance
serv = ModelService.load('./models', CollabUserBased())

In [None]:
#|eval: false
# RMSE on the held-out test dataset
serv.eval(ds_test, bs=4096*4)

1.025577187538147

In [None]:
#|eval: false
# First five predicted ratings for the test dataset
serv.pred(ds_test)[0:5]

[3.2293689250946045,
 3.1532304286956787,
 3.6166019439697266,
 3.4224424362182617,
 3.27913236618042]