# Collaborative filtering

> Recsys algorithm from explicit reviews

In [None]:
#| default_exp collab

In [None]:
#| hide
from nbdev.showdoc import *
from fastcore.all import *
from fastprogress.fastprogress import progress_bar
from fastdownload import FastDownload
from fastai.tabular.all import *
from fastai.tabular.all import *
from fastai.collab import *

In [None]:
#|export
from fastprogress.fastprogress import progress_bar
import pandas as pd
import torch, torch.nn.functional as F
from torch import tensor
from fastai.learner import to_device, default_device
from fastcore.all import *

Load data from url

In [None]:
# Base URL of the dataset folder and the four data files to fetch from it.
url = 'https://raw.githubusercontent.com/MenshikovDmitry/TSU_AI_Course/main/module_1.%20Recommender%2BDevOps/dataset/'
files = ['ratings_train.dat', 'ratings_test.dat', 'movies.dat', 'users.dat']
# Downloader with its default configuration.
d = FastDownload()

In [None]:
# Download each file; `paths` keeps the local paths in the same order as `files`.
paths = L(d.download(url + name) for name in files)
paths

(#4) [Path('/home/slakter/.fastdownload/archive/ratings_train.dat'),Path('/home/slakter/.fastdownload/archive/ratings_test.dat'),Path('/home/slakter/.fastdownload/archive/movies.dat'),Path('/home/slakter/.fastdownload/archive/users.dat')]

In [None]:
#|export
def read_movielens(ratings_path, movies_path):
    """Load a ratings file and a movies file and join them on `movieId`.

    Both files are read as header-less, `::`-separated text. Only the first
    three columns of the ratings file (`userId`, `movieId`, `rating`) and the
    first two columns of the movies file (`movieId`, `title`) are kept; the
    movies file is decoded as ISO-8859-1.

    Returns a DataFrame with columns `userId`, `movieId`, `rating`, `title`
    holding the ratings whose `movieId` also appears in the movies file
    (inner join).
    """
    # A multi-character separator requires pandas' python parsing engine.
    shared = {'sep': '::', 'engine': 'python'}
    ratings = pd.read_csv(
        ratings_path, names=['userId', 'movieId', 'rating'], usecols=(0, 1, 2), **shared
    )
    movies = pd.read_csv(
        movies_path, names=['movieId', 'title'], usecols=(0, 1), encoding='ISO-8859-1', **shared
    )
    # `movieId` is the only column the two frames share.
    return ratings.merge(movies, on='movieId', how='inner')

In [None]:
# Train and test ratings are joined against the same movies file (paths[2]).
df = read_movielens(ratings_path=paths[0], movies_path=paths[2])
df_test = read_movielens(ratings_path=paths[1], movies_path=paths[2])
df.head()

Unnamed: 0,userId,movieId,rating,title
0,3539,2478,5,Three Amigos! (1986)
1,5795,377,2,Speed (1994)
2,3513,3526,4,Parenthood (1989)
3,4176,924,5,2001: A Space Odyssey (1968)
4,4657,1960,4,"Last Emperor, The (1987)"


### Ratings normalization

In [None]:
# Items are keyed by movie title; valid_pct=0.0 requests no validation split.
dls = CollabDataLoaders.from_df(
    df,
    item_name='title',
    bs=64,
    valid_pct=0.0,
)

In [None]:
# Number of distinct users and distinct movie titles in the training frame.
df.loc[:, ['userId', 'title']].nunique()

userId    6040
title     3700
dtype: int64

## Baseline model

As a baseline we can use a basic user-item matrix. First we convert it to a dense matrix. <br>
The number of items and users is small, so this is manageable.

In [None]:
# User-item matrix: a sparse COO tensor whose indices come from `dls.xs` and
# whose values come from `dls.ys`, densified and moved with `to_device`.
A = to_device(
    torch.sparse_coo_tensor(
        tensor(dls.xs.values).T,
        tensor(dls.ys.values).squeeze(),
        dtype=torch.float32,
    ).to_dense()
)

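For the user-based model, to predict a user's score for a movie we compute <br>
$v = A_u A^T$ to get user similarities by taking dot products, and <br>
$r = A_m^T v / \text{sum}(v)$, where $A_u$ is the row of $A$ for user $u$ and $A_m$ is the column of $A$ for movie $m$.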

In [None]:
#|export
class SavePkl:
    "Mixin that pickles an object to disk and reads it back."

    def save(self, fname='./out.pkl'):
        """Pickle `self` to `fname`, creating missing parent directories.

        `fname=None` falls back to the default `./out.pkl`, so callers that
        forward an optional path do not crash in `Path(None)`.
        """
        fname = Path('./out.pkl' if fname is None else fname)
        fname.parent.mkdir(parents=True, exist_ok=True)
        with open(fname, 'wb') as f:
            save_pickle(f, self)

    @classmethod
    def load(cls, fname):
        """Return the object pickled at `fname`.

        This is a classmethod because it never reads instance state: it can be
        called on the class (`SavePkl.load(path)`) without first building a
        throwaway instance, and still works when called on an instance.
        """
        # SECURITY: unpickling can execute arbitrary code; only load files you trust.
        with open(fname, 'rb') as f:
            return load_pickle(f)

In [None]:
#|export
def normalize(df, users_means=None):
    """Centre every rating on the mean rating of its user.

    Parameters
    ----------
    df : DataFrame with `userId` and `rating` columns. It is left untouched,
        so calling this twice on the same frame does not double-centre it.
    users_means : optional Series mapping userId -> mean rating. When omitted
        it is computed from `df` itself.

    Returns
    -------
    (normalized, users_means) : a copy of `df` whose `rating` column holds
        `rating - mean(user)`, and the per-user means that were applied.
        Users absent from a supplied `users_means` end up with NaN ratings.
    """
    if users_means is None: users_means = df.groupby('userId')['rating'].mean()
    centred = df['rating'] - df['userId'].map(users_means)
    # `assign` returns a new frame instead of writing into the caller's one.
    return df.assign(rating=centred), users_means


class CollabUserBased(SavePkl):
    """User-based collaborative filter on a dense, mean-centred user-item matrix.

    After `fit`, `self.A[u, m]` holds user `u`'s rating of item `m` minus that
    user's mean rating (0 where there is no rating), `self.seen[u, m]` tells
    whether a rating exists, and `self.means` holds the per-user means indexed
    by user code.
    """
    def __init__(self, device=None):
        # Fall back to the library default device when none is given.
        self.device = ifnone(device, default_device())

    def fit(self, dls):
        "Build the centred user-item matrix, the seen-mask and the user means from `dls`."
        ratings, means = normalize(dls.xs.join(dls.ys))
        idx = tensor(dls.xs.values).T                      # (2, n_ratings): user codes, item codes
        vals = tensor(ratings['rating'].values).squeeze()
        A = torch.sparse_coo_tensor(idx, vals, dtype=torch.float32).to_dense()
        # A centred rating can be exactly 0, so "rated" is tracked separately
        # instead of being inferred from `A != 0`.
        seen = torch.zeros(A.shape, dtype=torch.bool)
        seen[idx[0].long(), idx[1].long()] = True
        # One mean per row of A (0.0 for user codes with no ratings, including
        # code 0), so a positional lookup by user code is always valid.
        self.means = means.reindex(range(A.shape[0]), fill_value=0.0)
        self.A = to_device(A, self.device)
        self.seen = to_device(seen, self.device)

    def predict(self, xb, yb=None, loss=F.mse_loss):
        """Predict ratings for a batch of (user, item) code pairs.

        xb : integer tensor of shape (batch, 2) holding user and item codes.
        yb : optional targets; column 0 is compared with the predictions.
        loss : loss function applied to (predictions, targets).

        Returns the predictions on the original rating scale when `yb` is
        None. When `yb` is given, returns `(centred_ratings, loss)`; note that
        the first element is the mean-centred score, without the user mean
        added back (kept for backward compatibility).
        """
        u, m = xb.T
        sim = self.A[u] @ self.A.T                          # (batch, n_users) dot-product similarities
        # Similarity-weighted sum of everyone's centred rating for item m.
        # NOTE: the weights are signed, so the denominator can be close to zero.
        ratings = (sim * self.A[:, m].T).sum(dim=1) / sim.sum(dim=1)
        # Look the means up as a tensor on the same device as the scores, so
        # this also works when the batch lives on an accelerator.
        means = torch.as_tensor(self.means.values, device=ratings.device)[u]
        preds = ratings + means
        if yb is not None: return (ratings, loss(preds, yb[:,0]))
        return preds

    def recommend(self, user:int, topk=3, filter_seen=True):
        """Return `(scores, item_codes)` of the `topk` best-scoring items for `user`.

        With `filter_seen=True`, items the user has already rated are
        excluded. The returned indices are always item codes into the full
        item vocabulary, so they can be mapped straight back to item names.
        """
        scores = (self.A @ self.A[user]) @ self.A            # (n_items,)
        if filter_seen:
            # Models pickled before `seen` existed fall back to the non-zero test.
            seen = getattr(self, 'seen', None)
            seen_row = (self.A[user] != 0) if seen is None else seen[user]
            # Mask instead of slicing columns: slicing would renumber the items.
            scores = scores.masked_fill(seen_row, float('-inf'))
        return scores.topk(topk)

In [None]:
# Instantiate the user-based model; with no `device` argument it uses `default_device()`.
model = CollabUserBased()

In [None]:
# Fit on the training loaders, then persist the fitted model as a pickle.
model.fit(dls)
model.save(fname='./models/model.pkl')

In [None]:
# Sanity check on a single batch: because targets are passed, `predict`
# returns a tuple whose second element is the loss.
xb, yb = to_device(dls.one_batch())
model.predict(xb,yb)

(tensor([ 1.5100e-01,  1.1698e-01,  2.1150e-03, -9.1059e-02, -1.4822e-01,
          9.6155e-03,  3.2749e-01, -2.4297e-01, -1.1507e-01,  1.0507e-01,
          2.2940e-01,  2.6945e-01,  1.3217e-01,  3.3425e-01, -1.6082e-01,
         -1.7940e-02, -5.9592e-02,  1.5685e-01,  8.9849e-02,  6.7175e-01,
         -2.7132e-02, -7.7167e-02, -3.7284e-04,  9.8957e-02, -8.6543e-02,
          3.6333e-02,  9.7618e-02, -1.7878e-01,  4.6538e-01,  6.0477e-01,
          9.4387e-02,  2.4516e-01,  1.5680e-02,  3.0959e-01, -1.6205e-01,
         -8.7148e-02, -1.4799e-02,  7.0665e-02, -3.4731e-02, -2.0711e-01,
          3.7936e-04,  7.3856e-01,  1.4755e-01,  7.5081e-01,  6.0751e-03,
         -1.6065e-01,  6.7238e-01, -7.4030e-02, -2.3502e-02,  1.9748e-01,
          2.9628e-01,  9.5324e-03, -2.6456e-03,  1.4723e+00,  4.5638e-02,
          3.2118e-01,  3.2694e-02, -1.3012e-01,  3.4592e-01,  3.2639e-01,
          6.4173e-03, -4.8762e-02,  4.6511e-02,  1.8684e-01]),
 tensor(0.6619, dtype=torch.float64))

In [None]:
# Top-10 recommendations for the user in the second row of the batch,
# printed as movie titles, one per line.
_, recs = model.recommend(xb[1, 0], topk=10)
print(*dls.classes['title'].map_ids(recs), sep='\n')

Some Mother's Son (1996)
Price Above Rubies, A (1998)
Say Anything... (1989)
Seventh Heaven (Le Septième ciel) (1997)
Man Who Would Be King, The (1975)
Romeo and Juliet (1968)
Plan 9 from Outer Space (1958)
Roger & Me (1989)
Ghosts of Mississippi (1996)
Boys of St. Vincent, The (1993)


In [None]:
#|export
def train(model, dls, fname=None):
    """Fit `model` on `dls` and save it.

    When `fname` is None the model's own default save path is used, rather
    than forwarding None to `model.save`.
    """
    model.fit(dls)
    if fname is None: model.save()
    else: model.save(fname)

def pred(model, dls, fname=None):
    """Return the predictions of `model` over every batch of `dls`, concatenated.

    If `fname` is given, the model is first replaced by the one loaded from
    that path. Only the first element of each batch (the inputs) is passed to
    `predict`, so loaders whose batches also carry targets work too; with
    targets, `predict` would return `(ratings, loss)` tuples that cannot be
    concatenated.
    """
    if fname: model = model.load(fname)
    preds = [model.predict(to_device(b)[0]) for b in progress_bar(dls)]
    return torch.cat(preds)

def eval(model, dls, fname=None):
    """Return the mean loss of `model` over all samples in `dls`.

    If `fname` is given, the model is first replaced by the one loaded from
    that path. Every batch must be an `(inputs, targets)` pair. Per-batch
    losses are weighted by batch size, so a smaller final batch does not skew
    the result; this assumes `predict` returns a mean-reduced loss.

    Raises ValueError if a batch has no targets or if `dls` yields no batches.
    """
    if fname: model = model.load(fname)
    total, n = 0., 0
    for b in progress_bar(dls):
        b = to_device(b)
        if len(b) != 2:
            raise ValueError('eval needs batches of (inputs, targets)')
        xb, yb = b
        _, batch_loss = model.predict(xb, yb)
        total = total + batch_loss * len(xb)
        n += len(xb)
    if n == 0: raise ValueError('eval got an empty data loader')
    return total / n

## Test

In [None]:
# Build a test loader from the held-out ratings using the training loaders' `test_dl`.
test_dls = dls.test_dl(df_test,bs=1024)

In [None]:
# Loss of the fitted model on the test loader (`eval` here is the function
# defined in this notebook, which shadows the Python builtin).
loss = eval(model,test_dls)
loss

tensor(1.6232, dtype=torch.float64)

## Something smarter