# Collaborative Filtering
Learning about recommendation systems. See Chapter 8 of the fastai book.

In [54]:

from fastai.collab import*
from fastai.tabular.all import*

In [55]:
# Get the local path of the MovieLens 100k dataset through fastai's `untar_data`.
path = untar_data(URLs.ML_100k)
# List the files found under `path` (shown as the cell output).
path.ls()

(#23) [Path('/home/rotakagui/.fastai/data/ml-100k/u.user'),Path('/home/rotakagui/.fastai/data/ml-100k/u4.test'),Path('/home/rotakagui/.fastai/data/ml-100k/u.genre'),Path('/home/rotakagui/.fastai/data/ml-100k/ub.base'),Path('/home/rotakagui/.fastai/data/ml-100k/ua.base'),Path('/home/rotakagui/.fastai/data/ml-100k/ub.test'),Path('/home/rotakagui/.fastai/data/ml-100k/u.data'),Path('/home/rotakagui/.fastai/data/ml-100k/u4.base'),Path('/home/rotakagui/.fastai/data/ml-100k/u.info'),Path('/home/rotakagui/.fastai/data/ml-100k/u.occupation'),Path('/home/rotakagui/.fastai/data/ml-100k/u2.base'),Path('/home/rotakagui/.fastai/data/ml-100k/u2.test'),Path('/home/rotakagui/.fastai/data/ml-100k/u5.base'),Path('/home/rotakagui/.fastai/data/ml-100k/ua.test'),Path('/home/rotakagui/.fastai/data/ml-100k/u3.base'),Path('/home/rotakagui/.fastai/data/ml-100k/mku.sh'),Path('/home/rotakagui/.fastai/data/ml-100k/u3.test'),Path('/home/rotakagui/.fastai/data/ml-100k/u1.test'),Path('/home/rotakagui/.fastai/data/ml-

## Context of the problem
We would like to create a recommendation system for movies.

In [56]:
import pandas as pd
# `u.data` is tab-separated and has no header row, so the column names are supplied here.
ratings = pd.read_csv(
    path/'u.data',
    sep='\t',
    header=None,
    names=['user', 'movie', 'rating', 'timestamp'],
)
ratings.head()


Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [57]:
# Load the movie-id -> title lookup table.
# `u.item` is pipe-separated and latin-1 encoded; only its first two columns are kept.
movies = pd.read_csv(
    path/'u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    usecols=[0, 1],
    names=['movie', 'title'],
)
movies.head()


Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [58]:
# Attach each movie's title to its ratings. No `on=` is given, so pandas joins on the
# column names the two frames share: `movie`, given the `names=` lists used when loading them.
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [59]:
# CollabDataLoaders.from_df builds the DataLoaders directly from the ratings DataFrame.
# - user_name / item_name / rating_name: which columns hold the users, the items and the ratings.
#   All three are named explicitly so the result does not depend on the column order of `ratings`.
# - seed: fixes the random train/validation split, so re-running the notebook validates on the same rows.
# - bs: batch size.

dls = CollabDataLoaders.from_df(
    ratings,
    user_name='user',
    item_name='title',
    rating_name='rating',
    seed=42,
    bs=64,
)
dls.show_batch()

Unnamed: 0,user,title,rating
0,868,Toy Story (1995),4
1,290,Pretty Woman (1990),3
2,481,"Fish Called Wanda, A (1988)",5
3,253,Some Like It Hot (1959),5
4,313,101 Dalmatians (1996),4
5,344,Vertigo (1958),4
6,430,"English Patient, The (1996)",4
7,733,L.A. Confidential (1997),4
8,39,Contact (1997),4
9,75,Mission: Impossible (1996),4


In [60]:
# n_factors: the length of the latent-factor vector that represents each user and each movie.
# Before any training these vectors are just random numbers (torch.randn), one row per user / per movie.

n_users = len(dls.classes['user'])    # size of the 'user' vocabulary held by the DataLoaders
n_movies = len(dls.classes['title'])  # size of the 'title' vocabulary held by the DataLoaders
n_factors = 5

user_factors = torch.randn(n_users,n_factors)
movies_factors = torch.randn(n_movies,n_factors)

In [61]:
# Why use one-hot-encoded vectors? See p. 259 of the book.
# Multiplying the transposed factor matrix by a one-hot vector for index 3
# picks out the factors stored at index 3 of `user_factors`.
one_hot_3 = one_hot(3, n_users).float()
user_factors.t() @ one_hot_3

tensor([ 0.7221,  0.1318,  0.1138, -1.2589, -0.0203])

In [62]:
# Direct lookup of row 3: this should equal the result of the one-hot matrix product above.
user_factors[3]


tensor([ 0.7221,  0.1318,  0.1138, -1.2589, -0.0203])

# Collaborative Filtering from Scratch

In [63]:
# `Embedding` is not defined in this notebook; it comes from the fastai star imports. According to Perplexity:
# - embeddings are good for handling categorical variables | fastai implements them with PyTorch's nn.Embedding module | so `Embedding(n, n_factors)` presumably creates one vector of length n_factors for each of the n items (users or movies here)

class DotProduct(Module):
    """Dot-product collaborative-filtering model with per-user and per-movie bias.

    The predicted rating for a (user, movie) pair is the dot product of the two
    embedding vectors plus both bias terms, passed through `sigmoid_range` with
    the bounds in `y_range`.
    """

    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        """Create the embedding tables.

        n_users / n_movies: number of rows in the user / movie tables.
        n_factors: length of each user and movie embedding vector.
        y_range: (low, high) bounds handed to `sigmoid_range` in `forward`.
        """
        # Latent factors: one learnable vector of length n_factors per user and per movie.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        # Bias terms: one scalar per user and per movie, to capture simple effects
        # such as a high-quality movie being rated above a low-quality one by most users.
        self.user_bias = Embedding(n_users, 1)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        """Predict ratings for a batch.

        x: 2-D tensor whose column 0 indexes the user tables and whose column 1
           indexes the movie tables.
        Returns one prediction per row; the summed dimension is kept (keepdim=True).
        """
        user_idx, movie_idx = x[:,0], x[:,1]
        # Element-wise product summed over the factor dimension = dot product per row.
        interaction = (self.user_factors(user_idx) * self.movie_factors(movie_idx)).sum(dim=1, keepdim=True)
        bias = self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(interaction + bias, *self.y_range)

In [64]:
# What does `dls.one_batch()` return? Two tensors:
#   - x (input): the user and item IDs for the current batch
#   - y (target): the rating associated with each user-item pair in x

x,y = dls.one_batch()
x.shape


torch.Size([64, 2])

In [65]:
# Train a DotProduct model with 50 latent factors per user/movie (the notebook-level
# `n_factors = 5` is not used here): 5 epochs of the one-cycle schedule,
# maximum learning rate 5e-3, mean-squared-error loss.
model = DotProduct(n_users, n_movies, n_factors=50)
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(n_epoch=5, lr_max=5e-3)

epoch,train_loss,valid_loss,time
0,0.880636,0.953936,00:03
1,0.560887,0.917384,00:03
2,0.412343,0.938249,00:03
3,0.32913,0.952104,00:03
4,0.299526,0.951951,00:03
