In [126]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
# Relative directory prefix that is prepended to the dataset CSV file names.
path = 'ml-latest-small/'

Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [127]:
# Load the ratings table (one row per user/movie rating) and preview it.
ratings = pd.read_csv(path+'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [128]:
# Number of rating rows in the table.
ratings.shape[0]

100004

In [129]:
# Preview the movie metadata table (movieId, title, genres).
pd.read_csv(path+'movies.csv').head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [130]:
# Lookup table from raw movieId to movie title.
movie_names = (
    pd.read_csv(path + 'movies.csv')
    .set_index('movieId')['title']
    .to_dict()
)

In [131]:
# Distinct raw ids, in order of first appearance in the ratings table.
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [132]:
# Map every raw id to its position in the corresponding unique-id array,
# giving contiguous indices starting at 0.
userid2idx = {raw_id: position for position, raw_id in enumerate(users)}
movieid2idx = {raw_id: position for position, raw_id in enumerate(movies)}

In [133]:
# Replace the raw ids in the ratings table with their contiguous indices.
# NOTE: the id columns are overwritten, so running this cell a second time
# would look up already-mapped indices in dictionaries keyed by raw ids.
ratings['movieId'] = ratings['movieId'].apply(lambda raw_id: movieid2idx[raw_id])
ratings['userId'] = ratings['userId'].apply(lambda raw_id: userid2idx[raw_id])

In [134]:
# Smallest and largest rating present in the data; EmbeddingNet.forward uses
# these to rescale its sigmoid output.
min_rating = ratings.rating.min()
max_rating = ratings.rating.max()
(min_rating, max_rating)

(0.5, 5.0)

In [135]:
# Number of distinct users and movies; these size the two embedding tables.
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
(n_users, n_movies)

(671, 9066)

In [136]:
# Width of each user / movie embedding vector (read by EmbeddingNet).
n_factors = 50

In [137]:
def get_emb(ni,nf):
    """Create an ``nn.Embedding`` with ``ni`` rows of width ``nf``.

    The layer's default weights are overwritten in place with small values
    drawn uniformly between -0.01 and 0.01.

    Returns:
        The initialised ``nn.Embedding`` module.
    """
    init_bound = 0.01
    table = nn.Embedding(ni, nf)
    table.weight.data.uniform_(-init_bound, init_bound)
    return table

In [138]:
class EmbeddingNet(nn.Module):
    """Small neural collaborative-filtering model.

    One embedding vector is looked up per user and per movie; the two are
    concatenated, passed through a hidden linear layer (ReLU + dropout) and a
    single-unit output layer, and the sigmoid of that output is rescaled to
    the rating range.

    The embedding width comes from the notebook-level ``n_factors``; the
    rescaling in ``forward`` reads the notebook-level ``min_rating`` and
    ``max_rating`` each time it is called.

    Args:
        n_users: number of rows in the user embedding table.
        n_movies: number of rows in the movie embedding table.
        nh: width of the hidden linear layer.
        p1: dropout probability applied to the concatenated embeddings.
        p2: dropout probability applied after the hidden layer.
    """
    def __init__(self, n_users, n_movies, nh=10, p1=0.05, p2=0.5):
        super().__init__()
        # User table first, then movie table (same creation order as before,
        # so seeded initialisation is unchanged).
        self.u = get_emb(n_users, n_factors)
        self.m = get_emb(n_movies, n_factors)
        self.lin1 = nn.Linear(n_factors*2, nh)
        self.lin2 = nn.Linear(nh, 1)
        self.drop1 = nn.Dropout(p1)
        self.drop2 = nn.Dropout(p2)

    def forward(self, users, movies):
        """Predict a rating for each (user, movie) index pair.

        Args:
            users: integer tensor of user indices.
            movies: integer tensor of movie indices, same shape as ``users``.

        Returns:
            Tensor shaped like the inputs with a trailing dimension of size 1,
            with values in the open interval
            (min_rating - 0.5, max_rating + 0.5).
        """
        # Concatenate along the last (embedding) axis. For the (1, batch)
        # inputs used in this notebook this equals dim=2, and it additionally
        # accepts plain 1-D batches of indices.
        x = self.drop1(torch.cat([self.u(users), self.m(movies)], dim=-1))
        x = self.drop2(F.relu(self.lin1(x)))
        # torch.sigmoid replaces the deprecated F.sigmoid. The range is widened
        # by half a point on each side so the extreme ratings stay reachable.
        return torch.sigmoid(self.lin2(x)) * (max_rating-min_rating+1) + min_rating-0.5

In [139]:
# Weight-decay coefficient handed to the optimizer.
wd = 1e-5
model = EmbeddingNet(n_users, n_movies)
# Adam over all model parameters with learning rate 1e-3.
opt = optim.Adam(model.parameters(), lr=1e-3, weight_decay=wd)

In [140]:
from torch.utils.data import Dataset, DataLoader


In [141]:
class CFDataset(Dataset):
    """Dataset over three parallel sequences: user ids, movie ids and ratings.

    Item ``idx`` is the tuple ``(int(users[idx]), int(movies[idx]), ratings[idx])``;
    the dataset length is the length of ``ratings``.
    """
    def __init__(self, users, movies, ratings):
        self.users, self.movies, self.ratings = users, movies, ratings

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        # Ids are converted to plain Python ints; the rating is returned as stored.
        user = int(self.users[idx])
        movie = int(self.movies[idx])
        rating = self.ratings[idx]
        return user, movie, rating

In [142]:
# Wrap the index and rating columns (as NumPy arrays) in a dataset.
cf_dataset = CFDataset(
    users=ratings['userId'].values,
    movies=ratings['movieId'].values,
    ratings=ratings['rating'].values,
)

In [143]:
# Shuffled mini-batches of 64 samples, loaded with 4 worker processes.
dataloader = DataLoader(
    cf_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=4,
)


In [144]:
# One pass over the data loader, taking one optimizer step per mini-batch.
# The batch tensors use their own names so the notebook-level `users`,
# `movies` and `ratings` objects are not overwritten and earlier cells can
# still be re-run after training.
for i_batch, sample_batched in enumerate(dataloader):
    # Index tensors are reshaped to (1, batch) before being fed to the model.
    user_idx = Variable(sample_batched[0]).view(1,-1)
    movie_idx = Variable(sample_batched[1]).view(1,-1)
    # Cast targets to float32 and flatten them to (batch,).
    target = Variable(sample_batched[2].float()).view(-1)
    opt.zero_grad()
    # Flatten the predictions so they match the targets element-for-element;
    # otherwise mse_loss would see (1, batch, 1) vs (1, batch) and, on
    # PyTorch versions that broadcast, average over a (batch x batch) grid.
    outputs = model(user_idx, movie_idx).view(-1)
    loss = F.mse_loss(outputs, target)
    loss.backward()
    opt.step()

In [145]:
# Display the movie embedding weight matrix after the training pass.
model.m.weight 

Parameter containing:
-3.5361e-02  5.9013e-02 -3.4232e-02  ...  -3.2302e-02 -3.4754e-02 -3.4731e-02
 3.4802e-02 -3.6510e-02  2.2392e-02  ...   3.8491e-02  2.5802e-02  4.2828e-02
-1.0291e-02  6.3638e-03 -1.4508e-02  ...  -8.6190e-03 -4.3809e-03 -7.8954e-04
                ...                   ⋱                   ...                
 9.4708e-39  2.8954e-38 -7.8824e-38  ...   2.4492e-38  6.7143e-38 -3.0522e-38
-1.2531e-38  4.5348e-38  3.5669e-38  ...   4.4105e-38  5.4872e-38 -6.8133e-38
 1.8316e-39 -4.5915e-38 -6.2031e-38  ...  -6.0138e-38 -1.2675e-38 -4.3608e-38
[torch.FloatTensor of size 9066x50]