In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable

In [2]:
# Importing the dataset
movies = pd.read_csv('ml-latest/movies.csv',encoding = 'latin-1')
users = pd.read_csv('ml-latest/tags.csv', encoding = 'latin-1')
ratings = pd.read_csv('ml-latest/ratings.csv', encoding = 'latin-1')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
users.userId.unique()

array([     1,     20,     49, ..., 270835, 270854, 270871])

In [23]:
ratings.userId.unique()

array([     1,      2,      3, ..., 270894, 270895, 270896])

In [None]:
### this is important - so there are SOME users that are in the ratings , but not in the users table
### so, not for all users and movies we have meta information


### for basic model we will go with the ratings only dataframe

In [14]:
ratings.head(30)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
5,1,1968,4.0,1425942148
6,1,2762,4.5,1425941300
7,1,2918,5.0,1425941593
8,1,2959,4.0,1425941601
9,1,4226,4.0,1425942228


In [6]:
# let us combine in one datastructure


In [7]:
print("min user id {},".format(min(users.userId)))
print("max user id {},".format(max(users.userId)))
print("total number of users {},".format(len(users.userId.unique())))

print("min movie id {},".format(min(movies.movieId)))
print("max movie id {},".format(max(movies.movieId)))
print("total number of movies {},".format(len(movies.movieId.unique())))

min user id 1,
max user id 270871,
total number of users 18052,
min movie id 1,
max movie id 176279,
total number of movies 45843,


In [56]:
## looking at the ratings table

In [55]:
print("min user id {},".format(min(ratings.userId)))
print("max user id {},".format(max(ratings.userId)))
print("total number of users {},".format(len(ratings.userId.unique())))

min user id 1,
max user id 270896,
total number of users 270896,


In [54]:
print("min movie id {},".format(min(ratings.movieId)))
print("max movie id {},".format(max(ratings.movieId)))
print("total number of movies {},".format(len(ratings.movieId.unique())))

min movie id 1,
max movie id 176275,
total number of movies 45115,


In [8]:
ratings.count()

userId       26024289
movieId      26024289
rating       26024289
timestamp    26024289
dtype: int64

In [9]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,26024290.0,26024290.0,26024290.0,26024290.0
mean,135037.1,15849.11,3.52809,1171258000.0
std,78176.2,31085.26,1.065443,205288900.0
min,1.0,1.0,0.5,789652000.0
25%,67164.0,1073.0,3.0,990754500.0
50%,135163.0,2583.0,3.5,1151716000.0
75%,202693.0,6503.0,4.0,1357578000.0
max,270896.0,176275.0,5.0,1501830000.0


In [24]:
userIdx_to_userId = {i: v for i, v in enumerate(ratings.userId.unique())}
userId_to_userIdx = {v: i for i, v in enumerate(ratings.userId.unique())}

In [25]:
movieIdx_to_movieId = {i: v for i, v in enumerate(ratings.movieId.unique())}
movieId_to_movieIdx = {v: i for i, v in enumerate(ratings.movieId.unique())}

In [34]:
print(movieIdx_to_movieId[14])

58559


In [29]:
# now let us create a list of tuples from ratings. 
# The tuple will contain (user_idx, movie_idx, rating)
def map_to_tuple(row):
    #print(row.userId)
    #print(row.movieId)
    # print('userId is {}'.format(row.userId))
    # print('movieId is {}'.format(row.movieId))
    # print(userId_to_userIdx[row.userId])
    # print(movieId_to_movieIdx[row.movieId])
    # print(userId_to_userIdx[row.userId],movieId_to_movieIdx[row.movieId], row.rating)
    return (userId_to_userIdx[row.userId],movieId_to_movieIdx[row.movieId], row.rating)

ratings["mapped"] = ratings.apply(map_to_tuple, axis=1)

In [30]:
ratings.head(15)

Unnamed: 0,userId,movieId,rating,timestamp,mapped
0,1,110,1.0,1425941529,"(0, 0, 1.0)"
1,1,147,4.5,1425942435,"(0, 1, 4.5)"
2,1,858,5.0,1425941523,"(0, 2, 5.0)"
3,1,1221,5.0,1425941546,"(0, 3, 5.0)"
4,1,1246,5.0,1425941556,"(0, 4, 5.0)"
5,1,1968,4.0,1425942148,"(0, 5, 4.0)"
6,1,2762,4.5,1425941300,"(0, 6, 4.5)"
7,1,2918,5.0,1425941593,"(0, 7, 5.0)"
8,1,2959,4.0,1425941601,"(0, 8, 4.0)"
9,1,4226,4.0,1425942228,"(0, 9, 4.0)"


In [31]:
# save the processed dataframe
ratings.to_csv("ratings_mapped.csv")

In [35]:
## now we have should split in test and train set
## first we have to create a list from "mapped" column and than do a split
## the question for future if I should co mini-batches 
## (since GPU seems to be not very efficient in case of a minibatch of a size one / I have to get back to this)

mapped_list = list(ratings.mapped)

In [36]:
import sys
sys.getsizeof(mapped_list)

234218712

In [37]:
from sklearn.cross_validation import train_test_split
train, test = train_test_split(mapped_list, test_size=0.2)



In [39]:
len(train)

20819431

In [46]:
train_users = set([p[0] for p in train])
train_items = set([p[1] for p in train])

In [59]:
test_users = set([p[0] for p in test])
test_items = set([p[1] for p in test])

In [60]:
n_users_train = len(train_users)
n_items_train = len(train_items)
print(n_users_train)
print(n_items_train)

269685
43276


In [61]:
n_users_test = len(test_users)
n_items_test = len(test_items)
print(n_users_test)
print(n_items_test)

252996
31759


In [64]:
## we want to test only on those users that are present in train
test_users_cleaned = train_users.intersection(test_users)
print(len(test_users))

252996


In [74]:
full_users = set([p[0] for p in mapped_list])
full_items = set([p[1] for p in mapped_list])

In [75]:
n_users_full = len(full_users)
n_items_full = len(full_items)

In [76]:
print(n_users_full, n_items_full)

270896 45115


In [77]:
#### Define a baby model
class BabyMatrixFactorization(torch.nn.Module):
    
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, 
                                               n_factors,
                                               sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, 
                                               n_factors,
                                               sparse=True)
        
    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [78]:
model = BabyMatrixFactorization(n_users_full, n_items_full, n_factors=20)

In [79]:
loss_func = torch.nn.MSELoss()

In [80]:
optimizer = torch.optim.SGD(model.parameters(), 
                            lr=1e-6) # learning rate

In [82]:
from tqdm import tqdm

In [81]:
## i have to shuffle the dataset
from random import shuffle

## now we want to converge
for user, item, rating in tqdm(train):
    # Turn data into variables
    rating_tensor = Variable(torch.FloatTensor([rating]))
    user_tensor = Variable(torch.LongTensor([user]))
    item_tensor = Variable(torch.LongTensor([item]))
    
    # Predict and calculate loss
    prediction = model(user_tensor, item_tensor)
    loss = loss_func(prediction, rating_tensor)
    
    # Backpropagate
    loss.backward()
    print(loss.item())
    
    # Update the parameters
    optimizer.step()

KeyboardInterrupt: 