In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2020-02-16 15:29:46--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2020-02-16 15:29:46 (8.06 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [3]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [4]:
import pandas as pd
from IPython.display import display

download_dir = './ml-latest-small'
links = pd.read_csv(download_dir + '/links.csv')
display(links.head())
tag = pd.read_csv(download_dir + '/tags.csv')
display(tag.head())
ratings = pd.read_csv(download_dir + '/ratings.csv')
display(ratings.head())
movies = pd.read_csv(download_dir + '/movies.csv')
movies.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


For this dataset, the movies are already clustered into genres, and the time of ratings are given. Both are useful information, the former can give a direction to building the movie embeddings, the latter can tell us how the user profile has changed over years. Therefore, to take into account of these factors, we assume rating is a function of user embedding (numeric representation of user profile), time, movie embedding and genres. To build a baseline model, we try to incorporate these ideas into the architecture.

In [5]:
# check if any user didnt give a rating
ratings.rating.isnull().value_counts()

False    100836
Name: rating, dtype: int64

In [6]:
# left join genres onto ratings table
df = ratings.join(movies.set_index('movieId'), on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [7]:
# turn timestamp from seconds to year
change_to_year = lambda x: pd.Timestamp(x, unit='s').year
df.timestamp = df.timestamp.apply(change_to_year)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,2000,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,2000,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,2000,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,2000,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,2000,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [8]:
# turn genres into vector representation
genres_list = df.genres.tolist()
genres = []

for genre in genres_list:
  genres.extend(genre.split('|'))

genres = set(genres)
print('unique genres: {}'.format(len(genres)))

unique genres: 20


In [0]:
# to numerically encode features and create mapping for them
genres_map = {}
for i, genre in enumerate(genres):
  genres_map[genre] = i

year_map = {}
year_set = set(df.timestamp.tolist())
for i, year in enumerate(year_set):
  year_map[year] = i

userId_map = {}
userId_set = set(df.userId.tolist())
for i, userId in enumerate(userId_set):
  userId_map[userId] = i

movieId_map = {}
movieId_set = set(df.movieId.tolist())
for i, movieId in enumerate(movieId_set):
  movieId_map[movieId] = i

rating_map = {}
rating_set = set(df.rating.tolist())
for i, rating in enumerate(rating_set):
  rating_map[rating] = i

In [73]:
# before defining our model, check the tensor dimensions
input_d = len(df.userId.unique()) + len(df.movieId.unique()) + len(genres) + len(year_set)
hidden_layer1_d = 500
output_d = len(df.rating.unique())

print(input_d, output_d)

10377 10


In [0]:
# define model architecture in pytorch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
  def __init__(self):
    super(Net, self).__init__()
    self.fc1 = nn.Linear(input_d, hidden_layer1_d)
    self.fc2 = nn.Linear(hidden_layer1_d, output_d)

  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    # normally, this is where softmax used for normalization, 
    # however, in pytorch, CrossEntropyLoss will do for us 
    return x


net = Net()

In [0]:
# prepare train, validation, test sets
X = df.iloc[:, [0, 1, 3, 5]]
y = df.iloc[:, 2]

import numpy as np
from sklearn.model_selection import train_test_split
# 70% train, 21% validation, 9% test
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.3, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_validation, y_validation, test_size=0.3, random_state=41)

In [0]:
# define optimizer
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [0]:
import torch

# generate tensor for pytorch model's input and output for SGD
def batch_generator(X, y, batch_size):
  for i in range(X.shape[0]//batch_size):
    batch_input = X.iloc[i*batch_size: i*batch_size + batch_size]
    batch_output = y.iloc[i*batch_size: i*batch_size + batch_size]

    tensor_input = []
    tensor_output = []
    for _, input_ in batch_input.iterrows():
      userId_vector = np.zeros(len(userId_map))
      userId_vector[userId_map[input_.userId]] = 1.
                    
      movieId_vector = np.zeros(len(movieId_map))
      movieId_vector[movieId_map[input_.movieId]] = 1.

      year_vector = np.zeros(len(year_map))
      year_vector[year_map[input_.timestamp]] = 1.

      genre_vector = np.zeros(len(genres_map))
      for g in input_.genres.split('|'):
        genre_vector[genres_map[g]] = 1.
      
      concat_vector = np.concatenate((userId_vector, movieId_vector, year_vector, genre_vector))
      tensor_input.append(concat_vector)

    for _, output in batch_output.iteritems():
      tensor_output.append(rating_map[output])
    
    tensor_input = torch.FloatTensor(tensor_input)
    # type long (64bit integer) for CrossEntropyLoss implementation
    tensor_output = torch.tensor(tensor_output, dtype=torch.long)

    yield tensor_input, tensor_output

In [0]:
# loop over the dataset multiple times
batch_size = 700

for epoch in range(3):

  running_loss = 0.0
  batch_no = 0
  for inputs, labels in batch_generator(X_train, y_train, batch_size):

    # zero the parameter gradients
    optimizer.zero_grad()

    # forward + backward + optimize
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    # print statistics
    running_loss += loss.item()
    if batch_no % 10 == 9:
      print('[{}, {}] average_loss per batch: {}'.format(epoch+1, batch_no+1, running_loss/10))
      running_loss = 0.0
    
    batch_no += 1

print('Finished Training')

[1, 10] average_loss per batch: 2.3016562938690184
[1, 20] average_loss per batch: 2.2998040199279783
[1, 30] average_loss per batch: 2.297721195220947
[1, 40] average_loss per batch: 2.2959747314453125
[1, 50] average_loss per batch: 2.2952499389648438
[1, 60] average_loss per batch: 2.295028638839722
[1, 70] average_loss per batch: 2.292549896240234
