In [1]:
# Source of Data <https://doi.org/10.1145/2827872>

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   488k      0  0:00:01  0:00:01 --:--:--  488k


In [2]:
import zipfile

# Unpack the downloaded archive into ./data; the context manager closes the
# archive once extraction has finished.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [3]:
# Load the two MovieLens tables that were extracted under data/ml-latest-small.
import pandas as pd

movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/ratings.csv',
    )
)

In [4]:
# Report (rows, columns) for both tables in a single print call.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [5]:
# Preview the first five rows of the movies table.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# Lookup table from movieId to title, built from the movies table.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users / movies that actually appear in the ratings table.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
# Size of the dense user x movie matrix the ratings would fill.
print("The full rating matrix will have:", n_users*n_items, 'elements.')

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.


In [8]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorisation over user and item embeddings.

    Every user index and every item index owns an ``n_factors``-dimensional
    embedding; the score of a (user, item) pair is the dot product of the two.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        n_factors: Dimension of each embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive values drawn uniformly between 0 and 0.05.
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: Integer tensor of shape ``(batch, 2)``; column 0 holds user
                indices and column 1 holds item indices.

        Returns:
            1-D tensor of length ``batch`` holding one dot-product score per row.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item pairs given as two separate arguments.

        ``forward`` expects one ``(batch, 2)`` tensor, so the two inputs are
        flattened and stacked into that layout before delegating to it.

        Args:
            user: User index or indices (int, sequence, or tensor).
            item: Item index or indices with the same number of elements.

        Returns:
            1-D tensor with one score per (user, item) pair.
        """
        # Build the index tensor on the same device as the embedding tables.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))

In [9]:
# Dataset / DataLoader utilities used to feed the ratings to PyTorch in mini-batches
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are remapped to contiguous 0-based
    indices in order of first appearance, so they can be used directly as
    embedding row indices.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            When omitted, the notebook-level ``ratings_df`` is used, so the
            original ``Loader()`` call keeps working.

    Attributes are documented inline below.
    """

    def __init__(self, ratings=None):
        # Only touch the global when no frame is passed in explicitly.
        source = ratings_df if ratings is None else ratings
        # Work on a copy so the caller's frame keeps its raw IDs.
        self.ratings = source.copy()

        # Distinct raw IDs, in order of first appearance.
        users = source.userId.unique()
        movies = source.movieId.unique()

        # raw ID -> contiguous index
        self.userid2idx = {raw: idx for idx, raw in enumerate(users)}
        self.movieid2idx = {raw: idx for idx, raw in enumerate(movies)}

        # contiguous index -> raw ID (inverse lookups)
        self.idx2userid = {idx: raw for raw, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw for raw, idx in self.movieid2idx.items()}

        # Replace the raw IDs in the copy with their contiguous indices.
        self.ratings['movieId'] = source.movieId.map(self.movieid2idx)
        self.ratings['userId'] = source.userId.map(self.userid2idx)

        # x: one row per rating, columns (user index, movie index); y: the rating.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(indices, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [10]:
# Training length and hardware detection.
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Dump the initial value of every trainable parameter.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is present.
model = model.cuda() if cuda else model

# Mean-squared error between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()

# The optimizer is created after the optional .cuda() call above.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0086, 0.0285, 0.0458,  ..., 0.0404, 0.0085, 0.0148],
        [0.0298, 0.0022, 0.0087,  ..., 0.0421, 0.0431, 0.0059],
        [0.0263, 0.0480, 0.0302,  ..., 0.0058, 0.0292, 0.0203],
        ...,
        [0.0441, 0.0133, 0.0104,  ..., 0.0086, 0.0004, 0.0327],
        [0.0208, 0.0373, 0.0409,  ..., 0.0456, 0.0038, 0.0182],
        [0.0179, 0.0235, 0.0210,  ..., 0.0161, 0.0125, 0.0090]])
item_factors.weight tensor([[0.0347, 0.0404, 0.0125,  ..., 0.0059, 0.0387, 0.0009],
        [0.0252, 0.0483, 0.0413,  ..., 0.0094, 0.0033, 0.0492],
        [0.0142, 0.0260, 0.0361,  ..., 0.0406, 0.0045, 0.0179],
        ...,
        [0.0091, 0.0451, 0.0016,  ..., 0.0278, 0.0366, 0.0152],
        [0.0396, 0.0400, 0.0285,  ..., 0.0319, 0.0131, 0.0136],
        [0.0418, 0.0037, 0.0103,  ..., 0.0209, 0.0369, 0.0276]])


In [11]:
# Train for num_epochs passes over the data and report the mean batch loss per epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA; the optimisation step
        # below must run on CPU-only machines as well.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model returns one score per row, matching y's shape, so no squeeze is needed.
        outputs = model(x)
        # Cast the targets to float32 before computing the loss.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.064060640819182
iter #1 Loss: 4.742170429169224
iter #2 Loss: 2.476766513991477
iter #3 Loss: 1.721715290383034
iter #4 Loss: 1.3458408728468843
iter #5 Loss: 1.1283088340069436
iter #6 Loss: 0.9913804695993511
iter #7 Loss: 0.9003277842314715
iter #8 Loss: 0.8373445263368829
iter #9 Loss: 0.7922144007637416
iter #10 Loss: 0.7596540358768502
iter #11 Loss: 0.7349660192725017
iter #12 Loss: 0.71645024993698
iter #13 Loss: 0.7016503918670156
iter #14 Loss: 0.690584895952704
iter #15 Loss: 0.6819333254837142
iter #16 Loss: 0.6750490952491155
iter #17 Loss: 0.6695502246091813
iter #18 Loss: 0.6659086892371855
iter #19 Loss: 0.662632593875609
iter #20 Loss: 0.6606847664653347
iter #21 Loss: 0.6591635885728797
iter #22 Loss: 0.6577462393998494
iter #23 Loss: 0.6564212696352586
iter #24 Loss: 0.6558655034467048
iter #25 Loss: 0.6549902021279795
iter #26 Loss: 0.6542389694658027
iter #27 Loss: 0.6536583265072198
iter #28 Loss: 0.6524677356411963
iter #29 Loss: 0.65117178751429

In [12]:
# After training, the embedding tables hold the tuned latent factors.
# Print each trainable parameter; keep the first one seen in `uw` and any
# later one in `iw` (each later parameter overwrites the previous `iw`).
c, uw, iw = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.1796,  1.1227,  0.6548,  ...,  1.4071,  1.4386,  1.1584],
        [ 0.8429,  1.7350,  0.4649,  ...,  0.5693,  1.0014,  0.3668],
        [ 1.2277,  0.4411,  0.5359,  ..., -1.3116,  1.5567,  1.7422],
        ...,
        [ 1.1708, -0.2491,  0.0292,  ...,  1.0930,  0.6351,  2.2741],
        [ 1.2489,  0.6957,  1.3932,  ...,  0.5477,  0.3521,  1.2646],
        [ 0.3596,  1.4526,  1.1564,  ...,  1.0053,  1.3149,  1.6894]],
       device='cuda:0')
item_factors.weight tensor([[0.3254, 0.7725, 0.6811,  ..., 0.6421, 0.4234, 0.1646],
        [0.3746, 0.6343, 0.2337,  ..., 0.2197, 0.4071, 0.2420],
        [0.2693, 0.8418, 0.4491,  ..., 0.3132, 0.2627, 0.5736],
        ...,
        [0.3205, 0.3623, 0.3181,  ..., 0.3466, 0.3530, 0.3332],
        [0.4086, 0.4069, 0.3946,  ..., 0.3996, 0.3796, 0.3803],
        [0.4326, 0.3972, 0.4028,  ..., 0.4129, 0.4320, 0.4194]],
       device='cuda:0')


In [13]:
# Pull the learned item embedding table off the device as a NumPy array.
item_weights = model.item_factors.weight
trained_movie_embeddings = item_weights.detach().cpu().numpy()

In [14]:
# Number of rows in the embedding array, i.e. one per indexed movie.
trained_movie_embeddings.shape[0]

9724

In [15]:
# Cluster the movies by their learned embedding vectors.
from sklearn.cluster import KMeans

# Ten clusters with a fixed random_state; fit() returns the estimator itself,
# so fitting as a separate statement leaves `kmeans` bound to the fitted model.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)



In [16]:
# Grouping movies into clusters.
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count the ratings per movie once, instead of filtering ratings_df again for
# every single movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts()

# Iterate over however many clusters the fitted model actually has.
for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    # Embedding row index -> original MovieLens movieId.
    movid = train_set.idx2movieid[movidx]
    # Label-based lookup; avoids deprecated positional indexing on a Series.
    rat_count = rating_counts.loc[movid]
    movs.append((movie_names[movid], rat_count))
  # Show the ten most-rated movies of the cluster.
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Ace Ventura: Pet Detective (1994)
	 Mask, The (1994)
	 Pirates of the Caribbean: The Curse of the Black Pearl (2003)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 GoldenEye (1995)
	 Heat (1995)
	 Outbreak (1995)
	 Austin Powers: International Man of Mystery (1997)
	 Happy Gilmore (1996)
	 Ace Ventura: When Nature Calls (1995)
Cluster #1
	 Pulp Fiction (1994)
	 Star Wars: Episode IV - A New Hope (1977)
	 Terminator 2: Judgment Day (1991)
	 Schindler's List (1993)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Usual Suspects, The (1995)
	 American Beauty (1999)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Star Wars: Episode VI - Return of the Jedi (1983)
	 Godfather, The (1972)
Cluster #2
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Fight Club (1999)
	 Toy Story (1995)
	 Seven (a.k.a. Se7en) (1995)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 L