In [None]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   585k      0  0:00:01  0:00:01 --:--:--  585k


In [None]:
import zipfile
# Unpack the downloaded archive into the local 'data' directory.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [None]:
# Load the two CSV files that were extracted under data/ml-latest-small/.
import pandas as pd
# movies_df: one row per movie (movieId, title, genres -- see the preview cell below).
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
# ratings_df: one row per rating (userId, movieId, rating, timestamp).
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

In [None]:
# Report (rows, columns) for both frames in a single print call.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [None]:
# Preview the first rows of movies_df (columns: movieId, title, genres).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first rows of ratings_df (columns: userId, movieId, rating, timestamp).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Lookup table: movieId -> title.
titles_by_id = movies_df.set_index('movieId')['title']
movie_names = titles_by_id.to_dict()

# Distinct users / movies that actually appear in the ratings table.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

# Size of the dense user x movie matrix versus the number of observed ratings.
n_ratings = len(ratings_df)
n_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", n_ratings)
print("Therefore: ", n_ratings / n_cells * 100, '% of the matrix is filled.')

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorisation model: a score is the dot product of a user vector and an item vector.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of every latent vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the initial weights with small positive values drawn from U(0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the looked-up user and item vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as separate arguments.

        ``forward`` takes a single ``(N, 2)`` tensor, so the two arguments are
        packed into that layout here (the previous implementation passed them
        straight through and always raised ``TypeError``).

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices (int, sequence or tensor). ``user``
                and ``item`` are broadcast against each other, so a single
                user can be scored against many items.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device)
        items = torch.as_tensor(item, dtype=torch.long, device=device)
        users, items = torch.broadcast_tensors(users, items)
        pairs = torch.stack((users.reshape(-1), items.reshape(-1)), dim=1)
        return self.forward(pairs)

In [None]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous 0-based
    indices (in order of first appearance) so they can be used as embedding
    row numbers.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used.

    Attributes set on the instance:
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        ratings: copy of the input frame with both id columns re-indexed.
        x: integer tensor of shape (n_ratings, 2), columns = [user, movie].
        y: tensor of the ``rating`` column values.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: the frame loaded earlier in the notebook.
            ratings = ratings_df
        self.ratings = ratings.copy()

        # Extract all user IDs and movie IDs (order of first appearance).
        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # Raw id -> contiguous index
        self.userid2idx = {raw: idx for idx, raw in enumerate(users)}
        self.movieid2idx = {raw: idx for idx, raw in enumerate(movies)}

        # Contiguous index -> raw id (inverse lookups)
        self.idx2userid = {idx: raw for raw, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw for raw, idx in self.movieid2idx.items()}

        # Replace the raw ids in the copy with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns explicitly so that column 0 is always the
        # user and column 1 the movie, regardless of the frame's column order
        # or any extra columns it may carry.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Number of rating rows."""
        return len(self.ratings)

In [None]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the model with 8 latent factors and show its freshly initialised weights.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# Mean-squared-error objective.
loss_fn = torch.nn.MSELoss()

# Adam optimizer over the model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data: shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0113, 0.0490, 0.0059,  ..., 0.0466, 0.0288, 0.0363],
        [0.0429, 0.0443, 0.0231,  ..., 0.0466, 0.0053, 0.0321],
        [0.0369, 0.0491, 0.0149,  ..., 0.0498, 0.0132, 0.0177],
        ...,
        [0.0327, 0.0286, 0.0466,  ..., 0.0220, 0.0048, 0.0377],
        [0.0061, 0.0343, 0.0133,  ..., 0.0007, 0.0416, 0.0437],
        [0.0019, 0.0153, 0.0040,  ..., 0.0155, 0.0427, 0.0244]])
item_factors.weight tensor([[0.0396, 0.0033, 0.0470,  ..., 0.0472, 0.0312, 0.0138],
        [0.0406, 0.0140, 0.0185,  ..., 0.0437, 0.0317, 0.0488],
        [0.0365, 0.0248, 0.0419,  ..., 0.0010, 0.0312, 0.0467],
        ...,
        [0.0484, 0.0051, 0.0082,  ..., 0.0009, 0.0017, 0.0400],
        [0.0485, 0.0328, 0.0127,  ..., 0.0209, 0.0391, 0.0002],
        [0.0352, 0.0238, 0.0047,  ..., 0.0448, 0.0387, 0.0018]])


In [None]:
# Train for num_epochs passes over train_loader, printing the mean batch loss per epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below
        # must run on CPU as well; it used to be nested under this `if`, so a
        # CPU-only run trained nothing and divided by zero when averaging.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # model(x) is already 1-D (one score per row), so no squeeze is needed;
        # squeezing would collapse a batch of one to 0-d and mismatch the target.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.06142040436643
iter #1 Loss: 4.752550972294687
iter #2 Loss: 2.4803131901975815
iter #3 Loss: 1.7243283431844663
iter #4 Loss: 1.3479282990627481
iter #5 Loss: 1.1296374551989705
iter #6 Loss: 0.9924679120179966
iter #7 Loss: 0.9009102213806307
iter #8 Loss: 0.8376023857877944
iter #9 Loss: 0.7926196384097114
iter #10 Loss: 0.7596126019500838
iter #11 Loss: 0.7353814234815274
iter #12 Loss: 0.7161472966223199
iter #13 Loss: 0.7017507662400981
iter #14 Loss: 0.6909679986423042
iter #15 Loss: 0.682225996505488
iter #16 Loss: 0.6750951976264794
iter #17 Loss: 0.6696828761832968
iter #18 Loss: 0.6660284326615067
iter #19 Loss: 0.6628505627289036
iter #20 Loss: 0.6605681156749048
iter #21 Loss: 0.6593049400424594
iter #22 Loss: 0.6579624342585578
iter #23 Loss: 0.6569930058974905
iter #24 Loss: 0.6561820309628085
iter #25 Loss: 0.6553802055680207
iter #26 Loss: 0.6550440249709308
iter #27 Loss: 0.654397372591314
iter #28 Loss: 0.6532713818519854
iter #29 Loss: 0.65250250341

In [None]:
# After training, the model holds tuned latent factors for users and movies.
# Print every trainable parameter; keep the first one in `uw` and any later one in `iw`.
c = uw = iw = 0
trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
for name, param in trainable:
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.0626,  1.0279,  1.1392,  ...,  1.4879,  1.6676,  1.9111],
        [ 1.8682,  0.3223,  1.0806,  ...,  0.5388,  1.3861,  0.6749],
        [ 2.0456,  2.6139,  1.3974,  ...,  0.4882, -1.2330, -1.7219],
        ...,
        [ 2.3678,  0.9064,  1.3740,  ...,  0.4516,  0.0033,  1.7560],
        [ 1.3276,  0.6893,  0.7062,  ...,  1.5726,  1.2116,  1.2221],
        [ 1.2726,  1.4935,  1.2959,  ...,  0.4980,  0.7817,  1.6322]],
       device='cuda:0')
item_factors.weight tensor([[0.1713, 0.4374, 0.3920,  ..., 0.5359, 0.6083, 0.5074],
        [0.2015, 0.7019, 0.2399,  ..., 0.5348, 0.6458, 0.0737],
        [0.5729, 0.7095, 0.4786,  ..., 0.6023, 0.1331, 0.5590],
        ...,
        [0.3623, 0.3177, 0.3203,  ..., 0.2970, 0.3145, 0.3524],
        [0.4234, 0.4071, 0.3874,  ..., 0.4060, 0.4130, 0.3756],
        [0.4312, 0.4212, 0.4010,  ..., 0.4382, 0.4331, 0.3970]],
       device='cuda:0')


In [None]:
# Copy the learned movie embedding table off the device as a NumPy array.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [None]:
trained_movie_embeddings.shape[0]  # number of rows: one embedding vector per unique movie

9724

In [None]:
from sklearn.cluster import KMeans

# Cluster the movie embeddings into 10 groups. n_init is pinned explicitly because
# its default differs between scikit-learn releases; fixing it (with random_state)
# keeps the clustering reproducible and avoids the default-change warning.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(trained_movie_embeddings)

In [None]:
# For every cluster, list its 10 most-rated movies.
# Ratings per movie are counted once up front; filtering ratings_df inside the
# loop re-scanned the whole ratings table for every single movie.
rating_counts = ratings_df['movieId'].value_counts()

# Iterate over the number of clusters the model was fitted with rather than a
# second hard-coded copy of that number.
for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    # Embedding row index -> original MovieLens movieId.
    movid = train_set.idx2movieid[movidx]
    movs.append((movie_names[movid], rating_counts[movid]))
  # Most-rated first; sorted() is stable, so ties keep embedding-index order.
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Braveheart (1995)
	 True Lies (1994)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
	 Batman Forever (1995)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 Fifth Element, The (1997)
	 Natural Born Killers (1994)
	 Meet the Parents (2000)
	 Top Gun (1986)
	 Little Miss Sunshine (2006)
Cluster #1
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Jurassic Park (1993)
	 Terminator 2: Judgment Day (1991)
	 Toy Story (1995)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Apollo 13 (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Cluster #2
	 Home Alone (1990)
	 Net, The (1995)
	 Jumanji (1995)
	 Cliffhanger (1993)
	 Broken Arrow (1996)
	 Young Frankenstein (1974)
	 Honey, I Shrunk the Kids (1989)
	 Grease (1978)
	 Coneheads (1993)
	 Casper (1995)
Cluster #3
	 Lion King, The (1994)
	 Shrek (2001)
	 Dances with Wolves (1990)
	 Mrs. Doubtfire (1993)
	 Titanic (1997)
	 Pretty Wo