In [1]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19.

# Download the MovieLens "latest-small" archive with curl; the -o flag saves it
# as ml-latest-small.zip in the current working directory.
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  2700k      0 --:--:-- --:--:-- --:--:-- 2698k


In [2]:
import zipfile
# Unpack the downloaded archive into ./data (the context manager closes the file).
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

# Importing the Dataset

In [3]:
import pandas as pd
# Read the two tables used in this notebook from the extracted archive.
ratings_df = pd.read_csv(filepath_or_buffer='data/ml-latest-small/ratings.csv')
movies_df = pd.read_csv(filepath_or_buffer='data/ml-latest-small/movies.csv')

In [4]:
# Report the (rows, columns) shape of each loaded frame.
print(
    'The dimensions of movies dataframe are:',
    movies_df.shape,
    '\nThe dimensions of ratings dataframe are:',
    ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [5]:
# Preview the first five rows of the movie metadata.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
# movieId -> title lookup, used later to label the clusters.
movie_names = dict(zip(movies_df['movieId'], movies_df['title']))

# Number of distinct users / movies that appear in the ratings table.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users * n_items, 'elements.')
print('----------')
print("Number of ratings:", ratings_df.shape[0])
# Share of the user x movie matrix that actually holds a rating.
print("Therefore: ", ratings_df.shape[0] / (n_users * n_items) * 100, '% of the matrix is filled.')

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.


In [8]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization recommender built from two embedding tables.

    A rating is predicted as the dot product of a user's latent-factor vector
    and an item's latent-factor vector.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of every latent-factor vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding layers act as lookup tables: index -> latent-factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small positive weights drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Predict ratings for a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the selected user and item vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings for parallel sequences of user and item indices.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices, same length as ``user``.

        Returns:
            1-D tensor with one predicted rating per (user, item) pair.
        """
        # forward() expects a single (N, 2) tensor, so pack the two index
        # vectors into that layout on the same device as the model weights.
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        item = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((user, item), dim=1))

In [9]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw user and movie IDs are remapped to contiguous 0-based indices (in
    order of first appearance) so they can be used as embedding row numbers.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. Defaults to the notebook-level ``ratings_df`` so that
            ``Loader()`` keeps working as before.

    Attributes set on the instance:
        userid2idx / movieid2idx: raw ID -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw ID.
        x: integer tensor of shape (n_ratings, 2) with columns
            (user index, movie index).
        y: tensor of the ratings, in the same row order as ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df
        # Work on a copy so the caller's frame keeps its raw IDs.
        self.ratings = ratings.copy()

        # All distinct raw IDs, in order of first appearance.
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # Raw ID -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw ID (inverse lookups).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw IDs with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the feature columns explicitly so column 0 is always the
        # user index and column 1 the movie index, whatever else is in the frame.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [10]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Seed PyTorch's global RNG before anything random happens, so the embedding
# initialisation and the DataLoader shuffle order repeat across
# "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(0)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised (untrained) embedding weights.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()

# Mean-squared error between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()

# Adam optimizer over both embedding tables.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings.
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0167, 0.0183, 0.0185,  ..., 0.0199, 0.0330, 0.0259],
        [0.0357, 0.0057, 0.0471,  ..., 0.0108, 0.0460, 0.0005],
        [0.0292, 0.0365, 0.0269,  ..., 0.0355, 0.0297, 0.0447],
        ...,
        [0.0286, 0.0129, 0.0100,  ..., 0.0436, 0.0021, 0.0193],
        [0.0232, 0.0233, 0.0427,  ..., 0.0258, 0.0019, 0.0477],
        [0.0239, 0.0467, 0.0011,  ..., 0.0018, 0.0015, 0.0496]])
item_factors.weight tensor([[0.0356, 0.0265, 0.0091,  ..., 0.0445, 0.0402, 0.0291],
        [0.0161, 0.0226, 0.0490,  ..., 0.0156, 0.0126, 0.0448],
        [0.0311, 0.0180, 0.0394,  ..., 0.0263, 0.0221, 0.0030],
        ...,
        [0.0278, 0.0132, 0.0469,  ..., 0.0061, 0.0394, 0.0240],
        [0.0036, 0.0331, 0.0458,  ..., 0.0496, 0.0156, 0.0191],
        [0.0021, 0.0350, 0.0344,  ..., 0.0497, 0.0093, 0.0102]])


In [11]:
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA; the optimisation step
        # below must run on every batch, on CPU as well as on GPU.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model already returns one prediction per row (1-D), which
        # matches y's shape; no squeeze is needed, and squeezing would turn
        # a batch of size 1 into a 0-d tensor.
        outputs = model(x)
        # Ratings are cast to float32 to match the model's output dtype.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    # Mean mini-batch loss for this epoch.
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.071763240141312
iter #1 Loss: 4.742762467582819
iter #2 Loss: 2.4723306062257833
iter #3 Loss: 1.7199356070327274
iter #4 Loss: 1.345199874663716
iter #5 Loss: 1.1280032859235851
iter #6 Loss: 0.9912991701497644
iter #7 Loss: 0.9000420444054047
iter #8 Loss: 0.8372136870314022
iter #9 Loss: 0.7923129628303692
iter #10 Loss: 0.7592413841846025
iter #11 Loss: 0.7345163448510437
iter #12 Loss: 0.7160386130365018
iter #13 Loss: 0.7013859867595779
iter #14 Loss: 0.6904164688841341
iter #15 Loss: 0.6819927851636398
iter #16 Loss: 0.6750073083085457
iter #17 Loss: 0.669682781519321
iter #18 Loss: 0.6657851479455904
iter #19 Loss: 0.6626385926519553
iter #20 Loss: 0.6607332745103667
iter #21 Loss: 0.6590502268166711
iter #22 Loss: 0.6575569079309551
iter #23 Loss: 0.6566143074465282
iter #24 Loss: 0.6557254129027957
iter #25 Loss: 0.6548196224845605
iter #26 Loss: 0.6544083246800501
iter #27 Loss: 0.6536516969276563
iter #28 Loss: 0.652725817928762
iter #29 Loss: 0.65129157401

# By training the model, we will have tuned latent factors for movies and users

In [12]:
# Print every trainable parameter and keep references to the weights:
# the first trainable parameter goes to `uw`, any later one to `iw`.
uw = 0
iw = 0
c = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw, c = param.data, c + 1

user_factors.weight tensor([[ 1.8479,  0.7190,  1.1881,  ...,  0.6606,  1.5585,  1.3565],
        [ 0.3976,  1.5184,  1.2329,  ...,  0.3739,  0.5278,  2.7043],
        [ 0.9600, -0.1225,  0.8301,  ...,  2.6517, -1.0555,  2.4988],
        ...,
        [ 1.0579,  1.0609,  0.7147,  ...,  2.3086,  0.7932,  1.2395],
        [ 0.9341,  0.8857,  1.0715,  ...,  0.8833,  0.4583,  1.4067],
        [ 0.9668,  0.8398,  1.4858,  ...,  1.3060,  0.0497,  1.2059]],
       device='cuda:0')
item_factors.weight tensor([[0.1776, 0.3185, 0.7659,  ..., 0.2610, 0.5322, 0.5235],
        [0.2610, 0.2681, 0.3928,  ..., 0.1706, 0.1839, 0.7440],
        [0.2056, 0.4800, 0.6447,  ..., 0.5440, 0.1010, 0.5730],
        ...,
        [0.3476, 0.3320, 0.3669,  ..., 0.3252, 0.3649, 0.3430],
        [0.3739, 0.4042, 0.4185,  ..., 0.4209, 0.4106, 0.3894],
        [0.3696, 0.4044, 0.4035,  ..., 0.4192, 0.3314, 0.3782]],
       device='cuda:0')


In [13]:
# Copy the learned movie factors off the device as a NumPy array (one row per movie index).
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

In [14]:
trained_movie_embeddings.shape[0]  # number of rows = unique movie factor vectors

9724

In [15]:
from sklearn.cluster import KMeans


# Fit the clusters based on the movie weights

In [16]:
# Group the movies into 10 clusters by their learned factor vectors (fixed seed).
kmeans = KMeans(random_state=0, n_clusters=10)
kmeans = kmeans.fit(trained_movie_embeddings)



In [17]:
# Count the ratings per movie once up front instead of rescanning the whole
# ratings frame for every movie. Keys are raw movieIds.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# For each cluster, list its ten most-rated movies.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Rows of the embedding matrix (contiguous movie indices) in this cluster.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the contiguous index back to the raw MovieLens movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Most-rated first; the sort is stable, so ties keep index order.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Titanic (1997)
	 Ghost (1990)
	 Jumanji (1995)
	 Top Gun (1986)
	 Pocahontas (1995)
	 Grease (1978)
	 Sense and Sensibility (1995)
	 Legally Blonde (2001)
	 Miss Congeniality (2000)
	 Casper (1995)
Cluster #1
	 Toy Story (1995)
	 Aladdin (1992)
	 True Lies (1994)
	 Pirates of the Caribbean: The Curse of the Black Pearl (2003)
	 Mrs. Doubtfire (1993)
	 Finding Nemo (2003)
	 Pretty Woman (1990)
	 Dumb & Dumber (Dumb and Dumber) (1994)
	 Incredibles, The (2004)
	 Home Alone (1990)
Cluster #2
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Braveheart (1995)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Seven (a.k.a. Se7en) (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
Cluster #3
	 Batman & Robin (1997)
	 Free Willy (1993)
	 Godzilla (1998)
	 Super Mario Br