In [1]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  2730k      0 --:--:-- --:--:-- --:--:-- 2729k


In [3]:
import zipfile
# Unpack every member of the downloaded archive into the local 'data' folder.
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [4]:
# import the dataset in pandas dataframes
import pandas as pd
# Ratings (one row per user/movie rating) and movie metadata, read from the extracted archive.
ratings_df = pd.read_csv(filepath_or_buffer='data/ml-latest-small/ratings.csv')
movies_df = pd.read_csv(filepath_or_buffer='data/ml-latest-small/movies.csv')

In [5]:
# Report (rows, columns) of both frames in a single print call.
shape_report = (
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)
print(*shape_report)


The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [6]:
# Take a look at movies_df
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Take a look at ratings_df
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [19]:
# Lookup table from movieId to the movie's title.
titles_by_id = movies_df.set_index('movieId')['title']
movie_names = titles_by_id.to_dict()

# Distinct users / movies that appear in the ratings table.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

# Size of the dense user x movie matrix versus the ratings actually observed.
n_ratings = len(ratings_df)
matrix_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", matrix_cells, 'elements.')
print()
print("Number of ratings:", n_ratings)
print("Therefore: ", n_ratings / matrix_cells * 100, '% of the matrix is filled.')


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.

Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.


In [9]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization model: a rating is the dot product of two embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding tables act as lookup tables: index -> latent factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative values drawn uniformly from [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one score per row: the element-wise product of the
            two embeddings summed over the factor dimension.
        """
        users, items = data[:, 0], data[:, 1]
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        The previous implementation forwarded both arguments to ``forward``,
        which only accepts a single ``data`` tensor, so every call raised
        ``TypeError``. The indices are now packed into the two-column layout
        ``forward`` expects.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices, same length as ``user``.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        # Build the index tensors on the device that holds the embeddings.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))

In [10]:

# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    """Dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous indices
    starting at 0 (in order of first appearance) so they can be used directly
    as embedding-table indices.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            which matches the original zero-argument behaviour.

    Attributes set by ``__init__``:
        ratings: copy of the input frame with ids replaced by indices.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: integer tensor with columns [user index, movie index].
        y: tensor of ratings, row-aligned with ``x``.
    """

    def __init__(self, ratings=None):
        # Fall back to the global frame so existing `Loader()` calls keep working.
        source = ratings_df if ratings is None else ratings
        self.ratings = source.copy()

        # Distinct raw ids, in order of first appearance.
        users = source['userId'].unique()
        movies = source['movieId'].unique()

        # Raw id -> contiguous index.
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw id (inverse mappings).
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw ids in the working copy with their contiguous indices.
        self.ratings['movieId'] = source['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = source['userId'].map(self.userid2idx)

        # Select the feature columns by name so the [user, movie] layout does
        # not depend on the column order of the input frame.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Number of rating rows in the dataset."""
        return len(self.ratings)

In [11]:
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Build the factorization model with 8 latent factors per user and per movie.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the initial values of every trainable tensor.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
model = model.cuda() if cuda else model

# Mean-squared-error objective.
loss_fn = torch.nn.MSELoss()

# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0079, 0.0438, 0.0468,  ..., 0.0420, 0.0203, 0.0403],
        [0.0021, 0.0345, 0.0315,  ..., 0.0225, 0.0071, 0.0419],
        [0.0126, 0.0450, 0.0470,  ..., 0.0235, 0.0153, 0.0032],
        ...,
        [0.0492, 0.0135, 0.0316,  ..., 0.0162, 0.0438, 0.0148],
        [0.0275, 0.0219, 0.0471,  ..., 0.0047, 0.0321, 0.0143],
        [0.0045, 0.0023, 0.0146,  ..., 0.0115, 0.0363, 0.0158]])
item_factors.weight tensor([[0.0068, 0.0316, 0.0390,  ..., 0.0329, 0.0122, 0.0270],
        [0.0152, 0.0020, 0.0287,  ..., 0.0089, 0.0399, 0.0073],
        [0.0230, 0.0358, 0.0177,  ..., 0.0468, 0.0195, 0.0418],
        ...,
        [0.0169, 0.0163, 0.0078,  ..., 0.0446, 0.0427, 0.0055],
        [0.0009, 0.0328, 0.0347,  ..., 0.0304, 0.0133, 0.0131],
        [0.0207, 0.0133, 0.0274,  ..., 0.0206, 0.0223, 0.0238]])


In [12]:

# Train the model: each epoch makes one full pass over the mini-batches and
# reports the mean of the per-batch losses.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step used
        # to sit inside this `if`, so on a CPU-only machine nothing was trained
        # and the average below divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        # The model already returns one score per row, so no squeeze is needed
        # (squeezing would turn a batch of size 1 into a 0-d tensor).
        outputs = model(x)
        # Ratings are cast to float32 to match the dtype of the model output.
        loss = loss_fn(outputs, y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.07149487945634
iter #1 Loss: 4.750512042021389
iter #2 Loss: 2.4744696110335704
iter #3 Loss: 1.719114196028201
iter #4 Loss: 1.344146840085233
iter #5 Loss: 1.1280099776795673
iter #6 Loss: 0.9908998334014476
iter #7 Loss: 0.899729831569691
iter #8 Loss: 0.8368278207058834
iter #9 Loss: 0.7920237999730909
iter #10 Loss: 0.7589400869033058
iter #11 Loss: 0.7345194089684994
iter #12 Loss: 0.7160703775090009
iter #13 Loss: 0.7013169712751045
iter #14 Loss: 0.6902428286027182
iter #15 Loss: 0.6815273666064147
iter #16 Loss: 0.6750170029510701
iter #17 Loss: 0.6699305339740013
iter #18 Loss: 0.6657630604535795
iter #19 Loss: 0.662897262211681
iter #20 Loss: 0.6607171286483706
iter #21 Loss: 0.6587227144217128
iter #22 Loss: 0.6574646734495453
iter #23 Loss: 0.656633270573495
iter #24 Loss: 0.6560810802097853
iter #25 Loss: 0.6549913567032306
iter #26 Loss: 0.6544280583558953
iter #27 Loss: 0.6534765183093584
iter #28 Loss: 0.6528373106103863
iter #29 Loss: 0.65161692135527

In [13]:
# Training has tuned the latent factors for users and movies; print them and
# keep the first trainable tensor in `uw` and the later one(s) in `iw`.
c = 0
uw = 0
iw = 0
trainable = [(n, p) for n, p in model.named_parameters() if p.requires_grad]
for position, (name, param) in enumerate(trainable):
    print(name, param.data)
    if position > 0:
        iw = param.data
        continue
    # First trainable tensor: store it and bump the counter once.
    uw = param.data
    c += 1

user_factors.weight tensor([[ 1.1519,  1.0616,  1.9129,  ...,  0.5455,  1.4869,  1.1634],
        [ 1.9143,  0.1113,  1.2723,  ...,  0.2296,  1.3327,  1.3353],
        [ 1.1614,  0.8308, -3.1670,  ...,  2.3842,  0.1839,  2.1631],
        ...,
        [ 0.6385,  0.6970,  2.3019,  ...,  1.8315,  0.4973,  1.8850],
        [ 1.0620,  0.8163,  0.7115,  ...,  0.5852,  1.6420,  0.6873],
        [ 1.1621,  0.6347,  1.0792,  ...,  0.5552,  0.7549,  0.8489]],
       device='cuda:0')
item_factors.weight tensor([[ 0.6553,  0.5762,  0.2425,  ...,  0.4405,  0.2760,  0.3028],
        [ 0.6423,  0.1915,  0.1754,  ...,  0.5887,  0.7023, -0.0915],
        [ 0.6808,  0.5005,  0.5436,  ...,  0.4864,  0.2408,  0.3093],
        ...,
        [ 0.3253,  0.3221,  0.3148,  ...,  0.3526,  0.3517,  0.3118],
        [ 0.3891,  0.4188,  0.4212,  ...,  0.4135,  0.4011,  0.3980],
        [ 0.3820,  0.3726,  0.3893,  ...,  0.3784,  0.3826,  0.3828]],
       device='cuda:0')


In [14]:

trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [15]:
len(trained_movie_embeddings) # unique movie factor weights

9724

In [16]:

from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)

In [17]:
# Author's observation on the output below: movies that land in the same
# cluster tend to have similar genres. The algorithm never sees movie names or
# genres; it only uses the numbers describing how users rated the movies.

# Count the ratings of every movie once, instead of filtering the whole
# ratings frame again for each movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Take the cluster count from the fitted model so this cell stays in sync with
# whatever n_clusters KMeans was built with.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # Embedding-row indices assigned to the current cluster
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the embedding-row index back to the original movieId
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Sort movies by rating count in descending order and print the top 10
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Aladdin (1992)
	 Dances with Wolves (1990)
	 Beauty and the Beast (1991)
	 Mrs. Doubtfire (1993)
	 Titanic (1997)
	 Babe (1995)
	 E.T. the Extra-Terrestrial (1982)
	 Home Alone (1990)
	 Ferris Bueller's Day Off (1986)
	 Clueless (1995)
Cluster #1
	 Jurassic Park (1993)
	 Toy Story (1995)
	 Independence Day (a.k.a. ID4) (1996)
	 Apollo 13 (1995)
	 Fugitive, The (1993)
	 Batman (1989)
	 True Lies (1994)
	 Speed (1994)
	 Shrek (2001)
	 Men in Black (a.k.a. MIB) (1997)
Cluster #2
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Star Wars: Episode IV - A New Hope (1977)
	 Braveheart (1995)
	 Schindler's List (1993)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Usual Suspects, The (1995)
	 Seven (a.k.a. Se7en) (1995)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Cluster #3
	 Ghost (1990)
	 Net, The (1995)
	 Jumanji (1995)
	 Sleepless in Seattle (1993)
	 Cliffhanger (1993)
	 Prestige, The 