In [None]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1609k      0 --:--:-- --:--:-- --:--:-- 1610k


In [None]:
import zipfile

# Unpack every member of the downloaded archive into the local "data" directory.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='data')

In [None]:
import pandas as pd

# Read the movie catalogue and the user ratings from the extracted archive
# (movies first, then ratings).
movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/ratings.csv',
    )
)

In [None]:


# Report the (rows, columns) shape of both frames, one per line.
dimension_report = (
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)
print(*dimension_report)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [None]:
# Preview the first five rows of the movie catalogue.
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first five rows of the ratings table.
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Lookup table from movieId to title.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Count distinct users and movies that actually appear in the ratings table.
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
# Size of the dense user x movie rating matrix.
n_cells = n_users * n_items
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
# Density = observed ratings / all possible (user, movie) pairs, as a percentage.
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / n_cells * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix has n_users * n_items cells, so it grows with the product of the two
# counts (the earlier "n*2" wording understated this).
print("And... as you can imagine, as the number of users and products grow, the number of elements grows with their product (n_users * n_items)")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")


Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization over user and item embeddings.

    Each user and each item gets an ``n_factors``-dimensional embedding; the
    predicted rating for a (user, item) pair is the dot product of the two.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: dimensionality of every embedding vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding layers act as lookup tables: index -> factor vector.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights in [0, 0.05).
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor indexed as ``data[:, 0]`` (user indices) and
                ``data[:, 1]`` (item indices).

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Element-wise product summed over the factor axis == per-row dot product.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Predict ratings for user/item indices given as separate arguments.

        ``forward`` takes a single two-column tensor, so the indices are packed
        into that layout here before delegating to it.

        Args:
            user: user index or sequence/tensor of user indices.
            item: item index or sequence/tensor of item indices, same length
                as ``user``.

        Returns:
            1-D tensor of predicted ratings, one per (user, item) pair; scalar
            inputs yield a tensor of length 1.
        """
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))


In [None]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: by default the dataset reads the notebook-level ``ratings_df`` that is already in
# memory; pass a DataFrame explicitly to build the dataset from other data.
class Loader(Dataset):
    """Torch ``Dataset`` of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous indices
    (0..n-1, in order of first appearance) so they can index embedding tables.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            When omitted, the module-level ``ratings_df`` is used.

    Attributes are documented inline where they are assigned.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df
        # Work on a copy so the caller's frame is never modified.
        self.ratings = ratings.copy()

        # Distinct raw IDs, in order of first appearance.
        users = ratings.userId.unique()
        movies = ratings.movieId.unique()

        # raw ID -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # contiguous index -> raw ID (reverse lookup)
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw IDs in the copied frame with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Features are selected by name (user index first, movie index second) so the
        # layout does not depend on column order or on which extra columns exist.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows in the dataset."""
        return len(self.ratings)

In [None]:
num_epochs = 128  # number of full passes over the training data
cuda = torch.cuda.is_available()  # True when a CUDA GPU can be used

print("Is running on GPU:", cuda)

# Seed PyTorch's global RNG so that embedding initialisation and batch shuffling are
# repeatable from one "Restart & Run All" to the next.
torch.manual_seed(0)

model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised trainable parameters.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
# Move the model to the GPU before the optimizer captures its parameters.
if cuda:
    model = model.cuda()

# Mean squared error: rating prediction is treated as a regression problem, so the
# loss is the average squared difference between predicted and actual ratings.
loss_fn = torch.nn.MSELoss()

# Adam optimizer (adaptive per-parameter learning rates with momentum).
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples.
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle=True)



Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0371, 0.0147, 0.0260,  ..., 0.0092, 0.0235, 0.0116],
        [0.0206, 0.0311, 0.0472,  ..., 0.0280, 0.0041, 0.0246],
        [0.0353, 0.0470, 0.0456,  ..., 0.0264, 0.0194, 0.0345],
        ...,
        [0.0338, 0.0269, 0.0331,  ..., 0.0477, 0.0455, 0.0427],
        [0.0439, 0.0425, 0.0240,  ..., 0.0456, 0.0472, 0.0438],
        [0.0229, 0.0431, 0.0076,  ..., 0.0246, 0.0272, 0.0461]])
item_factors.weight tensor([[0.0200, 0.0313, 0.0149,  ..., 0.0445, 0.0496, 0.0107],
        [0.0081, 0.0446, 0.0262,  ..., 0.0302, 0.0181, 0.0341],
        [0.0103, 0.0492, 0.0016,  ..., 0.0495, 0.0108, 0.0342],
        ...,
        [0.0155, 0.0442, 0.0371,  ..., 0.0180, 0.0050, 0.0227],
        [0.0233, 0.0056, 0.0346,  ..., 0.0291, 0.0428, 0.0361],
        [0.0479, 0.0026, 0.0427,  ..., 0.0365, 0.0293, 0.0437]])


In [None]:
# Train for num_epochs passes over the data, reporting the mean batch loss per epoch.
for it in tqdm(range(num_epochs)):
    losses = []  # per-batch losses for this epoch
    for x, y in train_loader:
        # Only the device transfer depends on CUDA; the optimisation step below must run
        # on CPU as well (otherwise nothing trains and `losses` stays empty).
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()  # clear gradients left over from the previous batch
        outputs = model(x)  # forward pass: one prediction per row, already 1-D
        # No squeeze(): it would collapse a batch of one to a 0-d tensor and mismatch y.
        loss = loss_fn(outputs, y.float())
        losses.append(loss.item())
        loss.backward()  # backpropagate to get gradients w.r.t. the model parameters
        optimizer.step()  # apply the parameter update
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.06117087874921
iter #1 Loss: 4.74221169978834
iter #2 Loss: 2.4754281487259164
iter #3 Loss: 1.721752212283575
iter #4 Loss: 1.3458186462143351
iter #5 Loss: 1.1286316585117184
iter #6 Loss: 0.991340583577979
iter #7 Loss: 0.9002904997107947
iter #8 Loss: 0.8371035206287646
iter #9 Loss: 0.7921087224471387
iter #10 Loss: 0.759281872417116
iter #11 Loss: 0.7347838760254347
iter #12 Loss: 0.7160028906794369
iter #13 Loss: 0.7016340765704964
iter #14 Loss: 0.6908051801968347
iter #15 Loss: 0.6816917626839604
iter #16 Loss: 0.6751544955872038
iter #17 Loss: 0.6699497136972883
iter #18 Loss: 0.665809645605874
iter #19 Loss: 0.6629938700704405
iter #20 Loss: 0.6608284052631577
iter #21 Loss: 0.6588810834712183
iter #22 Loss: 0.6577847104916718
iter #23 Loss: 0.6569266470177525
iter #24 Loss: 0.6559341308958639
iter #25 Loss: 0.6554092317063192
iter #26 Loss: 0.6545724800319841
iter #27 Loss: 0.6537130082848713
iter #28 Loss: 0.6532881775785824
iter #29 Loss: 0.65205702779408

In [None]:
# Pull the learned item (movie) embedding matrix out of the model as a NumPy array:
# detach it from autograd, copy it to host memory if it lives on the GPU, then convert.
trained_movie_embeddings = model.item_factors.weight.detach().cpu().numpy()

NameError: name 'model' is not defined

In [None]:
trained_movie_embeddings.shape[0]  # rows in the embedding matrix = number of movie factor vectors

9724

In [None]:
from sklearn.cluster import KMeans

# Group the movies into 10 clusters using their learned embedding vectors as features.
movie_clusterer = KMeans(random_state=0, n_clusters=10)
kmeans = movie_clusterer.fit(trained_movie_embeddings)

In [None]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# Count the ratings per movie once up front instead of re-scanning ratings_df for
# every single movie inside the loop.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans.n_clusters):  # follows the fitted model, no duplicated constant
    print("Cluster #{}".format(cluster))
    movs = []
    # Embedding-row indices of the movies assigned to the current cluster.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Translate the contiguous index back to the original movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Show the 10 most-rated movies of the cluster, most ratings first.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 True Lies (1994)
	 Die Hard: With a Vengeance (1995)
	 Batman Forever (1995)
	 Clear and Present Danger (1994)
	 Firm, The (1993)
	 Armageddon (1998)
	 Shrek 2 (2004)
	 Back to the Future Part II (1989)
	 Ice Age (2002)
	 Top Gun (1986)
Cluster #1
	 Honey, I Shrunk the Kids (1989)
	 Johnny Mnemonic (1995)
	 Arachnophobia (1990)
	 Superman II (1980)
	 Terminator 3: Rise of the Machines (2003)
	 Godzilla (1998)
	 Beavis and Butt-Head Do America (1996)
	 Island of Dr. Moreau, The (1996)
	 Airheads (1994)
	 Superman III (1983)
Cluster #2
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Jurassic Park (1993)
	 Braveheart (1995)
	 Terminator 2: Judgment Day (1991)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Cluster #3
	 Pretty Woman (1990)
	 Cliffhanger (1993)
	 Nightmare Before C