In [1]:
# Data Citation
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context.
# NOTE(review): curl without -o/-O writes the response body to stdout, so this cell only
# prints whatever the URL returns; it does not look like it saves ml-latest-small.zip
# (which the unzip step below expects) - confirm how the archive is obtained.
! curl https://grouplens.org/datasets/movielens/latest

In [2]:
import os
# Show the kernel's working directory; relative paths resolve against it.
working_dir = os.getcwd()
print(working_dir)


/Users/abosede/PROJECTS/DATA SCIENCE/RECOMMENDER SYSTEMS/Customer-focused-movie-recommender-systems


In [3]:
# Unzip the MovieLens archive into ./data.
# Both paths are relative to the notebook's working directory, so the notebook
# runs on any machine where the archive sits next to it (no user-specific path).
import zipfile

ARCHIVE_PATH = 'ml-latest-small.zip'
EXTRACT_DIR = 'data'

with zipfile.ZipFile(ARCHIVE_PATH, 'r') as zip_ref:
    zip_ref.extractall(EXTRACT_DIR)

In [4]:
# Load the movies and ratings tables from the extracted archive.
# The directory is relative to the notebook's working directory (it is where the
# unzip step writes), so no machine-specific absolute path is needed.
import pandas as pd

DATA_DIR = 'data/ml-latest-small'

movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings_df = pd.read_csv(f'{DATA_DIR}/ratings.csv')

In [5]:
# Report the (rows, columns) shape of both dataframes in a single print call.
movies_shape = movies_df.shape
ratings_shape = ratings_df.shape
print(
    'The dimensions of the movies dataframe are:', movies_shape,
    '\nThe dimensions of ratings dataframe are:', ratings_shape,
)

The dimensions of the movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [6]:
# Preview the first five rows of movies_df (columns: movieId, title, genres).
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Preview the first five rows of ratings_df (columns: userId, movieId, rating, timestamp).
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
# Map each movieId to its title: index the frame by movieId, keep the title column,
# then convert the resulting Series into a plain {movieId: title} dictionary.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users / movies that actually appear in the ratings table.
# nunique() counts distinct non-null values directly instead of materialising
# the unique array just to take its length.
n_users = ratings_df['userId'].nunique()
n_items = ratings_df['movieId'].nunique()

# Same user count via attribute-style column access; kept as a cross-check.
n_users_1 = ratings_df.userId.nunique()

n_cells = n_users * n_items   # size of the dense user x movie rating matrix
n_ratings = len(ratings_df)   # number of observed (user, movie) ratings

print("Number of unique users:", n_users)
print("Number of unique movies", n_items)
print("Number of unique users (2):", n_users_1)

print("The full rating matrix will have:", n_cells, 'elements.')

print('----------')
print('Number of ratings:', n_ratings)
print("Therefore: ", n_ratings / n_cells * 100, '% of the matrix is filled.')

print('We have an incredibly sparse matrix to work with here')
# The dense matrix has n_users * n_items cells, so it grows with the product of
# the two counts (not "n*2" as the message previously claimed).
print('We can imagine as the number of users and products grow, the number of elements will increase as n_users * n_items')
print('We are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.')
print('One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don\'t need all the data')


Number of unique users: 610
Number of unique movies 9724
Number of unique users (2): 610
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here
We can imagine as the number of users and products grow, the number of elements will increase by n*2
We are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [None]:
# Install PyTorch into the environment of the running kernel (%pip, not !pip).
# NOTE(review): only `torch` is imported in the cells visible here; torchvision and
# torchaudio look unnecessary - confirm before trimming. Consider pinning versions.
%pip install torch torchvision torchaudio
# torch       -> the core PyTorch library
# torchvision -> image-related utilities (datasets, transforms)
# torchaudio  -> audio processing utilities


In [10]:
%pip install tqdm # This is for progress bars in

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [11]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm  #as tqdm
# tqdm is a Python library that adds progress bars to loops - super useful for tracking long-running tasks
# tqdm_notebook: a variant that displays the progress bar nicely inside a Jupyter Notebook interface

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    Every user and every item owns a learnable vector of length ``n_factors``.
    The score for a (user, item) pair is the dot product of the two vectors.

    Args:
        n_users: Number of distinct users, i.e. rows of the user table.
        n_items: Number of distinct items, i.e. rows of the item table.
        n_factors: Length of each embedding vector (dimensionality of the
            latent space). For example, 1000 users with ``n_factors = 20``
            gives a ``1000 x 20`` user table.
    """

    def __init__(self, n_users, n_items, n_factors = 20):
        super().__init__()  # set up the parent class, torch.nn.Module

        # Lookup tables: row i holds the latent vector of user / item index i.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)

        # Replace the default initialisation with small values drawn uniformly
        # between 0 and 0.05. The intent is to start all embeddings close to
        # zero so early training steps stay small and stable.
        torch.nn.init.uniform_(self.user_factors.weight, 0, 0.05)
        torch.nn.init.uniform_(self.item_factors.weight, 0, 0.05)

    def forward(self, data):
        """Score a batch of (user, item) index pairs.

        Args:
            data: Integer tensor indexed as ``data[:, 0]`` (user indices) and
                ``data[:, 1]`` (item indices).

        Returns:
            Tensor with one score per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]

        # Look up both embedding batches ([batch, n_factors] each), multiply
        # them element by element, then sum over the factor dimension: this is
        # the dot product of each user vector with its paired item vector.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item indices given as two separate arguments.

        ``forward`` expects a single two-column tensor, so the indices are
        packed into that layout first (passing them as two positional
        arguments to ``forward`` raises ``TypeError``).

        Args:
            user: User index or indices (int, sequence, or tensor).
            item: Item index or indices (int, sequence, or tensor). ``user``
                and ``item`` are broadcast against each other, so one user can
                be scored against many items and vice versa.

        Returns:
            1-D tensor of scores, one per (user, item) pair.
        """
        # Build the indices on the same device as the embedding weights.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        users, items = torch.broadcast_tensors(users, items)
        return self.forward(torch.stack((users, items), dim=1))
    
    

In [12]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # This is the package that helps transform the data to machine learning readiness

# Note: This is not good practice, in a MLOps sense but we'll roll with this since the data is already loaded in memory
class Loader(Dataset):
    """Torch ``Dataset`` over a ratings table.

    Raw ``userId`` / ``movieId`` values are re-indexed to contiguous integers
    starting at 0 (in order of first appearance) so they can be used directly
    as embedding row indices.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
            Defaults to the notebook-level ``ratings_df`` when omitted, which
            keeps ``Loader()`` working exactly as before.

    Attributes:
        ratings: Copy of the input frame with both id columns re-indexed.
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: Tensor of (user index, movie index) pairs, one row per rating.
        y: Tensor of rating values, aligned with ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            ratings = ratings_df  # fall back to the frame loaded earlier in the notebook
        self.ratings = ratings.copy()  # never mutate the caller's frame

        # Distinct raw ids, in order of first appearance.
        user_ids = ratings['userId'].unique()
        movie_ids = ratings['movieId'].unique()

        # Forward maps: raw id -> contiguous index.
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(user_ids)}
        self.movieid2idx = {raw_id: idx for idx, raw_id in enumerate(movie_ids)}

        # Reverse maps: contiguous index -> raw id.
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw_id for raw_id, idx in self.movieid2idx.items()}

        # Replace the raw ids in the copy with their contiguous indices.
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Features are selected by name so column 0 is always the user index and
        # column 1 the movie index, whatever the column order of the input frame.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values

        # Convert to tensors so the DataLoader can batch them.
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    # Helper methods used by DataLoader.

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair stored at ``index``.

        Enables ``loader[index]`` on an instance.
        """
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of ratings; enables ``len(loader)``."""
        return len(self.ratings)




In [13]:
num_epochs = 128  # number of full passes over the training data
cuda = torch.cuda.is_available()  # True when PyTorch can use a CUDA-capable GPU
# CUDA is NVIDIA's parallel computing platform; it lets PyTorch run operations on the GPU.

print("Is running on GPU:", cuda)


# Instantiate the model with 8 latent factors per user / item.
model = MatrixFactorization(n_users, n_items, n_factors = 8)

print(model)  # nn.Module's repr lists the registered sub-layers

# named_parameters() comes from nn.Module and yields (name, parameter) pairs.
# Here the model has two embedding layers (user_factors and item_factors); each
# exposes a `.weight` tensor, which is what gets printed below.
# requires_grad is True for learnable parameters: gradients are computed for
# them and the optimizer updates them during training.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Move the model to the GPU when one is available.
if cuda:
    model = model.cuda()


# Mean squared error: the average squared gap between predicted and actual
# ratings. Training tries to minimise it.
loss_fn = torch.nn.MSELoss()

# Adam (Adaptive Moment Estimation) adapts the step size per parameter from
# running estimates of the first and second moments of the gradients.
# model.parameters() tells it which weights to update (the two embedding tables);
# lr=1e-3 is the learning rate, i.e. how large each update step is.
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)


# Training data: batches of 128 ratings, reshuffled on every pass.
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle = True)

# What DataLoader does with train_set:
#   - calls __len__() to learn how many samples exist (and so how many batches),
#   - shuffles the sample indices when shuffle=True,
#   - calls __getitem__() to fetch samples and groups them 128 at a time.
# Iterating over train_loader therefore yields the batches used in training.
# These notes are written as comments rather than a bare triple-quoted string:
# a string left as the last expression of a cell is echoed as the cell's output.






Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[2.0839e-02, 4.1836e-02, 3.3281e-02,  ..., 4.2128e-04, 1.2003e-02,
         2.9348e-02],
        [1.8641e-02, 1.3165e-02, 1.2323e-02,  ..., 4.7656e-02, 4.1534e-03,
         4.6812e-03],
        [1.8489e-02, 8.4038e-03, 3.6098e-02,  ..., 3.5817e-02, 4.5381e-02,
         1.9939e-02],
        ...,
        [4.9437e-02, 3.6056e-02, 2.1113e-02,  ..., 9.9880e-05, 2.5980e-04,
         4.7079e-02],
        [3.8748e-02, 4.1664e-02, 3.4890e-03,  ..., 2.0833e-02, 2.7234e-02,
         4.6142e-02],
        [4.8907e-02, 1.9885e-02, 5.5604e-03,  ..., 2.9974e-02, 2.8602e-03,
         3.9468e-02]])
item_factors.weight tensor([[0.0103, 0.0019, 0.0363,  ..., 0.0275, 0.0189, 0.0368],
        [0.0316, 0.0391, 0.0291,  ..., 0.0018, 0.0110, 0.0106],
        [0.0116, 0.0109, 0.0328,  ..., 0.0350, 0.0366, 0.0275],
        ...,
        [0.0378, 0.0313, 0.0192,  ...,

'\n    Calls __len__() to know how many samples are in train_set.\n    Randomly shuffles the indices (if shuffle=True).\n    Uses __getitem__() to fetch 128 samples at a time (your batch size).\n\n    1. __len__(self)\n        Returns the total number of samples in your dataset.\n\n        Used to determine how many batches to create.\n\n    2. __getitem__(self, index)\n        Returns a single data sample (and optionally its label) at the given index.\n\n        This is what gets batched together during training.\n'

In [None]:
%pip install ipywidgets




# Since I don't have a GPU now, I will be using my device instead!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Then to actually train the model
for it in tqdm(range(num_epochs)): # This tqdm here is to show a form of percentage of completion
    losses = []
    for x, y in train_loader:
        if not cuda: # Since we do not have the GPU available

            #x, y = x.cuda(), y.cuda()
            # Since we do not have a GPU
            x, y = x.to(device), y.to(device)

            # This clears the old gradients from the previous step!
            # Without this, gradients would accumulate across batches, which would mess up training
            optimizer.zero_grad() 


            ''' 
                We're implicitly calling the forward() method of your MatrixFactorization class.
                This is a PyTorch convention — when we call a model like a function, PyTorch automatically routes that call to the model’s forward() method.
            '''
            ''' 
                The __call__() method of nn.Module is overridden to internally call forward() — so when you do model(x), it’s really doing:
                def __call__(self, *input, **kwargs):
                    return self.forward(*input, **kwargs)
            '''
            outputs = model(x)
           
            ''' 
            Compares the model’s predictions (outputs) to the true labels (y) using Mean Squared Error.
            .squeeze() -> removes extra dimensions from outputs if needed.
            y.type(torch.float32) ->  ensures the target is in the correct format for MSELoss.
            '''
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))

            # Adds the scalar loss value to a list so you can track average loss for the epoch.
            losses.append(loss.item())

            # Backward pass
            # Computes gradients of the loss with respect to model parameters.
            # This is the core of backpropagation.
            '''  
                PyTorch traces the computation graph from the loss all the way back to the model’s parameters.
                It calculates the gradients — how much each parameter contributed to the error.
                These gradients are stored in each parameter’s .grad attribute.
                Without loss.backward(), your model has no idea how to improve
            '''
            loss.backward()


            ''' 
                Uses the gradients to update the model’s parameters.
                In your case, it adjusts the user and item embeddings to better predict ratings.
                This uses the gradients to update the weights.
                So the next time you run a forward pass, the model should (hopefully) make better predictions.
            '''
            optimizer.step()
    ''' 
    After all batches in the epoch are processed, it prints the average loss.
    This gives you a sense of how well the model is learning over time.
    '''      
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses)) # "it" here is the current epoch number
    

In [15]:
# After training, the model holds tuned latent factors for users and movies.
# Walk the trainable parameters in registration order, print each one, and keep
# the first tensor in `uw` (user weights) and any later one in `iw` (item weights).
c = 0   # flips to 1 once the first trainable parameter has been stored
uw = 0  # user weights
iw = 0  # item weights

for name, param in model.named_parameters():
    if not param.requires_grad:  # skip parameters that are not trained
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1
            


user_factors.weight tensor([[ 1.4220,  1.2288,  0.5815,  ...,  1.4883,  1.5858,  1.0978],
        [ 1.9661,  1.4178,  0.4871,  ...,  0.8738,  0.7888,  1.1253],
        [-0.5772,  0.2667,  3.0283,  ..., -0.9325,  0.6023,  1.7131],
        ...,
        [ 1.0387,  1.0115,  1.3287,  ..., -0.0240, -0.4031,  2.5860],
        [ 1.2045,  0.8297,  0.9233,  ...,  1.2070,  1.7543,  0.7149],
        [ 0.4663,  0.3207,  1.4207,  ...,  1.3703,  1.4531,  1.7608]])
item_factors.weight tensor([[ 0.4195,  0.1586,  0.7142,  ...,  0.6684,  0.4013,  0.1998],
        [ 0.6031,  0.4752,  0.5066,  ...,  0.4202,  0.7134, -0.0458],
        [ 0.2002,  0.5248,  0.5411,  ...,  0.4468,  0.5227,  0.8026],
        ...,
        [ 0.3484,  0.2932,  0.3300,  ...,  0.3342,  0.3295,  0.3483],
        [ 0.3907,  0.4831,  0.4033,  ...,  0.4410,  0.4382,  0.3949],
        [ 0.3654,  0.3850,  0.4069,  ...,  0.4131,  0.3748,  0.3985]])


In [16]:
# Pull the trained movie embeddings out of the model as a NumPy array:
#   model.item_factors.weight -> weight matrix of the item embedding layer
#   .data                     -> the underlying tensor, outside autograd tracking
#   .cpu()                    -> make sure the tensor lives in CPU memory
#   .numpy()                  -> view it as a NumPy array
item_weight_tensor = model.item_factors.weight.data
trained_movie_embeddings = item_weight_tensor.cpu().numpy()

In [17]:
trained_movie_embeddings  # bare last expression: the notebook renders the array as this cell's output

array([[ 0.4195042 ,  0.15856628,  0.714191  , ...,  0.66836995,
         0.4013267 ,  0.19979613],
       [ 0.60311675,  0.47524107,  0.5065975 , ...,  0.4201884 ,
         0.713357  , -0.04575742],
       [ 0.20023791,  0.5248257 ,  0.54112893, ...,  0.44683835,
         0.5226808 ,  0.8025828 ],
       ...,
       [ 0.3483773 ,  0.29323405,  0.329968  , ...,  0.33417743,
         0.32953516,  0.34830368],
       [ 0.39071354,  0.4831054 ,  0.40333667, ...,  0.44097462,
         0.4382096 ,  0.39491692],
       [ 0.36535263,  0.38500917,  0.40688872, ...,  0.4131225 ,
         0.37478933,  0.39847574]], shape=(9724, 8), dtype=float32)

In [18]:
from sklearn.cluster import KMeans
# Cluster the learned movie embeddings with K-Means (unsupervised learning).
#   n_clusters=10  -> group the movies into 10 clusters
#   random_state=0 -> fixed seed so the clustering is reproducible
# Each movie is one row of trained_movie_embeddings, i.e. a vector in the latent
# space (8-dimensional here, since the model was built with n_factors = 8).
# K-Means finds 10 centroids and assigns every movie to the nearest one.
kmeans = KMeans(n_clusters=10, random_state = 0)
kmeans.fit(trained_movie_embeddings)  # fit() returns the estimator itself

print(type(movie_names))

<class 'dict'>


In [None]:
# For every cluster, list its 10 most-rated movies.
# Author's observation: movies in the same cluster tend to share genres, even
# though the model never saw titles or genres - only how users rated movies.

# Ratings per movieId, computed once. This replaces a full scan of ratings_df
# for every movie and the positional `.count()[0]` lookup, which pandas
# deprecates on a label-indexed Series.
rating_counts = ratings_df['movieId'].value_counts()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))

    # kmeans.labels_ holds one cluster id per embedding row; np.where(...)[0]
    # gives the row indices that belong to the current cluster.
    member_indices = np.where(kmeans.labels_ == cluster)[0]

    movs = []
    for movidx in member_indices:
        # Embedding row index -> original movieId used in the dataset.
        movid = train_set.idx2movieid[movidx]

        # Number of ratings the movie received (a simple popularity measure).
        rat_count = rating_counts.loc[movid]

        # Collect (movie title, rating count) for every movie in the cluster.
        movs.append((movie_names[movid], rat_count))

    # Sort by rating count (second tuple element), most-rated first, keep the top 10.
    for mov in sorted(movs, key = lambda tup : tup[1], reverse = True)[:10]:
        print("\t", mov[0])
