In [46]:
# Data Citation
# F. Maxwell Harper and Joseph A. Konstn. 2015. The MovieLens Datasets: History and Context. 
! curl https://grouplens.org/datasets/movielens/lateste

In [47]:
import os
print(os.getcwd())


/Users/abosede/PROJECTS/DATA SCIENCE/RECOMMENDER SYSTEMS/Customer-focused-movie-recommender-systems


In [48]:
# This is to unzip our file
import zipfile
with zipfile.ZipFile('/Users/abosede/PROJECTS/DATA SCIENCE/RECOMMENDER SYSTEMS/Customer-focused-movie-recommender-systems/ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

In [49]:
# import the dataset
import pandas as pd
movies_df = pd.read_csv('/Users/abosede/PROJECTS/DATA SCIENCE/RECOMMENDER SYSTEMS/Customer-focused-movie-recommender-systems/data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('/Users/abosede/PROJECTS/DATA SCIENCE/RECOMMENDER SYSTEMS/Customer-focused-movie-recommender-systems/data/ml-latest-small/ratings.csv')

In [50]:
print('The dimensions of the movies dataframe are:', movies_df.shape, '\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of the movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [51]:
# A quick look at the movies data frame (movies_df)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [52]:
# Quick look at the ratings data frame
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [86]:
# This is to create a Movie ID to movie name mapping
movie_names = movies_df.set_index('movieId')['title'].to_dict()
# From the data frame this brings out "movieId" as the index column. Then the title is what to be returned
# The generated series is then converted to a dictionary instead, that is the movie id and title become key - value mappings

n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())

n_users_1 = len(ratings_df["userId"].unique())
print("Number of unique users:", n_users)
print("Number of unique movies", n_items)
print("Number of unique users (2):", n_users_1)

print("The full rating matrix will have:", n_users*n_items, 'elements.')

print('----------')
print('Number of ratings:', len(ratings_df))
print("Therefore: ", len(ratings_df)/ (n_users * n_items) * 100, '% of the matrix is filled.')

print('We have an incredibly sparse matrix to work with here')
print('We can imagine as the number of users and products grow, the number of elements will increase by n*2')
print('We are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.')
print('One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don\'t need all the data')


Number of unique users: 610
Number of unique movies 9724
Number of unique users (2): 610
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here
We can imagine as the number of users and products grow, the number of elements will increase by n*2
We are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [54]:
# This is to install the needed libraries -> We would only be using pytorch here
%pip install torch torchvision torchaudio
# torch -> the core Pytorch library
# torchvision -> for image-related tasks (like datasets and transforms)
# torchaudio -> for audio processing (if needed!)


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [55]:
%pip install tqdm # This is for progress bars in

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [73]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm  #as tqdm
# tqdm is a Python library that adds progress bars to loops - super userful for tracking long-running tasks
# tqdm_notebook : This is a variant that displays the progess bar nicely inside a Jupyter Notebook interface

class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors = 20):
        super().__init__() # This is to set up our parent class -> "torch.nn.Module"
        
        # For the embedding layer
        '''
            This creates a lookup table that maps each user ID to a dense vector of size "n_factors"ArithmeticError
            n_users: This is is the number of unique users - each one gets its own vector
            n_factors: The size of each embedding vector - this defines the dimensionality of the latent space

            So, if we have 1000 users and "n_factors = 20", this layer will learn a "1000 x 20" matrix during training

            # Similar users will have similar embeddings and then similar items will have similar embeddings
        '''
        # Create the user embedding layer -> This would later be called on the actual data
        self.user_factors = torch.nn.Embedding(n_users, n_factors) # Think of this as a look up table for the input

        # Create the item embedding layer
        self.item_factors = torch.nn.Embedding(n_items, n_factors) # Think of this as a lookup table for the input

        # Further explanation
        '''
        .weight -> This accesses the actual weight matrix - the lookup table of user embeddings
        .data -> Refers to the raw tensor data (bypassing autograd tracking)
        .uniform_(0,0.05) -> Fills the tensor with random values uniformly distributed between 0 and 0.05
        '''
        # Why???
        '''
         Initially, PyTorch initializes weights randomly, but you can override that to control the range
         Initializing with small values like this can help:
            Prevent exploding gradients early in training
            Ensure embeddings start close to zero, which can stabilize learning
            Avoid biasing the model toward any particular direction before training begins
        '''
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0,0.05)

    def forward(self, data):
        # matrix multiplication
        users, items = data[:,0], data[:,1]

        # Note
        '''
            data[:,0] -> This grabs all rows in the first column
        '''
        # Multiply the matrices
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

        '''
            self.user_factors(users) -> This retrieves the embedding vector for each userID in the users tensor
            * -> This multiplies each user vector with its corresponding item vector — element by element. Still shape: [batch_size, n_factors]
            .sum(1) -> This sums across the embedding dimensions (n_factors) for each user-item pair, producing a single scalar score per pair. Final shape: [batch_size]
        '''
    
    def predict(self, user, item):
        return self.forward(user, item)
    
    

In [74]:
# Crearing the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # This is the package that helps transform the data to machine learning readiness

# Note: This is not good practice, in a MLOps sense but we'll roll with this since the data is already loaded in memory
class Loader(Dataset):
    def __init__(self):
        self.ratings = ratings_df.copy()

        # Extract all user IDs and movie IDs
        users = ratings_df.userId.unique()
        movies = ratings_df.movieId.unique()

        # Producing new continuous IDs for users and movies --

        # This is like forward mapping (That is, creating a mapping for the movieID and the actual ID)
        # Unique values : index
        self.userid2idx  = {o:i for i, o in enumerate(users)}
        self.movieid2idx = {o:i for i,o in enumerate(movies)}

        # Obtained continuous ID for users and movies
        # Then to map backwards -> That is, 
        self.idx2userid = {i:o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i:o for o,i in self.movieid2idx.items()}

        # return the id from the indexed values as nootes in the lambda function down below
        self.ratings.movieId = ratings_df.movieId.apply(lambda x : self.movieid2idx[x])
        self.ratings.userId = ratings_df.userId.apply(lambda x: self.userid2idx[x])

        # Then to classify the predictor and target!
        self.x = self.ratings.drop(['rating', 'timestamp'], axis =1).values
        self.y = self.ratings['rating'].values

        # Then to convert the columns into model ready columns
        self.x, self.y = torch.tensor(self.x), torch.tensor(self.y) # Transforms the data to tensors (ready for torch models.)


    '''
        These are helper functions
    '''

    # This is to get the item (From the tensor) -> Given an index
    # How this function can be used
    # Say we instantiate like : loader = Loader()
    # Then we can have: item = loader[index]
    def __getitem__(self, index):
        return (self.x[index], self.y[index]) 
    
    # How can this be called?? 
    # Say we have the same above, then we can call it like
    # len(loader) -> This will directly trigger this function
    def __len__(self):
        return len(self.ratings)




In [75]:
num_epochs = 128 # This sets the number of training epochs — how many times your model will see the entire dataset during training.
cuda = torch.cuda.is_available() # This checks if a CUDA-compatible GPU is available on your system.
# CUDA is NVIDIA’s parallel computing platform that allows PyTorch to run operations on the GPU, which is much faster than CPU for deep learning tasks.

print("Is running on GPU:", cuda)


# Then to instantiate our model
model = MatrixFactorization(n_users, n_items, n_factors = 8)

print(model) # this prints out the super class's __str__ function

'''
".named_parameters()" -> comes from the "nn.Module" class - from which "MatrixFactorization" is inheriting from
.named_parameters() -> This returns an iterator over (name, parameter) pairs for all parameters in the model that have "requires_grad=True" 
These parameters are typically the weights and biases of the layers we have defined
In this case
The model has:
user_factors: nn.Embedding(610,8) layer
item_factors: nn.Embedding(9724,8) layer

Each of these layers has a ".weight" tensor - that's what we are accessing

## param.requires_grad
-- This checks whether the parameter should be updated during backpropagation. If "True", it means the parameter is learnable - gradients will be computed for it and it will be updated during training
'''
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# GPU enable if we have one! -> In this example, I do not have so I am not going to be doing that
if cuda:
    model = model.cuda()


# MSE loss
# This is the mean square error loss
# This calculates how far off your model's predictions are from the actual values
# The goal is to minimize this loss
loss_fn = torch.nn.MSELoss()

# ADAM optimizier
'''
Adam stands for Adaptive Moment Estimation — it’s a popular optimizer that combines the benefits of SGD and RMSProp.
It adjusts learning rates for each parameter individually based on estimates of first and second moments of the gradients.
model.parameters() tells it which weights to update — in your case, the embeddings for users and items.
lr=1e-3 sets the learning rate, which controls how big each update step is. A smaller value means slower but more stable learning.
'''
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)


# Train the data
train_set = Loader()
train_loader = DataLoader(train_set, 128, shuffle = True)

# What happens under the hood
'''
    Calls __len__() to know how many samples are in train_set.
    Randomly shuffles the indices (if shuffle=True).
    Uses __getitem__() to fetch 128 samples at a time (your batch size).

    1. __len__(self)
        Returns the total number of samples in your dataset.

        Used to determine how many batches to create.

    2. __getitem__(self, index)
        Returns a single data sample (and optionally its label) at the given index.

        This is what gets batched together during training.
'''

# Note :
# This DataLoader(., ., .) returns the batches of data that you can loop through in training






Is running on GPU: False
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0322, 0.0162, 0.0073,  ..., 0.0486, 0.0356, 0.0100],
        [0.0465, 0.0127, 0.0328,  ..., 0.0438, 0.0047, 0.0392],
        [0.0099, 0.0336, 0.0370,  ..., 0.0253, 0.0050, 0.0235],
        ...,
        [0.0309, 0.0057, 0.0274,  ..., 0.0278, 0.0120, 0.0396],
        [0.0009, 0.0242, 0.0478,  ..., 0.0146, 0.0088, 0.0189],
        [0.0067, 0.0133, 0.0033,  ..., 0.0302, 0.0424, 0.0358]])
item_factors.weight tensor([[0.0169, 0.0224, 0.0356,  ..., 0.0080, 0.0438, 0.0096],
        [0.0055, 0.0264, 0.0136,  ..., 0.0195, 0.0009, 0.0021],
        [0.0202, 0.0018, 0.0445,  ..., 0.0069, 0.0260, 0.0261],
        ...,
        [0.0190, 0.0292, 0.0172,  ..., 0.0374, 0.0237, 0.0030],
        [0.0040, 0.0097, 0.0204,  ..., 0.0306, 0.0244, 0.0250],
        [0.0296, 0.0076, 0.0053,  ..., 0.0074, 0.0270, 0.0144]])


'\n    Calls __len__() to know how many samples are in train_set.\n    Randomly shuffles the indices (if shuffle=True).\n    Uses __getitem__() to fetch 128 samples at a time (your batch size).\n\n    1. __len__(self)\n        Returns the total number of samples in your dataset.\n\n        Used to determine how many batches to create.\n\n    2. __getitem__(self, index)\n        Returns a single data sample (and optionally its label) at the given index.\n\n        This is what gets batched together during training.\n'

In [None]:
%pip install ipywidgets




# Since I don't have a GPU now, I will be using my device instead!
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Then to actually train the model
for it in tqdm(range(num_epochs)): # This tqdm here is to show a form of percentage of completion
    losses = []
    for x, y in train_loader:
        if not cuda: # Since we do not have the GPU available

            #x, y = x.cuda(), y.cuda()
            # Since we do not have a GPU
            x, y = x.to(device), y.to(device)

            # This clears the old gradients from the previous step!
            # Without this, gradients would accumulate across batches, which would mess up training
            optimizer.zero_grad() 


            ''' 
                We're implicitly calling the forward() method of your MatrixFactorization class.
                This is a PyTorch convention — when we call a model like a function, PyTorch automatically routes that call to the model’s forward() method.
            '''
            ''' 
                The __call__() method of nn.Module is overridden to internally call forward() — so when you do model(x), it’s really doing:
                def __call__(self, *input, **kwargs):
                    return self.forward(*input, **kwargs)
            '''
            outputs = model(x)
           
            ''' 
            Compares the model’s predictions (outputs) to the true labels (y) using Mean Squared Error.
            .squeeze() -> removes extra dimensions from outputs if needed.
            y.type(torch.float32) ->  ensures the target is in the correct format for MSELoss.
            '''
            loss = loss_fn(outputs.squeeze(), y.type(torch.float32))

            # Adds the scalar loss value to a list so you can track average loss for the epoch.
            losses.append(loss.item())

            # Backward pass
            # Computes gradients of the loss with respect to model parameters.
            # This is the core of backpropagation.
            '''  
                PyTorch traces the computation graph from the loss all the way back to the model’s parameters.
                It calculates the gradients — how much each parameter contributed to the error.
                These gradients are stored in each parameter’s .grad attribute.
                Without loss.backward(), your model has no idea how to improve
            '''
            loss.backward()


            ''' 
                Uses the gradients to update the model’s parameters.
                In your case, it adjusts the user and item embeddings to better predict ratings.
                This uses the gradients to update the weights.
                So the next time you run a forward pass, the model should (hopefully) make better predictions.
            '''
            optimizer.step()
    ''' 
    After all batches in the epoch are processed, it prints the average loss.
    This gives you a sense of how well the model is learning over time.
    '''      
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses)) # "it" here is the current epoch number
    

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.




iter #0 Loss: 11.062974008811912




iter #1 Loss: 4.74984562003673




iter #2 Loss: 2.4797207437796036




iter #3 Loss: 1.7224311370232384




iter #4 Loss: 1.3459932402457078




iter #5 Loss: 1.12832407880253




iter #6 Loss: 0.9913600663545773




iter #7 Loss: 0.9000576908515795




iter #8 Loss: 0.8370468522691484




iter #9 Loss: 0.7922164963057199




iter #10 Loss: 0.7592216306153288




iter #11 Loss: 0.7345603889771525




iter #12 Loss: 0.7158364505332133




iter #13 Loss: 0.7014839434759871




iter #14 Loss: 0.6903853893733872




iter #15 Loss: 0.6816589063042917




iter #16 Loss: 0.6749234650718984




iter #17 Loss: 0.6696698912963044




iter #18 Loss: 0.6657284767872791




iter #19 Loss: 0.662880948238869




iter #20 Loss: 0.6606109951655876




iter #21 Loss: 0.6589168006847352




iter #22 Loss: 0.6576331915135311




iter #23 Loss: 0.6566392505910191




iter #24 Loss: 0.6560486136429806




iter #25 Loss: 0.6552327153116918




iter #26 Loss: 0.6545690261016642




iter #27 Loss: 0.6535408505177135




iter #28 Loss: 0.6526688669009257




iter #29 Loss: 0.6513165795258459




iter #30 Loss: 0.6497822041968404




iter #31 Loss: 0.6479954571288249




iter #32 Loss: 0.6459598501926751




iter #33 Loss: 0.6433790534129603




iter #34 Loss: 0.6401054015014377




iter #35 Loss: 0.6363213225669667




iter #36 Loss: 0.6312697152347129




iter #37 Loss: 0.6257943874839599




iter #38 Loss: 0.6198350191872737




iter #39 Loss: 0.612304222614027




iter #40 Loss: 0.6045426296325505




iter #41 Loss: 0.5963591958968167




iter #42 Loss: 0.587930541315357




iter #43 Loss: 0.5792311881172475




iter #44 Loss: 0.5702402783075565




iter #45 Loss: 0.5613126850445863




iter #46 Loss: 0.5524204392269784




iter #47 Loss: 0.5437099814868821




iter #48 Loss: 0.5345601340765276




iter #49 Loss: 0.5261889609134742




iter #50 Loss: 0.5171160943768351




iter #51 Loss: 0.5089228232651193




iter #52 Loss: 0.5006013861691891




iter #53 Loss: 0.492875785959251




iter #54 Loss: 0.4855579925218815




iter #55 Loss: 0.4781785572543362




iter #56 Loss: 0.47111258799503297




iter #57 Loss: 0.4647420876408894




iter #58 Loss: 0.4583090230028339




iter #59 Loss: 0.452386036893438




iter #60 Loss: 0.4470422065764817




iter #61 Loss: 0.441737698041243




iter #62 Loss: 0.436886909309075




iter #63 Loss: 0.43188600075729




iter #64 Loss: 0.4275613516796059




iter #65 Loss: 0.42311784326273777




iter #66 Loss: 0.4190090391312154




iter #67 Loss: 0.41538831451671376




iter #68 Loss: 0.41154869316798176




iter #69 Loss: 0.40831811518082156




iter #70 Loss: 0.40485122088853476




iter #71 Loss: 0.40176459806522136




iter #72 Loss: 0.3986053315705152




iter #73 Loss: 0.395971967824522




iter #74 Loss: 0.3930539205897278




iter #75 Loss: 0.3904552736749806




iter #76 Loss: 0.3878802283652845




iter #77 Loss: 0.3856742569637783




iter #78 Loss: 0.38334255981339416




iter #79 Loss: 0.3809933732949235




iter #80 Loss: 0.3790951918262213




iter #81 Loss: 0.3768303426464802




iter #82 Loss: 0.3747891321551376




iter #83 Loss: 0.37313972457033123




iter #84 Loss: 0.3711614425903952




iter #85 Loss: 0.369625499103275




iter #86 Loss: 0.36786223218071884




iter #87 Loss: 0.3662983010036086




iter #88 Loss: 0.3647189442125069




iter #89 Loss: 0.36325103647742174




iter #90 Loss: 0.3617616201263999




iter #91 Loss: 0.3604673289216412




iter #92 Loss: 0.35920971573382465




iter #93 Loss: 0.35783726947032257




iter #94 Loss: 0.35664383877957534




iter #95 Loss: 0.355363259593091




iter #96 Loss: 0.3543192558368753




iter #97 Loss: 0.3531585120345433




iter #98 Loss: 0.35222962075078545




iter #99 Loss: 0.3509384055579374




iter #100 Loss: 0.3499969605942668




iter #101 Loss: 0.34905678947186713




iter #102 Loss: 0.34804996121958426




iter #103 Loss: 0.34731411969828124




iter #104 Loss: 0.34633020406144527




iter #105 Loss: 0.34550923090144464




iter #106 Loss: 0.34466463560002103




iter #107 Loss: 0.343695599661412




iter #108 Loss: 0.3429658977967228




iter #109 Loss: 0.3422993554870792




iter #110 Loss: 0.3417597337233536




iter #111 Loss: 0.34092424522499143




iter #112 Loss: 0.3401420983074583




iter #113 Loss: 0.3393864881266192




iter #114 Loss: 0.3387164035386543




iter #115 Loss: 0.3380129145751447




iter #116 Loss: 0.33750849875248023




iter #117 Loss: 0.33687965842294815




iter #118 Loss: 0.33634744603470496




iter #119 Loss: 0.33562071521194453




iter #120 Loss: 0.3349325218176479




iter #121 Loss: 0.3345030472628054




iter #122 Loss: 0.33415289183482905




iter #123 Loss: 0.33340966542662703




iter #124 Loss: 0.33287916358957437




iter #125 Loss: 0.3325187746756876




iter #126 Loss: 0.3318447589231324


100%|██████████| 128/128 [02:44<00:00,  1.29s/it]

iter #127 Loss: 0.3314169015035714





In [77]:
# By training the model, we will have tuned latent factors for movies and users
c = 0
uw = 0 # This is the user's weight 
iw = 0 # This is the item's weight

''' 
On the first iteration (c == 0), it stores the first parameter’s data in uw (likely the user embeddings).

On the second iteration, it stores the next parameter’s data in iw (likely the item embeddings).

It prints the name and raw tensor values of each parameter.
'''
for name, param in model.named_parameters(): # Same thing we did before
    if param.requires_grad: # This is to check if it requires a gradient
        print(name, param.data)
        if c == 0:
            uw = param.data
            c += 1
        else:
            iw = param.data
            


user_factors.weight tensor([[ 1.9317,  1.0556,  1.6530,  ...,  1.2161,  1.1279,  1.5724],
        [ 0.0724,  1.0322,  0.5583,  ...,  1.2977,  1.3532,  1.7125],
        [-0.1143,  1.1125,  0.4491,  ..., -1.4479, -1.5917,  0.0497],
        ...,
        [ 1.1743, -0.1102,  1.2213,  ...,  1.1230, -0.6135,  1.4597],
        [ 1.1222,  0.1168,  1.9397,  ...,  0.5850,  1.1815,  0.7851],
        [ 0.7736,  1.1622,  0.9645,  ...,  1.0936,  1.1879,  0.7607]])
item_factors.weight tensor([[ 0.1303,  0.7080,  0.4207,  ...,  0.5602,  0.9038,  0.4216],
        [ 0.5000,  0.7591,  0.4866,  ..., -0.0402,  0.6090,  0.3250],
        [ 0.4901,  0.3616,  0.4700,  ...,  0.1728,  0.5514,  0.7811],
        ...,
        [ 0.3503,  0.3607,  0.3510,  ...,  0.3706,  0.3557,  0.3315],
        [ 0.3885,  0.3934,  0.4060,  ...,  0.4148,  0.4091,  0.4083],
        [ 0.4075,  0.3850,  0.3836,  ...,  0.3861,  0.4039,  0.3897]])


In [78]:
# Then to check the trained movie embeddings
''' 
    model.item_factors.weight : Accesses the raw weight matrix of the item embedding layer — shape: [n_items, n_factors]
    .data : Gets the actual tensor data (bypassing autograd tracking)
    .cpu() : Moves the tensor from GPU to CPU (important if you're using CUDA)
    .numpy() : Converts the PyTorch tensor into a NumPy array
'''
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [79]:
trained_movie_embeddings # Display the embedding

array([[ 0.13031706,  0.7079638 ,  0.42074126, ...,  0.5602069 ,
         0.90380317,  0.42163917],
       [ 0.5000378 ,  0.7591344 ,  0.4865554 , ..., -0.04021692,
         0.6090155 ,  0.3250367 ],
       [ 0.4900761 ,  0.36158958,  0.46995148, ...,  0.17282481,
         0.5513736 ,  0.7810933 ],
       ...,
       [ 0.3503491 ,  0.36074647,  0.3510321 , ...,  0.37063277,
         0.35573378,  0.33152902],
       [ 0.38846377,  0.3933649 ,  0.4060266 , ...,  0.41478223,
         0.40909234,  0.4083269 ],
       [ 0.4075093 ,  0.38496774,  0.38359863, ...,  0.38611653,
         0.40391278,  0.38973466]], shape=(9724, 8), dtype=float32)

In [85]:
from sklearn.cluster import KMeans
# Fit the clusters based on the movie weights
kmeans = KMeans(n_clusters=10, random_state = 0).fit(trained_movie_embeddings)
# Here, we are clustering the learned movie embeddings usu=ing the K-Means algorithm - This is a classic unsupervised learning technique
'''
KMeans(...): Initializes the K-Means clustering model.

n_clusters=10: We’re asking it to group the movies into 10 distinct clusters based on their learned embeddings.

random_state=0: Sets a seed for reproducibility — ensures you get the same clusters every time you run it.

.fit(trained_movie_embeddings): Applies K-Means to your movie vectors (from the embedding layer).

Each movie is represented as a vector in a latent space (e.g., 20-dimensional if n_factors=20).

K-Means finds 10 centroids and assigns each movie to the nearest one.
'''

print(type(movie_names))

<class 'method'>


In [87]:
''' 
    It can be seen here that the movies that are in the same cluster tend to have similar genres.
    Also note that the algorithm is unfamiliar with the movie name
    and only obtained the relationships by looking at the numbers representing how users have responded to the movies selection

'''
for cluster in range(10):
    print("Cluster #{}".format(cluster))
    movs = []

    '''
    kmeans.labels contains the cluster assignment for each movie. -> it is a numpy array itself
    np.where(...)[0] finds the indices of movies that belong to the current cluster.
    movidx is the index of a movie in your embedding array.   
    '''
    for movidx in np.where(kmeans.labels_ == cluster)[0]: # Check for the movies in the same cluster!
        ''' 
            Here,
            We are converting the internal index (movidx) back to the actual movieId used in your dataset.
            idx2movieid is a mapping you created earlier to reverse the embedding index. 
        '''
        movid = train_set.idx2movieid[movidx]

        ''' 
          We’re checking how many ratings this movie received in your dataset.
          This gives you a measure of popularity or engagement.  
        '''
        rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]

        ''' 
            We’re building a list of tuples: (movie title, rating count) for all movies in the cluster
        '''
        movs.append((movie_names[movid],rat_count))

    ''' 
    tup is just a placeholder name for each element in the list movs.
    Since each element is a tuple, tup[1] accesses the second item — the rating count.
    This tells sorted() to sort the list based on rating count.
    Note that "tup" here represents each individual tuple that we have
    when you use a lambda function inside something like sorted(), map(), or filter(), you're applying that function to each individual element of the iterable you're working with.
    '''
    for mov in sorted(movs, key = lambda tup : tup[1], reverse = True)[:10]:
        print("\t", mov[0])


Cluster #0


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 GoldenEye (1995)
	 Bourne Identity, The (2002)
	 Green Mile, The (1999)
	 Happy Gilmore (1996)
	 Casino (1995)
	 Mr. Holland's Opus (1995)
	 What's Eating Gilbert Grape (1993)
	 Pirates of the Caribbean: Dead Man's Chest (2006)
	 Sound of Music, The (1965)
	 Rocky Horror Picture Show, The (1975)
Cluster #1


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Braveheart (1995)
	 Schindler's List (1993)
	 Star Wars: Episode V - The Empire Strikes Back (1980)
	 Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
	 Lord of the Rings: The Fellowship of the Ring, The (2001)
	 Star Wars: Episode VI - Return of the Jedi (1983)
Cluster #2


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Jurassic Park (1993)
	 Independence Day (a.k.a. ID4) (1996)
	 Batman (1989)
	 Aladdin (1992)
	 True Lies (1994)
	 Lion King, The (1994)
	 Speed (1994)
	 Men in Black (a.k.a. MIB) (1997)
	 Mission: Impossible (1996)
	 Ace Ventura: Pet Detective (1994)
Cluster #3


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Dumb & Dumber (Dumb and Dumber) (1994)
	 X-Men (2000)
	 Twister (1996)
	 Net, The (1995)
	 There's Something About Mary (1998)
	 American Pie (1999)
	 Avatar (2009)
	 Star Wars: Episode II - Attack of the Clones (2002)
	 Nutty Professor, The (1996)
	 Star Wars: Episode III - Revenge of the Sith (2005)
Cluster #4


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Star Wars: Episode IV - A New Hope (1977)
	 Monty Python and the Holy Grail (1975)
	 2001: A Space Odyssey (1968)
	 Casablanca (1942)
	 Requiem for a Dream (2000)
	 Nightmare Before Christmas, The (1993)
	 Wizard of Oz, The (1939)
	 Big (1988)
	 Monty Python's Life of Brian (1979)
	 Spirited Away (Sen to Chihiro no kamikakushi) (2001)
Cluster #5


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Batman & Robin (1997)
	 Super Mario Bros. (1993)
	 Joe Dirt (2001)
	 Toys (1992)
	 Fantastic Four: Rise of the Silver Surfer (2007)
	 Speed 2: Cruise Control (1997)
	 Nutty Professor II: The Klumps (2000)
	 Nutty Professor, The (1963)
	 Bewitched (2005)
	 Highlander III: The Sorcerer (a.k.a. Highlander: The Final Dimension) (1994)
Cluster #6


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Pulp Fiction (1994)
	 Terminator 2: Judgment Day (1991)
	 Fight Club (1999)
	 Usual Suspects, The (1995)
	 American Beauty (1999)
	 Seven (a.k.a. Se7en) (1995)
	 Godfather, The (1972)
	 Fargo (1996)
	 Sixth Sense, The (1999)
	 Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Cluster #7


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Batman Forever (1995)
	 Waterworld (1995)
	 Crimson Tide (1995)
	 Outbreak (1995)
	 Matrix Reloaded, The (2003)
	 Armageddon (1998)
	 Mummy, The (1999)
	 Broken Arrow (1996)
	 Matrix Revolutions, The (2003)
	 Game, The (1997)
Cluster #8


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Apollo 13 (1995)
	 Shrek (2001)
	 Dances with Wolves (1990)
	 Beauty and the Beast (1991)
	 Star Wars: Episode I - The Phantom Menace (1999)
	 Titanic (1997)
	 Babe (1995)
	 E.T. the Extra-Terrestrial (1982)
	 Sleepless in Seattle (1993)
	 Up (2009)
Cluster #9


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou

	 Toy Story (1995)
	 Mask, The (1994)
	 Mrs. Doubtfire (1993)
	 Home Alone (1990)
	 Ghost (1990)
	 Jumanji (1995)
	 Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
	 Clueless (1995)
	 Harry Potter and the Chamber of Secrets (2002)
	 Cliffhanger (1993)


  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].count()[0]
  rat_count = ratings_df.loc[ratings_df['movieId'] == movid].cou