<a href="https://colab.research.google.com/github/PrathamHandique/Recommendation-system-using-matrix-factorization/blob/main/Recommendation_system_(collaborative_system%2Bmatrix_factorization).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  4361k      0 --:--:-- --:--:-- --:--:-- 4381k


In [4]:
import zipfile
# Unpack the downloaded archive into ./data; the context manager closes the
# zip handle once extraction has finished.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall(path='data')

In [5]:
import pandas as pd
# Read the two tables used in this notebook from the extracted archive:
# movie metadata first, then the user ratings.
movies_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ml-latest-small/movies.csv',
        'data/ml-latest-small/ratings.csv',
    )
)

In [6]:
# Report the (rows, columns) shape of both tables; the leading "\n" in the
# third argument moves the ratings shape onto its own line.
print(
    'The dimensions movies :', movies_df.shape,
    '\nThe dimensions ratings:', ratings_df.shape,
)


The dimensions movies : (9742, 3) 
The dimensions ratings: (100836, 4)


In [11]:
# Preview the first five rows of the movie table (movieId, title, genres).
movies_df.head(n=5)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
# Preview the first five rows of the ratings table
# (userId, movieId, rating, timestamp).
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
# movieId -> title lookup, used later to label movies by name.
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Number of distinct users / movies that appear in the ratings table.
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_users*n_items, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
# Density = observed ratings / cells of the dense users x movies matrix.
print("Therefore: ", len(ratings_df) / (n_users*n_items) * 100, '% of the matrix is filled.')

# NOTE(review): the second remark below says the element count grows "by n*2",
# but the matrix size computed above is n_users * n_items; the wording is kept
# unchanged here so the printed output stays the same.
for remark in (
    "We have an incredibly sparse matrix to work with here.",
    "And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2",
    "You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.",
    "One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data",
):
    print(remark)

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [13]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Dot-product matrix factorization model.

    Every user and every item gets an ``n_factors``-dimensional embedding;
    the predicted rating of a (user, item) pair is the dot product of the
    two embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: size of each latent vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embedding tables act as lookup tables: row i is the latent vector
        # of user (or item) index i.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Start from small non-negative weights drawn uniformly between 0 and 0.05.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        return self.predict(users, items)

    def predict(self, user, item):
        """Score user/item index tensors given as two separate arguments.

        The previous implementation forwarded both arguments to
        ``forward``, which only accepts a single ``data`` tensor, so every
        call raised ``TypeError``.

        Args:
            user: 1-D integer tensor of user indices.
            item: 1-D integer tensor of item indices, same length as ``user``.

        Returns:
            1-D tensor of dot products, one per (user, item) pair.
        """
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [14]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

# Note: This isn't 'good' practice, in a MLops sense but we'll roll with this since the data is already loaded in memory.
class Loader(Dataset):
    """Torch ``Dataset`` of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-mapped to contiguous indices
    starting at 0 (in order of first appearance) so they can be used
    directly as embedding row indices.

    Args:
        ratings: DataFrame with at least ``userId``, ``movieId`` and
            ``rating`` columns. When omitted, the notebook-level
            ``ratings_df`` is used, which matches the previous behaviour.

    Attributes set on the instance:
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        ratings: copy of the input frame with the id columns re-mapped.
        x: integer tensor with one ``[user index, movie index]`` row per rating.
        y: tensor of ratings aligned with ``x``.
    """

    def __init__(self, ratings=None):
        if ratings is None:
            # Backward-compatible default: fall back to the frame loaded by the notebook.
            ratings = ratings_df

        # Distinct raw ids, in order of first appearance.
        users = ratings['userId'].unique()
        movies = ratings['movieId'].unique()

        # raw id -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # contiguous index -> raw id
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Work on a copy so the caller's frame keeps its raw ids.
        self.ratings = ratings.copy()
        self.ratings['movieId'] = ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = ratings['userId'].map(self.userid2idx)

        # Select the two index columns explicitly so the (user, movie) column
        # order does not depend on which other columns the frame carries.
        self.x = torch.tensor(self.ratings[['userId', 'movieId']].values, dtype=torch.long)
        self.y = torch.tensor(self.ratings['rating'].values)

    def __getitem__(self, index):
        """Return the ``(x, y)`` pair stored at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [15]:
# Number of full passes over the ratings.
num_epochs = 128

# Train on the GPU whenever one is visible to torch.
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# Build the model and show its freshly initialised (trainable) weights.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU before the optimizer is created from its parameters.
model = model.cuda() if cuda else model

# Mean squared error between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()

# Adam optimizer over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 ratings.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0034, 0.0182, 0.0234,  ..., 0.0084, 0.0013, 0.0141],
        [0.0154, 0.0396, 0.0436,  ..., 0.0316, 0.0128, 0.0124],
        [0.0232, 0.0077, 0.0252,  ..., 0.0346, 0.0371, 0.0323],
        ...,
        [0.0217, 0.0099, 0.0193,  ..., 0.0245, 0.0254, 0.0280],
        [0.0382, 0.0104, 0.0401,  ..., 0.0014, 0.0192, 0.0441],
        [0.0331, 0.0088, 0.0262,  ..., 0.0044, 0.0393, 0.0297]])
item_factors.weight tensor([[0.0060, 0.0090, 0.0118,  ..., 0.0157, 0.0104, 0.0491],
        [0.0257, 0.0261, 0.0161,  ..., 0.0309, 0.0405, 0.0487],
        [0.0184, 0.0093, 0.0141,  ..., 0.0203, 0.0458, 0.0224],
        ...,
        [0.0173, 0.0477, 0.0050,  ..., 0.0208, 0.0167, 0.0325],
        [0.0491, 0.0057, 0.0428,  ..., 0.0500, 0.0185, 0.0318],
        [0.0264, 0.0086, 0.0190,  ..., 0.0285, 0.0166, 0.0496]])


In [16]:
# Train for num_epochs passes over the data and print the mean batch loss
# after each pass.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step
        # below used to be nested under this `if`, so on a CPU-only machine
        # nothing was trained and the average below divided by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Cast the ratings to float32 before computing the loss.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.07187554678941
iter #1 Loss: 4.750989670983426
iter #2 Loss: 2.4781203247266372
iter #3 Loss: 1.7230147273104808
iter #4 Loss: 1.3473286941118048
iter #5 Loss: 1.1296189569579769
iter #6 Loss: 0.9921966998407683
iter #7 Loss: 0.9006981399609958
iter #8 Loss: 0.8375530336714033
iter #9 Loss: 0.7924036132124475
iter #10 Loss: 0.7595515797661646
iter #11 Loss: 0.7350000858609447
iter #12 Loss: 0.716069327014957
iter #13 Loss: 0.7019001473176297
iter #14 Loss: 0.6905783380424311
iter #15 Loss: 0.6818572474539583
iter #16 Loss: 0.6748022802438832
iter #17 Loss: 0.6696309211290428
iter #18 Loss: 0.6658900937982623
iter #19 Loss: 0.6629085077852162
iter #20 Loss: 0.6604578321809091
iter #21 Loss: 0.6593057487141057
iter #22 Loss: 0.6575911382430701
iter #23 Loss: 0.6569053874100526
iter #24 Loss: 0.6560583665286224
iter #25 Loss: 0.6554346549026857
iter #26 Loss: 0.6548448582061657
iter #27 Loss: 0.6537202365582969
iter #28 Loss: 0.6530918025879691
iter #29 Loss: 0.6518003429

In [17]:
# Print every trainable parameter and keep references to the weights:
# the first trainable parameter goes to `uw`, every later one overwrites `iw`.
# `c` records whether the first parameter has already been seen.
uw, iw, c = 0, 0, 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.3146,  1.1128,  1.0153,  ...,  0.9207,  1.3396,  0.6137],
        [ 1.0044,  0.8383,  0.8043,  ...,  1.3103,  1.4863,  1.2504],
        [-0.7817, -2.5211,  1.6048,  ...,  0.1280,  1.1720,  2.5751],
        ...,
        [ 1.0506,  1.6381, -1.0315,  ...,  1.3541,  1.8294,  1.9017],
        [ 1.4592,  0.6779,  0.7012,  ...,  1.0581,  0.9570,  1.5724],
        [ 0.7301,  0.2932,  1.3564,  ...,  1.6173,  0.7928,  1.7222]],
       device='cuda:0')
item_factors.weight tensor([[ 4.7097e-01,  3.1186e-01,  6.5932e-01,  ...,  3.1975e-01,
         -7.7565e-04,  5.3098e-01],
        [ 4.9905e-01,  1.8099e-01,  8.1378e-01,  ...,  4.2103e-02,
          4.6273e-01,  9.6118e-02],
        [ 1.5396e-01,  5.1049e-01,  3.8119e-01,  ...,  7.6577e-01,
          4.0427e-01,  6.0775e-01],
        ...,
        [ 3.3860e-01,  3.9983e-01,  3.2632e-01,  ...,  3.4324e-01,
          3.4228e-01,  3.5437e-01],
        [ 4.1469e-01,  3.3792e-01,  4.0670e-01,  ...,  4.1427e-01,
          

In [18]:
# Copy the learned item embedding table to host memory as a NumPy array
# (one row per movie index) for use with scikit-learn.
trained_movie_embeddings = (
    model.item_factors.weight.data
    .cpu()
    .numpy()
)

In [19]:
# Number of embedding rows, i.e. one per movie index.
trained_movie_embeddings.shape[0]

9724

In [21]:
from sklearn.cluster import KMeans

# Group the movie embeddings into 10 clusters.
# n_init is pinned to 10 (the long-standing default) because newer
# scikit-learn releases changed the default to 'auto'; leaving it unset
# triggers a FutureWarning or changes the clustering depending on the version.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0).fit(trained_movie_embeddings)





In [22]:
# For every cluster, list its ten most-rated movies.
# Ratings per movie are counted once up front; the previous version filtered
# the whole ratings frame for every movie and read the count with positional
# Series indexing (`.count()[0]`), which pandas has deprecated.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# Take the cluster count from the fitted estimator instead of repeating the literal.
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        # Embedding row index -> original MovieLens movieId.
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Most-rated first; sorted() is stable, so ties keep embedding-index order.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
        print("\t", mov[0])

Cluster #0
	 Jurassic Park (1993)
	 Independence Day (a.k.a. ID4) (1996)
	 True Lies (1994)
	 Shrek (2001)
	 Beauty and the Beast (1991)
	 Stargate (1994)
	 Star Wars: Episode I - The Phantom Menace (1999)
	 X-Men (2000)
	 Monsters, Inc. (2001)
	 Twister (1996)
Cluster #1
	 Batman (1989)
	 Aladdin (1992)
	 Sixth Sense, The (1999)
	 Men in Black (a.k.a. MIB) (1997)
	 Die Hard (1988)
	 Good Will Hunting (1997)
	 Truman Show, The (1998)
	 Austin Powers: The Spy Who Shagged Me (1999)
	 Ocean's Eleven (2001)
	 Breakfast Club, The (1985)
Cluster #2
	 Batman & Robin (1997)
	 Super Mario Bros. (1993)
	 Joe Dirt (2001)
	 Toys (1992)
	 Speed 2: Cruise Control (1997)
	 Battlefield Earth (2000)
	 Mighty Morphin Power Rangers: The Movie (1995)
	 Superman IV: The Quest for Peace (1987)
	 Nutty Professor II: The Klumps (2000)
	 Twins (1988)
Cluster #3
	 Dances with Wolves (1990)
	 Clueless (1995)
	 Four Weddings and a Funeral (1994)
	 Spirited Away (Sen to Chihiro no kamikakushi) (2001)
	 Mars Attack