<a href="https://colab.research.google.com/github/Satya1804/MovieRecommendationSystem/blob/main/MovieRecommendationSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Load the two CSV tables used throughout the notebook:
#   movies_df  - movie metadata (movieId, title, genres)
#   ratings_df - one row per user rating (userId, movieId, rating, timestamp)
# NOTE(review): the '/content/...' paths look Colab-specific — adjust when running elsewhere.
import pandas as pd
movies_df = pd.read_csv('/content/movies.csv')
ratings_df = pd.read_csv('/content/ratings.csv')

In [3]:
# Report (rows, columns) for both tables; the embedded '\n' puts each on its own line.
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [4]:
# Preview the first five rows of the movie metadata table
# (left as the cell's last expression so the notebook renders it as a table).
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first five rows of the ratings table
# (left as the cell's last expression so the notebook renders it as a table).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Movie ID to movie name mapping (used later to print human-readable titles)
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Dimensions of the (conceptual) dense user x movie rating matrix.
# Only users/movies that actually appear in ratings_df are counted.
n_users = len(ratings_df.userId.unique())
n_items = len(ratings_df.movieId.unique())
n_cells = n_users * n_items

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", n_cells, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
# Share of the dense matrix that is actually observed, as a percentage.
print("Therefore: ", len(ratings_df) / n_cells * 100, '% of the matrix is filled.')
print("We have an incredibly sparse matrix to work with here.")
# The dense matrix size is the PRODUCT of the two dimensions, not "n*2".
print("And... as you can imagine, as the number of users and products grow, the number of elements will increase as n_users * n_items")
print("You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.")
print("One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data")

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.
We have an incredibly sparse matrix to work with here.
And... as you can imagine, as the number of users and products grow, the number of elements will increase by n*2
You are going to need a lot of memory to work with global scale... storing a full matrix in memory would be a challenge.
One advantage here is that matrix factorization can realize the rating matrix implicitly, thus we don't need all the data


In [7]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization rating model built from two embedding tables.

    A predicted rating is the dot product of a user's latent-factor vector
    and a movie's latent-factor vector.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each latent-factor vector.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # Embeddings act as lookup tables: row i holds the factors of user/item i.
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the default initialisation with small positive values.
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: integer tensor whose column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted rating per row of ``data``.
        """
        users, items = data[:, 0], data[:, 1]
        # Row-wise dot product of the looked-up factor vectors.
        return (self.user_factors(users) * self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as two separate arguments.

        Previously this forwarded ``(user, item)`` straight to ``forward``,
        which accepts a single tensor, so every call raised ``TypeError``.
        The indices are now packed into the ``(N, 2)`` layout ``forward`` expects.

        Args:
            user: user index or indices (int, sequence or tensor).
            item: item index or indices (int, sequence or tensor). A single
                index on either side is broadcast against the other.

        Returns:
            1-D tensor of predicted ratings.
        """
        # Put the indices on the same device as the embedding weights.
        device = self.user_factors.weight.device
        user = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        item = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        user, item = torch.broadcast_tensors(user, item)
        return self.forward(torch.stack((user, item), dim=1))

In [8]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader # package that helps transform your data to machine learning readiness

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw user and movie IDs are remapped to contiguous 0-based indices (in
    order of first appearance) so they can be used as embedding row numbers.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            which keeps the original ``Loader()`` call working.

    Attributes set on the instance:
        ratings: copy of the input frame with IDs replaced by indices.
        userid2idx / movieid2idx: raw ID -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw ID.
        x: tensor of [user index, movie index] rows.
        y: tensor of ratings aligned with ``x``.
    """

    def __init__(self, ratings=None):
        source = ratings_df if ratings is None else ratings
        # Work on a copy so the caller's frame keeps its raw IDs.
        self.ratings = source.copy()

        # Distinct raw IDs, in order of first appearance.
        users = source['userId'].unique()
        movies = source['movieId'].unique()

        # Raw ID -> contiguous index
        self.userid2idx = {raw_id: idx for idx, raw_id in enumerate(users)}
        self.movieid2idx = {raw_id: idx for idx, raw_id in enumerate(movies)}

        # Contiguous index -> raw ID (inverse lookups)
        self.idx2userid = {idx: raw_id for raw_id, idx in self.userid2idx.items()}
        self.idx2movieid = {idx: raw_id for raw_id, idx in self.movieid2idx.items()}

        # Replace the raw IDs in the working copy with their indices.
        self.ratings['movieId'] = source['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = source['userId'].map(self.userid2idx)

        # Select the feature columns explicitly so extra columns in the input
        # cannot leak into x or change the (user, movie) column order.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values
        # Convert to tensors so a DataLoader can batch them.
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    def __getitem__(self, index):
        """Return the (features, rating) pair at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [9]:
# --- Training configuration -------------------------------------------------
num_epochs = 128
cuda = torch.cuda.is_available()
print("Is running on GPU:", cuda)

# --- Model -------------------------------------------------------------------
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)
# Show the freshly initialised trainable weights (before any training).
trainable = ((name, param) for name, param in model.named_parameters() if param.requires_grad)
for name, param in trainable:
    print(name, param.data)

# Move the model to the GPU when one is available.
model = model.cuda() if cuda else model

# --- Objective and optimiser -------------------------------------------------
# Mean squared error between predicted and observed ratings.
loss_fn = torch.nn.MSELoss()
# Adam over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# --- Data --------------------------------------------------------------------
# Shuffled mini-batches of 128 ratings each.
train_set = Loader()
train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0235, 0.0452, 0.0074,  ..., 0.0362, 0.0451, 0.0053],
        [0.0076, 0.0350, 0.0089,  ..., 0.0166, 0.0347, 0.0323],
        [0.0057, 0.0189, 0.0202,  ..., 0.0214, 0.0144, 0.0150],
        ...,
        [0.0263, 0.0028, 0.0406,  ..., 0.0029, 0.0255, 0.0028],
        [0.0120, 0.0463, 0.0196,  ..., 0.0457, 0.0152, 0.0049],
        [0.0204, 0.0249, 0.0179,  ..., 0.0298, 0.0272, 0.0146]])
item_factors.weight tensor([[0.0165, 0.0042, 0.0128,  ..., 0.0264, 0.0118, 0.0021],
        [0.0439, 0.0125, 0.0195,  ..., 0.0148, 0.0159, 0.0202],
        [0.0395, 0.0045, 0.0417,  ..., 0.0190, 0.0087, 0.0249],
        ...,
        [0.0240, 0.0102, 0.0493,  ..., 0.0212, 0.0299, 0.0077],
        [0.0416, 0.0208, 0.0116,  ..., 0.0054, 0.0469, 0.0114],
        [0.0102, 0.0015, 0.0297,  ..., 0.0379, 0.0104, 0.0120]])


In [10]:
# Train for num_epochs passes over the ratings, printing the mean batch loss per epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below
        # must run on CPU too; otherwise no training happens without a GPU and
        # the epoch average divides by zero.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Ratings are cast to float32 to match the model output dtype.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for it in tqdm(range(num_epochs)):


  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 11.071438737932196
iter #1 Loss: 4.743874272719252
iter #2 Loss: 2.4724724450087185
iter #3 Loss: 1.7205757672714097
iter #4 Loss: 1.345485533025059
iter #5 Loss: 1.1281819902549541
iter #6 Loss: 0.9914557439119077
iter #7 Loss: 0.9001632616453364
iter #8 Loss: 0.8373857118227155
iter #9 Loss: 0.7923772527935541
iter #10 Loss: 0.7593227410830822
iter #11 Loss: 0.7347372560029103
iter #12 Loss: 0.7159848694647024
iter #13 Loss: 0.7016315802477943
iter #14 Loss: 0.6904978406081345
iter #15 Loss: 0.6819261388503356
iter #16 Loss: 0.67459446978448
iter #17 Loss: 0.6700552663373464
iter #18 Loss: 0.6656994680084553
iter #19 Loss: 0.6629660759631752
iter #20 Loss: 0.6603069994882279
iter #21 Loss: 0.6587062258330093
iter #22 Loss: 0.6577252529296779
iter #23 Loss: 0.6565811212368423
iter #24 Loss: 0.655928984473502
iter #25 Loss: 0.6546892456067395
iter #26 Loss: 0.6542250948206423
iter #27 Loss: 0.6534330811597369
iter #28 Loss: 0.6523407373558446
iter #29 Loss: 0.651117366015

In [11]:
# After training, the model holds tuned latent factors for users and movies.
# Print every trainable parameter and keep the first one in `uw` and any
# later one in `iw` (the last such parameter wins).
c = 0
uw = 0
iw = 0
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)
    if c != 0:
        iw = param.data
    else:
        uw = param.data
        c += 1

user_factors.weight tensor([[ 1.0737,  1.7968,  0.7281,  ...,  0.6339,  1.1770,  1.3690],
        [-0.2238,  1.6044,  0.7539,  ...,  2.6149,  0.0746,  1.4228],
        [-0.2924,  1.2684,  2.6733,  ...,  1.5580,  1.3260,  0.5337],
        ...,
        [ 1.4336, -0.1895,  1.2310,  ...,  0.5962,  1.9303,  0.5761],
        [ 1.3056,  0.6545,  1.2273,  ...,  1.3695,  0.9514,  0.8594],
        [ 0.3330,  1.2301,  0.5134,  ...,  1.4054,  1.2295,  1.6671]],
       device='cuda:0')
item_factors.weight tensor([[0.5423, 0.6910, 0.3520,  ..., 0.4242, 0.3032, 0.6180],
        [0.1935, 0.4324, 0.6437,  ..., 0.8284, 0.1717, 0.4360],
        [0.2246, 0.3878, 0.3560,  ..., 0.6139, 0.3850, 0.8133],
        ...,
        [0.3402, 0.3262, 0.3711,  ..., 0.3367, 0.3454, 0.3226],
        [0.4201, 0.3929, 0.3822,  ..., 0.3761, 0.4177, 0.3825],
        [0.3762, 0.3712, 0.3793,  ..., 0.4070, 0.3806, 0.3812]],
       device='cuda:0')


In [12]:
# Pull the learned item-embedding weight matrix out of the model as a NumPy
# array (moved to the CPU first, since the model may live on the GPU).
trained_movie_embeddings = model.item_factors.weight.data.cpu().numpy()

In [13]:
len(trained_movie_embeddings) # number of embedding rows, i.e. one factor vector per unique movie

9724

In [14]:
from sklearn.cluster import KMeans
# Fit 10 clusters on the movie embedding vectors (one row per movie).
# random_state is fixed so the clustering is repeatable for the same embeddings.
kmeans = KMeans(n_clusters=10, random_state=0).fit(trained_movie_embeddings)



In [15]:
# It can be seen here that the movies that are in the same cluster tend to have
# similar genres. Also note that the algorithm is unfamiliar with the movie name
# and only obtained the relationships by looking at the numbers representing how
# users have responded to the movie selections.

# How many titles to list per cluster.
TOP_N = 10

# Count ratings per movie once up front, instead of filtering the whole
# ratings table again for every single movie.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    movs = []
    # kmeans.labels_ is aligned with the embedding rows, so these positions are
    # the dataset's contiguous movie indices.
    for movidx in np.where(kmeans.labels_ == cluster)[0]:
        movid = train_set.idx2movieid[movidx]
        movs.append((movie_names[movid], rating_counts[movid]))
    # Show the most-rated movies of the cluster first.
    for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:TOP_N]:
        print("\t", mov[0])

Cluster #0
	 Apollo 13 (1995)
	 Mask, The (1994)
	 Babe (1995)
	 Casablanca (1942)
	 Avatar (2009)
	 Back to the Future Part III (1990)
	 Sin City (2005)
	 Snow White and the Seven Dwarfs (1937)
	 What's Eating Gilbert Grape (1993)
	 40-Year-Old Virgin, The (2005)
Cluster #1
	 Jurassic Park (1993)
	 Toy Story (1995)
	 Fugitive, The (1993)
	 Speed (1994)
	 Gladiator (2000)
	 Shrek (2001)
	 Men in Black (a.k.a. MIB) (1997)
	 Dances with Wolves (1990)
	 Pirates of the Caribbean: The Curse of the Black Pearl (2003)
	 Beauty and the Beast (1991)
Cluster #2
	 Independence Day (a.k.a. ID4) (1996)
	 Twister (1996)
	 Net, The (1995)
	 Jumanji (1995)
	 American Pie (1999)
	 Outbreak (1995)
	 Cliffhanger (1993)
	 Happy Gilmore (1996)
	 While You Were Sleeping (1995)
	 Matrix Reloaded, The (2003)
Cluster #3
	 Godzilla (1998)
	 Buffy the Vampire Slayer (1992)
	 Joe Dirt (2001)
	 Speed 2: Cruise Control (1997)
	 Grease 2 (1982)
	 Rocky V (1990)
	 Karate Kid, Part III, The (1989)
	 Rambo III (1988)
	