<a href="https://colab.research.google.com/github/Vigneshmanikandan98/Movie-Recommendation-System/blob/main/Movie_Recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data Citation:
# F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on
# Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0   585k      0  0:00:01  0:00:01 --:--:--  585k


In [None]:
import zipfile
# Unpack every member of the downloaded archive into the local 'data' directory.
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [None]:
# import the dataset
import pandas as pd
# Read the movie catalogue and the ratings table from the extracted archive.
movies_df, ratings_df = (
    pd.read_csv('data/ml-latest-small/movies.csv'),
    pd.read_csv('data/ml-latest-small/ratings.csv'),
)

In [None]:
# Report (rows, columns) for both tables in a single print call.
print(
    'The dimensions of movies dataframe are:', movies_df.shape,
    '\nThe dimensions of ratings dataframe are:', ratings_df.shape,
)

The dimensions of movies dataframe are: (9742, 3) 
The dimensions of ratings dataframe are: (100836, 4)


In [None]:
# Preview the first five rows of the movie catalogue
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first five rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Lookup table: movieId -> title
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users and movies that actually appear in the ratings table
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size
print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)

# Size of the dense user x movie matrix versus the number of observed ratings
matrix_cells = n_users*n_items
print("The full rating matrix will have:", matrix_cells, 'elements.')
print('----------')
print("Number of ratings:", len(ratings_df))
print("Therefore: ", len(ratings_df) / matrix_cells * 100, '% of the matrix is filled.')

Number of unique users: 610
Number of unique movies: 9724
The full rating matrix will have: 5931640 elements.
----------
Number of ratings: 100836
Therefore:  1.6999683055613624 % of the matrix is filled.


In [None]:
import torch
import numpy as np
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm

class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization model: a rating is the dot product of two embeddings.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: length of each embedding vector (default 20).
    """

    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        # create user embeddings
        self.user_factors = torch.nn.Embedding(n_users, n_factors)
        # create item embeddings
        self.item_factors = torch.nn.Embedding(n_items, n_factors)
        # Overwrite the default embedding init with small positive values in [0, 0.05).
        self.user_factors.weight.data.uniform_(0, 0.05)
        self.item_factors.weight.data.uniform_(0, 0.05)

    def forward(self, data):
        """Score a batch of (user index, item index) pairs.

        Args:
            data: 2-D integer tensor; column 0 holds user indices and
                column 1 holds item indices.

        Returns:
            1-D tensor with one predicted score per row of ``data``.
        """
        users, items = data[:,0], data[:,1]
        # Element-wise product summed over the factor axis == row-wise dot product.
        return (self.user_factors(users)*self.item_factors(items)).sum(1)

    def predict(self, user, item):
        """Score user/item index pairs given as two separate arguments.

        The original implementation forwarded ``(user, item)`` straight to
        ``forward``, which only accepts a single ``data`` tensor and therefore
        always raised ``TypeError``. The indices are now packed into the
        two-column layout ``forward`` expects.

        Args:
            user: user index or indices (int, sequence, or tensor).
            item: item index or indices, same length as ``user``.

        Returns:
            1-D tensor of predicted scores (length 1 for scalar inputs).
        """
        # Place the indices on the same device as the embedding tables.
        device = self.user_factors.weight.device
        users = torch.as_tensor(user, dtype=torch.long, device=device).reshape(-1)
        items = torch.as_tensor(item, dtype=torch.long, device=device).reshape(-1)
        return self.forward(torch.stack((users, items), dim=1))

In [None]:
# Creating the dataloader (necessary for PyTorch)
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

class Loader(Dataset):
    """Torch dataset of (user index, movie index) -> rating samples.

    Raw ``userId`` / ``movieId`` values are re-encoded as contiguous
    zero-based indices (in order of first appearance) so they can be used
    directly as embedding row numbers.

    Args:
        ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns. When omitted, the notebook-level ``ratings_df`` is used,
            which keeps the original ``Loader()`` call working.
    """

    def __init__(self, ratings=None):
        # Fall back to the notebook global for backward compatibility.
        source = ratings_df if ratings is None else ratings
        # Work on a copy so the caller's frame keeps its raw IDs.
        self.ratings = source.copy()

        # Extract all user IDs and movie IDs
        users = source['userId'].unique()
        movies = source['movieId'].unique()

        # Raw ID -> contiguous index
        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}

        # Contiguous index -> raw ID (inverse lookups)
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Replace the raw IDs in the copy with their contiguous indices.
        self.ratings['movieId'] = source['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = source['userId'].map(self.userid2idx)

        # Select the feature columns by name so the layout is always
        # [user index, movie index], regardless of column order or of which
        # extra columns (e.g. timestamp) the frame carries.
        features = self.ratings[['userId', 'movieId']].values
        targets = self.ratings['rating'].values
        # Transforms the data to tensors (ready for torch models.)
        self.x, self.y = torch.tensor(features), torch.tensor(targets)

    def __getitem__(self, index):
        """Return the ``(features, rating)`` pair at ``index``."""
        return (self.x[index], self.y[index])

    def __len__(self):
        """Return the number of rating rows."""
        return len(self.ratings)

In [None]:
# --- Training configuration ---
num_epochs = 128
cuda = torch.cuda.is_available()

print("Is running on GPU:", cuda)

# Factorization model with 8 latent factors per user and per movie.
model = MatrixFactorization(n_users, n_items, n_factors=8)
print(model)

# Show the initial values of every trainable tensor.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name, param.data)

# Move the model to the GPU when one is available.
model = model.cuda() if cuda else model

# Mean-squared-error objective
loss_fn = torch.nn.MSELoss()

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Training data, served in shuffled mini-batches of 128 samples
train_set = Loader()
train_loader = DataLoader(dataset=train_set, batch_size=128, shuffle=True)

Is running on GPU: True
MatrixFactorization(
  (user_factors): Embedding(610, 8)
  (item_factors): Embedding(9724, 8)
)
user_factors.weight tensor([[0.0113, 0.0490, 0.0059,  ..., 0.0466, 0.0288, 0.0363],
        [0.0429, 0.0443, 0.0231,  ..., 0.0466, 0.0053, 0.0321],
        [0.0369, 0.0491, 0.0149,  ..., 0.0498, 0.0132, 0.0177],
        ...,
        [0.0327, 0.0286, 0.0466,  ..., 0.0220, 0.0048, 0.0377],
        [0.0061, 0.0343, 0.0133,  ..., 0.0007, 0.0416, 0.0437],
        [0.0019, 0.0153, 0.0040,  ..., 0.0155, 0.0427, 0.0244]])
item_factors.weight tensor([[0.0396, 0.0033, 0.0470,  ..., 0.0472, 0.0312, 0.0138],
        [0.0406, 0.0140, 0.0185,  ..., 0.0437, 0.0317, 0.0488],
        [0.0365, 0.0248, 0.0419,  ..., 0.0010, 0.0312, 0.0467],
        ...,
        [0.0484, 0.0051, 0.0082,  ..., 0.0009, 0.0017, 0.0400],
        [0.0485, 0.0328, 0.0127,  ..., 0.0209, 0.0391, 0.0002],
        [0.0352, 0.0238, 0.0047,  ..., 0.0448, 0.0387, 0.0018]])


In [24]:
from tqdm.notebook import tqdm

# Train for num_epochs passes over train_loader, printing the mean batch loss per epoch.
for it in tqdm(range(num_epochs)):
    losses = []
    for x, y in train_loader:
        # Only the device transfer depends on CUDA. The optimisation step below must
        # run on CPU as well; it used to be nested under this check, so a CPU-only
        # run trained nothing and then divided by zero when averaging the losses.
        if cuda:
            x, y = x.cuda(), y.cuda()
        optimizer.zero_grad()
        outputs = model(x)
        # Cast the targets to float32 before computing the loss.
        loss = loss_fn(outputs.squeeze(), y.type(torch.float32))
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    print("iter #{}".format(it), "Loss:", sum(losses) / len(losses))

  0%|          | 0/128 [00:00<?, ?it/s]

iter #0 Loss: 0.3287375989526056
iter #1 Loss: 0.32798080957556136
iter #2 Loss: 0.3274794187642596
iter #3 Loss: 0.32696165959468954
iter #4 Loss: 0.32644141998690396
iter #5 Loss: 0.3260871819811424
iter #6 Loss: 0.3257474008058836
iter #7 Loss: 0.325338960847425
iter #8 Loss: 0.32486176224151236
iter #9 Loss: 0.32456033457505523
iter #10 Loss: 0.3240558014341115
iter #11 Loss: 0.32370511690145215
iter #12 Loss: 0.3233014867087911
iter #13 Loss: 0.32307795225288055
iter #14 Loss: 0.3226572422471446
iter #15 Loss: 0.3222209486524163
iter #16 Loss: 0.3219958616014059
iter #17 Loss: 0.3217149246011289
iter #18 Loss: 0.3214298426953669
iter #19 Loss: 0.32103589022068807
iter #20 Loss: 0.32082160274843274
iter #21 Loss: 0.3203890883567067
iter #22 Loss: 0.32023823618661934
iter #23 Loss: 0.319795006403917
iter #24 Loss: 0.3195263454075997
iter #25 Loss: 0.31922972572939046
iter #26 Loss: 0.3189954405584311
iter #27 Loss: 0.31853061012432055
iter #28 Loss: 0.3182943702349203
iter #29 Loss:

In [25]:
# By training the model, we will have tuned latent factors for movies and users.
# Print every trainable tensor of the trained model.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)

# Fetch the learned factors by attribute name. The previous counter-based loop
# depended on the order of named_parameters() and would have bound `iw` to whichever
# parameter came last if the model ever gained another tensor.
uw = model.user_factors.weight.data
iw = model.item_factors.weight.data

user_factors.weight tensor([[ 1.0109,  1.1406,  0.8939,  ...,  1.4065,  1.4607,  1.6050],
        [ 2.3191,  0.0363,  0.9456,  ...,  0.6386,  1.6331,  0.3521],
        [ 2.1766,  3.0098,  0.9442,  ...,  1.0244, -1.1365, -2.4734],
        ...,
        [ 2.0447,  0.8554,  1.3550,  ...,  0.3488,  0.1363,  1.5179],
        [ 1.5290,  0.5399,  0.6425,  ...,  1.3576,  1.2623,  1.1841],
        [ 1.0915,  1.4271,  1.1780,  ...,  0.6416,  0.6126,  1.4351]],
       device='cuda:0')
item_factors.weight tensor([[0.2312, 0.4509, 0.5444,  ..., 0.5988, 0.6530, 0.5244],
        [0.2156, 0.6003, 0.3239,  ..., 0.4338, 0.6990, 0.1447],
        [0.5114, 0.6834, 0.5852,  ..., 0.6342, 0.1578, 0.6015],
        ...,
        [0.4126, 0.3671, 0.3693,  ..., 0.3436, 0.3633, 0.4016],
        [0.4515, 0.4349, 0.4141,  ..., 0.4358, 0.4380, 0.4035],
        [0.4470, 0.4350, 0.4168,  ..., 0.4535, 0.4496, 0.4118]],
       device='cuda:0')


In [26]:
# Copy the learned item-factor table to host memory as a NumPy array.
item_weights = model.item_factors.weight
trained_movie_embeddings = item_weights.data.cpu().numpy()

In [27]:
# Number of rows in the embedding array (one per unique movie)
trained_movie_embeddings.shape[0]

9724

In [28]:
from sklearn.cluster import KMeans

# Group the movie embeddings into 10 clusters; the fixed seed keeps the run repeatable.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(trained_movie_embeddings)

In [29]:
# Count ratings per movie once. The previous version re-filtered the whole
# ratings table for every single movie, which is quadratic in practice.
rating_counts = ratings_df['movieId'].value_counts().to_dict()

# For each cluster, list its ten most-rated movies.
for cluster in range(kmeans.n_clusters):
  print("Cluster #{}".format(cluster))
  movs = []
  for movidx in np.where(kmeans.labels_ == cluster)[0]:
    # Embedding row index -> original MovieLens movieId
    movid = train_set.idx2movieid[movidx]
    movs.append((movie_names[movid], rating_counts[movid]))
  for mov in sorted(movs, key=lambda tup: tup[1], reverse=True)[:10]:
    print("\t", mov[0])

Cluster #0
	 Braveheart (1995)
	 Happy Gilmore (1996)
	 Back to the Future Part II (1989)
	 Top Gun (1986)
	 Patriot, The (2000)
	 Pocahontas (1995)
	 Tombstone (1993)
	 Crow, The (1994)
	 Predator (1987)
	 Beverly Hills Cop III (1994)
Cluster #1
	 Batman Forever (1995)
	 Cliffhanger (1993)
	 Broken Arrow (1996)
	 Nutty Professor, The (1996)
	 Santa Clause, The (1994)
	 Mission: Impossible II (2000)
	 Honey, I Shrunk the Kids (1989)
	 Lost World: Jurassic Park, The (1997)
	 Austin Powers in Goldmember (2002)
	 Coneheads (1993)
Cluster #2
	 Forrest Gump (1994)
	 Shawshank Redemption, The (1994)
	 Pulp Fiction (1994)
	 Silence of the Lambs, The (1991)
	 Matrix, The (1999)
	 Star Wars: Episode IV - A New Hope (1977)
	 Jurassic Park (1993)
	 Terminator 2: Judgment Day (1991)
	 Schindler's List (1993)
	 Fight Club (1999)
Cluster #3
	 Net, The (1995)
	 Grease (1978)
	 Sting, The (1973)
	 Casper (1995)
	 Pinocchio (1940)
	 Hot Shots! Part Deux (1993)
	 Mask of Zorro, The (1998)
	 Sabrina (199