# Movie Ratings Recommendation using Matrix Factorization

In [30]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import pandas as pd
import numpy as np

### Loading Dataset

In [31]:
data = pd.read_csv("./movie_ratings_dataset/ratings.csv")

In [32]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [33]:
data['userId'] = data['userId'] - 1
data['movieId'] = data['movieId'] - 1

In [34]:
data.drop_duplicates(inplace=True)

In [35]:
data.dropna(inplace=True)

In [36]:
def fillID(df, col):
    m = dict({-1:-1})
    new_col = []
    for value in df[col].values:
        if value in m:
            new_col.append(m[value])
        else:
            new_index = max(m.values()) + 1
            m[value] = new_index
            new_col.append(new_index)
    df[col+"Index"] = new_col
    return

In [37]:
fillID(data, 'userId')
fillID(data, 'movieId')

In [38]:
def normalize(data):
    """
    Normalize the data to the range [0, 1].
    
    Parameters:
    data (numpy array): Input data to be normalized.
    
    Returns:
    normalized_data (numpy array): Normalized data.
    """
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

In [39]:
# data['rating] = normalize(data['rating'].values)

### Train/Test Splitting

In [40]:
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8 # 80% Train, 20% Test
train = data[msk].copy()
validation = data[~msk].copy()

In [41]:
len(data), len(train), len(validation)

(100836, 80450, 20386)

In [42]:
train.columns

Index(['userId', 'movieId', 'rating', 'timestamp', 'userIdIndex',
       'movieIdIndex'],
      dtype='object')

In [43]:
num_users = data['userIdIndex'].nunique()
num_movies = data['movieIdIndex'].nunique()

In [44]:
print(num_users, num_movies)

610 9724


In [45]:
train.head()

Unnamed: 0,userId,movieId,rating,timestamp,userIdIndex,movieIdIndex
0,0,0,4.0,964982703,0,0
1,0,2,4.0,964981247,0,1
2,0,5,4.0,964982224,0,2
3,0,46,5.0,964983815,0,3
6,0,100,5.0,964980868,0,6


### Dataset and Dataloader

In [52]:
batch_size = 5000

train_features = torch.LongTensor(train[['userIdIndex', 'movieIdIndex']].values)
train_target = torch.Tensor(train[['rating']].values).float()

train_ds = TensorDataset(train_features, train_target)
dl_train = DataLoader(train_ds, batch_size, shuffle=True, num_workers=4)

val_features = torch.LongTensor(validation[['userIdIndex', 'movieIdIndex']].values)
val_target = torch.Tensor(validation[['rating']].values).float()

val_ds = TensorDataset(val_features, val_target)
dl_val = DataLoader(train_ds, batch_size, shuffle=True, num_workers=4)

In [53]:
xb, yb = next(iter(dl_train))
print(xb)
print(yb)

tensor([[ 605,  217],
        [  27, 1140],
        [ 605,  142],
        ...,
        [ 433,  915],
        [ 437,  873],
        [ 291, 1236]])
tensor([[3.5000],
        [3.5000],
        [2.5000],
        ...,
        [3.5000],
        [4.0000],
        [3.5000]])


### Matrix Factorization Model

In [54]:
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, emb_size)
        self.item_embedding = nn.Embedding(num_items, emb_size)
        
        # Initializing Embedding Matrices
        self.user_embedding.weight.data.uniform_(0, 0.05)
        self.item_embedding.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_embedding(u)
        v = self.item_embedding(v)
        return (u*v).sum(1)   

### Training Loop

In [73]:
def test(model):
    model.eval()
    total_loss = []
    for indices, ratings in dl_val:
        ratings = ratings.squeeze(1)
        users, items = indices[:,0], indices[:,1]
        output = model(users, items)
        loss = F.mse_loss(output, ratings)
        total_loss.append(loss.item())
    
    print(f"Test Loss: {sum(total_loss) / len(total_loss)}")
    
    
def trainer(model, num_epochs, lr=0.05):
    # Init optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    
    # Set model in Training mode
    model.train()
    
    # Start training loop
    for epoch in range(num_epochs):
        total_loss = []
        for indices, ratings in dl_train:
            ratings = ratings.squeeze(1)
            users, items = indices[:,0], indices[:,1]
            # Compute model output
            output = model(users, items)
            # Compute Loss
            loss = F.mse_loss(output, ratings)

            # Update model weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss.append(loss.item())
        
        # Print Error
        print(f"{epoch}/{num_epochs} - Loss: {sum(total_loss)/len(total_loss)}")
        
        # Test model at regular intervals
        if epoch % 10 == 0:
            # Test model
            test(model)
        

### Initialize Model

In [74]:
model = MatrixFactorization(num_users, num_movies, emb_size=100)

### Training

In [68]:
trainer(model, num_epochs=10)

0/10 - Loss: 4.208172657910516
Test Loss: 1.6839283564511467
1/10 - Loss: 1.56242457558127
2/10 - Loss: 1.1174142641179703
3/10 - Loss: 0.9379285048036015
4/10 - Loss: 0.8116016598308787
5/10 - Loss: 0.637713611125946
6/10 - Loss: 0.5123480242841384
7/10 - Loss: 0.4736304546103758
8/10 - Loss: 0.431275874376297
9/10 - Loss: 0.39761781166581545


In [69]:
test(model)

Test Loss: 0.3274755968767054


### Sanity Checks

In [70]:
indices, ratings = next(iter(dl_val))
ratings = ratings.squeeze(1)
users, items = indices[:,0], indices[:,1]
output = model(users, items)
loss = F.mse_loss(output, ratings)
print("Loss: ",loss.item())

Loss:  0.34479594230651855


In [71]:
ratings

tensor([2.5000, 2.5000, 3.0000,  ..., 3.0000, 5.0000, 2.0000])

In [72]:
output

tensor([2.7080, 2.4294, 2.4730,  ..., 3.3969, 5.1638, 1.7656],
       grad_fn=<SumBackward1>)