# Matrix Factorization
Dataset: MovieLens 32M (`ml-32m`)

In [None]:
import os
import pandas as pd
import torch 
import torch.nn as nn
import tqdm
import torch.optim as optim

In [None]:
# NOTE: data_path is a relative path, so it is resolved against the
# notebook's current working directory
# root folder of the dataset
data_path = 'data/ml-32m/'

# one csv path per table, each located directly under data_path
links_path, movies_path, ratings_path, tags_path = (
    os.path.join(data_path, csv_name)
    for csv_name in ("links.csv", "movies.csv", "ratings.csv", "tags.csv")
)

# load each table into its own DataFrame
links_pd = pd.read_csv(links_path)  # metadata stuff
movies_pd = pd.read_csv(movies_path)
ratings_pd = pd.read_csv(ratings_path)
tags_pd = pd.read_csv(tags_path)

In [3]:
# preview the first rows of the movies table (movieId, title, genres)
movies_pd.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Process the data
# nn.Embedding only accepts indices in [0, num_embeddings). The raw ids are
# arbitrary labels, not dense 0-based indices (e.g. the tags table contains
# movieId 247150 although there are only 87585 movies), so map every id to a
# contiguous index first.
# user_ids[k] / movie_ids[k] give the original id behind index k, which is
# needed to translate an embedding row back to a real user or movie.
user_codes, user_ids = pd.factorize(ratings_pd['userId'], sort=True)
movie_codes, movie_ids = pd.factorize(ratings_pd['movieId'], sort=True)

user_tensor = torch.tensor(user_codes, dtype=torch.long)
movie_tensor = torch.tensor(movie_codes, dtype=torch.long)
rating_tensor = torch.tensor(ratings_pd['rating'].values, dtype=torch.float)

# keep the preview as the last expression so the cell actually displays it
ratings_pd.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
31031773,194846,21,3.0,1317133871
3410364,21447,333,3.5,1167344576
12471492,78176,208,4.0,836513594
21513865,134540,4518,4.0,1162755657
16663283,104458,7143,4.0,1559342598
6455253,40281,81834,5.0,1676582294
22498996,140924,5005,0.5,1196696049
13687815,85549,5679,2.5,1148315002
27621665,173280,1663,4.5,1276306732
5888833,36765,2762,4.5,1509570663


In [10]:
# report how many distinct users rated something and how many distinct movies are listed
print(f"{len(ratings_pd['userId'].unique())} users and {len(movies_pd['movieId'].unique())} movies")


200948 users and 87585 movies


In [6]:
# preview the first rows of the tags table (userId, movieId, tag, timestamp)
tags_pd.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,Kevin Kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


So we have around 200K users and around 90K movies. A dense matrix with one entry per (user, movie) pair would have roughly $2 \times 10^{10}$ elements, which is far more than we can hold in memory. On top of that, the matrix would be highly sparse: most (user, movie) pairs have no rating. How can we deal with that? Answer: we don't actually have to construct the matrix; we only need to learn an embedding for each user and each item.
# What are we trying to predict?
We try to predict the unknown rating $r_{u,i}$ that user $u$ gives to item $i$.
# What metric are we trying to optimize?
We actually try to minimize the difference between the predicted ratings and the actual ratings, plus some regularization.

In [None]:
# Wrap the three aligned tensors in a PyTorch dataset: sample k is the
# triple (user index, movie index, rating) taken from row k of each tensor.
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(
    user_tensor,    # who rated
    movie_tensor,   # what was rated
    rating_tensor,  # the rating value (regression target)
)

In [None]:
# our tiny recommendation system
class tiny_recsys(nn.Module):
    """Matrix-factorization recommender: a rating is the dot product of a
    learned user embedding and a learned movie embedding.

    Args:
        n_users: number of rows in the user embedding table; every user
            index passed to ``forward`` must be in ``[0, n_users)``.
        n_movies: number of rows in the movie embedding table; every movie
            index passed to ``forward`` must be in ``[0, n_movies)``.
        embd_dimension: size of each embedding vector.

    NOTE(review): both tables use ``max_norm=1``, so every looked-up vector
    has L2 norm <= 1 and the predicted score is therefore bounded to
    [-1, 1]. If the targets are raw ratings on a wider scale, consider
    rescaling the targets or adding bias terms.
    """

    def __init__(self, n_users,n_movies,embd_dimension):
        # nn.Module must be initialised before any sub-module is assigned,
        # otherwise the attribute assignment below raises.
        super().__init__()
        self.users_embeddings = nn.Embedding(n_users, embd_dimension, max_norm=1)
        self.movies_embeddings = nn.Embedding(n_movies, embd_dimension, max_norm=1)

    def forward(self, users_index, movies_index):
        """Return one predicted score per (user, movie) pair.

        Args:
            users_index: integer tensor of user indices, shape (batch_size,).
            movies_index: integer tensor of movie indices, shape (batch_size,).

        Returns:
            Float tensor of shape (batch_size,) with the dot product of each
            user embedding with the matching movie embedding.
        """
        # nn.Embedding is a module: look rows up by calling it, not by subscripting it
        users = self.users_embeddings(users_index) # shape: (batch_size, embd_dimension)
        movies = self.movies_embeddings(movies_index) # shape: (batch_size, embd_dimension)
        rankings = (users * movies).sum(dim=1) # elementwise multiply and sum over embd_dim
        return rankings


In [15]:
# compute number of workers to use locally: half of the available cores,
# capped at 4. os.cpu_count() returns None when the core count cannot be
# determined, so fall back to 1 (-> 0 workers, i.e. load in the main process).
num_workers = min(4, (os.cpu_count() or 1) // 2)
print(num_workers)


4


In [None]:
# training loop
EMBEDDING_DIM = 32
NUM_EPOCHS = 10
BATCH_SIZE = 1024
LEARNING_RATE = 0.01

# Size each embedding table from the largest index that will actually be
# looked up. Counting unique ids is not enough when the ids are not a dense
# 0..n-1 range: any index >= the count would be out of range for nn.Embedding.
n_users = int(user_tensor.max()) + 1
n_movies = int(movie_tensor.max()) + 1

model = tiny_recsys(n_users, n_movies, EMBEDDING_DIM)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.MSELoss()
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=num_workers)

model.train()
for epoch in range(NUM_EPOCHS):
    # accumulate the sample-weighted loss so the epoch mean is exact even
    # when the last batch is smaller than BATCH_SIZE
    running_loss = 0.0
    n_seen = 0

    progress = tqdm.tqdm(dataloader, desc=f"epoch {epoch + 1}/{NUM_EPOCHS}")
    for user_batch, movie_batch, rating_batch in progress:
        optimizer.zero_grad()
        preds = model(user_batch, movie_batch)
        loss = loss_fn(preds, rating_batch)
        loss.backward()
        optimizer.step()

        batch_len = rating_batch.size(0)
        running_loss += loss.item() * batch_len
        n_seen += batch_len

    print(f"epoch {epoch + 1}: mean train MSE = {running_loss / max(n_seen, 1):.4f}")

