In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch import nn, optim

# Load dataset
df = pd.read_csv('../../data/lens_tmdb/ratings_small.csv')

In [8]:
# Use the LabelEncoder from sklearn to encode the movie and user IDs
user_enc = LabelEncoder()
df['userId'] = user_enc.fit_transform(df['userId'].values)
n_users = df['userId'].nunique()

item_enc = LabelEncoder()
df['movieId'] = item_enc.fit_transform(df['movieId'].values)
n_movies = df['movieId'].nunique()


In [9]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

train_df = torch.tensor(train_df.values, dtype=torch.float32)
val_df = torch.tensor(val_df.values, dtype=torch.float32)


In [10]:
class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=20):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)

    def forward(self, user, movie):
        return (self.user_factors(user) * self.movie_factors(movie)).sum(1)


In [15]:
import torch.nn.functional as F


def train(model, optimizer, data_loader):
    model.train()
    total_loss = 0
    for data in data_loader:
        user = data[:, 0].long()
        movie = data[:, 1].long()
        rating = data[:, 2].view(-1, 1)

        # Reset the gradients to zero
        optimizer.zero_grad()

        # Forward pass
        output = model(user, movie)
        loss = F.mse_loss(output, rating)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


def validate(model, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for data in data_loader:
            user = data[:, 0].long()
            movie = data[:, 1].long()
            rating = data[:, 2].view(-1, 1)

            output = model(user, movie)
            loss = F.mse_loss(output, rating)

            total_loss += loss.item()

    return total_loss / len(data_loader)


In [16]:
model = MatrixFactorization(n_users, n_movies)
optimizer = optim.Adam(model.parameters(), lr=0.01)
train_loader = torch.utils.data.DataLoader(train_df, batch_size=1024, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_df, batch_size=1024)

for epoch in range(10):
    train_loss = train(model, optimizer, train_loader)
    val_loss = validate(model, val_loader)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


  loss = F.mse_loss(output, rating)
  loss = F.mse_loss(output, rating)
  loss = F.mse_loss(output, rating)
  loss = F.mse_loss(output, rating)


Epoch 1, Train Loss: 29.2176, Val Loss: 25.2156
Epoch 2, Train Loss: 20.1802, Val Loss: 21.3025
Epoch 3, Train Loss: 15.9003, Val Loss: 18.9560
Epoch 4, Train Loss: 13.0065, Val Loss: 16.7279
Epoch 5, Train Loss: 10.1000, Val Loss: 13.6848
Epoch 6, Train Loss: 6.9374, Val Loss: 10.3133
Epoch 7, Train Loss: 4.4825, Val Loss: 7.9190
Epoch 8, Train Loss: 3.1242, Val Loss: 6.4963
Epoch 9, Train Loss: 2.4153, Val Loss: 5.6132
Epoch 10, Train Loss: 2.0221, Val Loss: 5.0168


In [17]:
def validate_metrics(model, data_loader):
    model.eval()
    total_mse_loss = 0
    total_mae_loss = 0
    total_count = 0
    with torch.no_grad():
        for data in data_loader:
            user = data[:, 0].long()
            movie = data[:, 1].long()
            rating = data[:, 2].view(-1, 1)

            output = model(user, movie)

            mse_loss = F.mse_loss(output, rating, reduction='sum')
            mae_loss = F.l1_loss(output, rating, reduction='sum')

            total_mse_loss += mse_loss.item()
            total_mae_loss += mae_loss.item()
            total_count += data.size(0)

    mean_mse_loss = total_mse_loss / total_count
    mean_mae_loss = total_mae_loss / total_count

    rmse = np.sqrt(mean_mse_loss)
    mae = mean_mae_loss

    return rmse, mae


In [20]:
for epoch in range(100):
    train_loss = train(model, optimizer, train_loader)
    rmse, mae = validate_metrics(model, val_loader)
    print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val RMSE: {rmse:.4f}, Val MAE: {mae:.4f}")


  loss = F.mse_loss(output, rating)
  loss = F.mse_loss(output, rating)
  mse_loss = F.mse_loss(output, rating, reduction='sum')
  mae_loss = F.l1_loss(output, rating, reduction='sum')
  mse_loss = F.mse_loss(output, rating, reduction='sum')
  mae_loss = F.l1_loss(output, rating, reduction='sum')


Epoch 1, Train Loss: 1.1893, Val RMSE: 54.7467, Val MAE: 1233.4891
Epoch 2, Train Loss: 1.1798, Val RMSE: 54.3114, Val MAE: 1223.0815
Epoch 3, Train Loss: 1.1736, Val RMSE: 53.9109, Val MAE: 1212.8535
Epoch 4, Train Loss: 1.1644, Val RMSE: 53.5469, Val MAE: 1203.7259
Epoch 5, Train Loss: 1.1581, Val RMSE: 53.2244, Val MAE: 1194.0782
Epoch 6, Train Loss: 1.1555, Val RMSE: 52.9234, Val MAE: 1187.0155
Epoch 7, Train Loss: 1.1499, Val RMSE: 52.6812, Val MAE: 1180.4386
Epoch 8, Train Loss: 1.1487, Val RMSE: 52.4161, Val MAE: 1174.7979
Epoch 9, Train Loss: 1.1454, Val RMSE: 52.2127, Val MAE: 1169.1425
Epoch 10, Train Loss: 1.1436, Val RMSE: 52.0071, Val MAE: 1164.2657
Epoch 11, Train Loss: 1.1373, Val RMSE: 51.8290, Val MAE: 1158.2164
Epoch 12, Train Loss: 1.1389, Val RMSE: 51.6682, Val MAE: 1155.7637
Epoch 13, Train Loss: 1.1397, Val RMSE: 51.5033, Val MAE: 1150.7303
Epoch 14, Train Loss: 1.1351, Val RMSE: 51.3652, Val MAE: 1148.0681
Epoch 15, Train Loss: 1.1362, Val RMSE: 51.2347, Val MAE: