In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn, optim

In [2]:
# ratings = pd.read_csv('../../data/lens_tmdb/ratings.csv')
ratings = pd.read_csv('../../data/lens_tmdb/ratings_small.csv')

In [3]:
ratings.shape

(100004, 4)

In [4]:
# Convert userId and movieId to categorical
ratings['userId'] = ratings['userId'].astype('category')
ratings['movieId'] = ratings['movieId'].astype('category')

# Define the number of users and movies
num_users = ratings['userId'].nunique()
num_movies = ratings['movieId'].nunique()

# Create a map for user & movie to categorical codes
user_map = {i: user_code for i, user_code in enumerate(ratings['userId'].cat.categories)}
movie_map = {i: movie_code for i, movie_code in enumerate(ratings['movieId'].cat.categories)}

In [5]:
# Train test split
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

In [11]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, hidden_layers=[100], dropout=False):
        super(NCF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.hidden_layers = nn.ModuleList()
        input_size = 2 * emb_size

        for hidden_layer_size in hidden_layers:
            self.hidden_layers.append(nn.Linear(input_size, hidden_layer_size))
            input_size = hidden_layer_size

        self.output_layer = nn.Linear(input_size, 1)
        self.dropout = dropout

    def forward(self, user_indices, item_indices):
        user_embedding = self.user_emb(user_indices)
        item_embedding = self.item_emb(item_indices)
        x = torch.cat([user_embedding, item_embedding], dim=-1)  # concatenate user and item embeddings

        for hidden_layer in self.hidden_layers:
            x = hidden_layer(x)
            x = torch.relu(x)
            if self.dropout:
                x = nn.Dropout()(x)

        x = self.output_layer(x)
        return 1 + 4*torch.sigmoid(x).view(-1)  # This scales the sigmoid output to the range [1, 5]


In [12]:
# Create the model
model = NCF(num_users=num_users, num_items=num_movies, emb_size=100, hidden_layers=[128, 64, 32], dropout=True)


# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [13]:
# Convert training data to tensors
train_user_tensor = torch.from_numpy(train_ratings['userId'].cat.codes.values).long()
train_movie_tensor = torch.from_numpy(train_ratings['movieId'].cat.codes.values).long()
train_rating_tensor = torch.from_numpy(train_ratings['rating'].values).float()

from tqdm import tqdm
# Training loop
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    prediction = model(train_user_tensor, train_movie_tensor)
    loss = criterion(prediction, train_rating_tensor)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 1 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss.item()))



Epoch [1/100], Loss: 1.7110
Epoch [2/100], Loss: 1.6497
Epoch [3/100], Loss: 1.5940
Epoch [4/100], Loss: 1.5433
Epoch [5/100], Loss: 1.4977
Epoch [6/100], Loss: 1.4577
Epoch [7/100], Loss: 1.4147
Epoch [8/100], Loss: 1.3782
Epoch [9/100], Loss: 1.3408
Epoch [10/100], Loss: 1.3075
Epoch [11/100], Loss: 1.2783
Epoch [12/100], Loss: 1.2481
Epoch [13/100], Loss: 1.2258
Epoch [14/100], Loss: 1.2109
Epoch [15/100], Loss: 1.1996
Epoch [16/100], Loss: 1.1901
Epoch [17/100], Loss: 1.1879
Epoch [18/100], Loss: 1.1850
Epoch [19/100], Loss: 1.1894
Epoch [20/100], Loss: 1.1859
Epoch [21/100], Loss: 1.1833
Epoch [22/100], Loss: 1.1755
Epoch [23/100], Loss: 1.1643
Epoch [24/100], Loss: 1.1583
Epoch [25/100], Loss: 1.1445
Epoch [26/100], Loss: 1.1372
Epoch [27/100], Loss: 1.1308
Epoch [28/100], Loss: 1.1208
Epoch [29/100], Loss: 1.1161
Epoch [30/100], Loss: 1.1078
Epoch [31/100], Loss: 1.1065
Epoch [32/100], Loss: 1.1046
Epoch [33/100], Loss: 1.0992
Epoch [34/100], Loss: 1.0965
Epoch [35/100], Loss: 1

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Convert test data to tensors
test_user_tensor = torch.from_numpy(test_ratings['userId'].cat.codes.values).long()
test_movie_tensor = torch.from_numpy(test_ratings['movieId'].cat.codes.values).long()
test_rating_tensor = torch.from_numpy(test_ratings['rating'].values).float()

# Get model's predictions on the test set
model.eval()  # Switch to evaluation mode
with torch.no_grad():
    predictions = model(test_user_tensor, test_movie_tensor)

# Convert predictions to a numpy array
predictions_np = predictions.numpy()

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_rating_tensor, predictions_np))
print(f'Test RMSE: {rmse}')

# Calculate MAE
mae = mean_absolute_error(test_rating_tensor, predictions_np)
print(f'Test MAE: {mae}')


Test RMSE: 0.9425989985466003
Test MAE: 0.7255163788795471
