In [11]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

from IPython import get_ipython # type: ignore
import os

torch.manual_seed(0)

# get the notebook name
ip = get_ipython()
path = None
if '__vsc_ipynb_file__' in ip.user_ns: # type: ignore
    path = ip.user_ns['__vsc_ipynb_file__'] # type: ignore

os.makedirs('models/', exist_ok=True)
model_file_name = f"models/{os.path.basename(path)[:-6]}.pt" # type: ignore

Data

In [12]:
movie_path = 'dataset/ml-latest-small/movies_with_wikidata_values.csv'
rating_path = 'dataset/ml-latest-small/ratings.csv'

ratings_df = pd.read_csv(rating_path)[["userId", "movieId", "rating"]]
movies_df = pd.read_csv(movie_path, index_col='movieId')

In [13]:
# One-hot encode the genres:
genres = movies_df['genres'].str.get_dummies('|').values
genres = torch.from_numpy(
    genres
    ).to(torch.float)

years = torch.nan_to_num(torch.from_numpy(
    movies_df['year'].values
    ).to(torch.float), nan=1990).unsqueeze(1)
box_office = torch.nan_to_num(torch.from_numpy(
    movies_df['boxOfficeWorldwide'].map(lambda x: x/1000000).values
    ).to(torch.float), nan=0).unsqueeze(1)
score = torch.from_numpy(
    movies_df['tommatometerScore'].map(lambda x: int(x[:-1] if isinstance(x, str) and len(x) > 1 else 50)).values
    ).to(torch.float).unsqueeze(1)
duration = torch.nan_to_num(torch.from_numpy(
    movies_df['duration'].values
    ).to(torch.float), nan=70).unsqueeze(1)

# Load the pre-trained sentence transformer model and encode the movie titles:
model = SentenceTransformer('all-MiniLM-L6-v2')
with torch.no_grad():
    titles = model.encode(movies_df['title'].tolist(), convert_to_tensor=True, show_progress_bar=True)
    titles = titles.cpu()

# Concatenate the genres and title features:
movie_features = torch.cat([genres, titles], dim=-1)
user_ids_tensor = torch.LongTensor(range(len(ratings_df['userId'].unique())))

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [14]:
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedUserId': pd.RangeIndex(len(unique_user_id))
    })

# Create a mapping from the movieId to a unique consecutive value in the range [0, num_movies]:
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedMovieId': pd.RangeIndex(len(unique_movie_id))
    })

# Merge the mappings with the original data frame:
ratings_df = ratings_df.merge(unique_user_id, on='userId')
ratings_df = ratings_df.merge(unique_movie_id, on='movieId')

ratings_df_train, ratings_df_test = train_test_split(ratings_df, test_size=0.2, random_state=42)
ratings_df_test, ratings_df_val = train_test_split(ratings_df_test, test_size=0.5, random_state=42)

In [15]:
def create_data(ratings_df):
    edge_index = torch.stack([
    torch.tensor(ratings_df['mappedUserId'].values),
    torch.tensor(ratings_df['mappedMovieId'].values)]
    , dim=0)

    assert edge_index.shape == (2, len(ratings_df))
    data = HeteroData()
    # Add the user nodes:
    data['user'].x = user_ids_tensor  # [num_users, num_features_users]
    # Add the movie nodes:
    data['movie'].x = movie_features  # [num_movies, num_features_movies]
    # Add the rating edges:
    data['user', 'rates', 'movie'].edge_index = edge_index  # [2, num_ratings]
    # Add the rating labels:
    rating = torch.from_numpy(ratings_df['rating'].values).to(torch.float)
    data['user', 'rates', 'movie'].edge_label = rating  # [num_ratings]

    # We also need to make sure to add the reverse edges from movies to users
    # in order to let a GNN be able to pass messages in both directions.
    # We can leverage the `T.ToUndirected()` transform for this from PyG:
    data = T.ToUndirected()(data)

    # With the above transformation we also got reversed labels for the edges.
    # We are going to remove them:
    del data['movie', 'rev_rates', 'user'].edge_label

    assert data['user'].num_nodes == len(unique_user_id)
    assert data['user', 'rates', 'movie'].num_edges == len(ratings_df)

    return data

In [16]:
train_data, val_data, test_data = create_data(ratings_df_train), create_data(ratings_df_val), create_data(ratings_df_test)

In [28]:
class GraphEmbedding(torch.nn.Module):
    def __init__(self, num_users, user_embedding_dim):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(num_users, user_embedding_dim)

    def forward(self, x_dict):
        x_dict['user'] = self.user_embedding(x_dict['user'])
        return x_dict

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels, user_dim):
        super().__init__()
        self.graph_embedding = GraphEmbedding(len(user_ids_tensor), user_dim)
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, train_data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        x_dict = self.graph_embedding(x_dict)
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model(hidden_channels=32, user_dim=100).to(device)
print(model)

Model(
  (graph_embedding): GraphEmbedding(
    (user_embedding): Embedding(610, 100)
  )
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv((-1, -1), 32, aggr=mean)
      (movie__rev_rates__user): SAGEConv((-1, -1), 32, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv((-1, -1), 32, aggr=mean)
      (movie__rev_rates__user): SAGEConv((-1, -1), 32, aggr=mean)
    )
  )
  (decoder): EdgeDecoder(
    (lin1): Linear(in_features=64, out_features=32, bias=True)
    (lin2): Linear(in_features=32, out_features=1, bias=True)
  )
)


In [29]:
# Initialize optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

# Define training function
def train(model):
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'movie'].edge_index)
    target = train_data['user', 'movie'].edge_label
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()
    return float(loss)

# Define test function
@torch.no_grad()
def test(model, data):
    data = data.to(device)
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

best_val_rmse = float('inf')
best_epoch = 0

# Training loop
for epoch in range(1, 201):
    train_data = train_data.to(device) # type: ignore
    loss = train(model)
    train_rmse = test(model, train_data)
    val_rmse = test(model, val_data)

    # Check if the current validation RMSE is the best
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        torch.save(model.state_dict(), model_file_name)
        print(f'Saving model with val_rmse: {val_rmse:.4f} at epoch {epoch}')

    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, Val: {val_rmse:.4f}')

Saving model with val_rmse: 3.2175 at epoch 1
Epoch: 001, Loss: 11.9625, Train: 3.2511, Val: 3.2175
Saving model with val_rmse: 2.9568 at epoch 2
Epoch: 002, Loss: 10.5696, Train: 3.0031, Val: 2.9568
Saving model with val_rmse: 2.5973 at epoch 3
Epoch: 003, Loss: 9.0185, Train: 2.6512, Val: 2.5973
Saving model with val_rmse: 2.0966 at epoch 4
Epoch: 004, Loss: 7.0288, Train: 2.1440, Val: 2.0966
Saving model with val_rmse: 1.5031 at epoch 5
Epoch: 005, Loss: 4.5967, Train: 1.4998, Val: 1.5031
Saving model with val_rmse: 1.3251 at epoch 6
Epoch: 006, Loss: 2.2493, Train: 1.2084, Val: 1.3251
Epoch: 007, Loss: 1.5012, Train: 1.6417, Val: 1.6373
Epoch: 008, Loss: 3.5295, Train: 1.6639, Val: 1.6568
Epoch: 009, Loss: 3.5577, Train: 1.4462, Val: 1.5085
Saving model with val_rmse: 1.2524 at epoch 10
Epoch: 010, Loss: 2.2284, Train: 1.1465, Val: 1.2524
Saving model with val_rmse: 1.1460 at epoch 11
Epoch: 011, Loss: 1.3199, Train: 1.1001, Val: 1.1460
Epoch: 012, Loss: 1.2103, Train: 1.2355, Val:

In [30]:
print(f'Loading model with val_rmse: {best_val_rmse:.4f} at epoch {best_epoch}')
model.load_state_dict(torch.load(model_file_name))

print("Test rmse:", test(model, test_data))

Loading model with val_rmse: 0.9662 at epoch 116
Test rmse: 0.9572569727897644
