In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

from IPython import get_ipython # type: ignore
import os

torch.manual_seed(0)

# get the notebook name
ip = get_ipython()
path = None
if '__vsc_ipynb_file__' in ip.user_ns: # type: ignore
    path = ip.user_ns['__vsc_ipynb_file__'] # type: ignore

os.makedirs('models/', exist_ok=True)
model_file_name = f"models/{os.path.basename(path)[:-6]}.pt" # type: ignore

Data

In [2]:
movie_path = 'dataset/ml-latest-small/movies_with_wikidata_values.csv'
rating_path = 'dataset/ml-latest-small/ratings.csv'

ratings_df = pd.read_csv(rating_path)[["userId", "movieId", "rating"]]
movies_df = pd.read_csv(movie_path, index_col='movieId')

In [3]:
# One-hot encode the genres:
genres = movies_df['genres'].str.get_dummies('|').values
genres = torch.from_numpy(
    genres
    ).to(torch.float)

years = torch.nan_to_num(torch.from_numpy(
    movies_df['year'].values
    ).to(torch.float), nan=1990).unsqueeze(1)
box_office = torch.nan_to_num(torch.from_numpy(
    movies_df['boxOfficeWorldwide'].map(lambda x: x/1000000).values
    ).to(torch.float), nan=0).unsqueeze(1)
score = torch.from_numpy(
    movies_df['tommatometerScore'].map(lambda x: int(x[:-1] if isinstance(x, str) and len(x) > 1 else 50)).values
    ).to(torch.float).unsqueeze(1)
duration = torch.nan_to_num(torch.from_numpy(
    movies_df['duration'].values
    ).to(torch.float), nan=70).unsqueeze(1)

# Load the pre-trained sentence transformer model and encode the movie titles:
model = SentenceTransformer('all-MiniLM-L6-v2')
with torch.no_grad():
    titles = model.encode(movies_df['title'].tolist(), convert_to_tensor=True, show_progress_bar=True)
    titles = titles.cpu()

# Concatenate the genres and title features:
movie_features = torch.cat([genres, titles], dim=-1)
user_ids_tensor = torch.LongTensor(range(len(ratings_df['userId'].unique())))

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [4]:
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedUserId': pd.RangeIndex(len(unique_user_id))
    })

# Create a mapping from the movieId to a unique consecutive value in the range [0, num_movies]:
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedMovieId': pd.RangeIndex(len(unique_movie_id))
    })

# Merge the mappings with the original data frame:
ratings_df = ratings_df.merge(unique_user_id, on='userId')
ratings_df = ratings_df.merge(unique_movie_id, on='movieId')

ratings_df_train, ratings_df_test = train_test_split(ratings_df, test_size=0.2, random_state=42)
ratings_df_test, ratings_df_val = train_test_split(ratings_df_test, test_size=0.5, random_state=42)

In [5]:
def create_data(ratings_df):
    edge_index = torch.stack([
    torch.tensor(ratings_df['mappedUserId'].values),
    torch.tensor(ratings_df['mappedMovieId'].values)]
    , dim=0)

    assert edge_index.shape == (2, len(ratings_df))
    data = HeteroData()
    # Add the user nodes:
    data['user'].x = user_ids_tensor  # [num_users, num_features_users]
    # Add the movie nodes:
    data['movie'].x = movie_features  # [num_movies, num_features_movies]
    # Add the rating edges:
    data['user', 'rates', 'movie'].edge_index = edge_index  # [2, num_ratings]
    # Add the rating labels:
    rating = torch.from_numpy(ratings_df['rating'].values).to(torch.float)
    data['user', 'rates', 'movie'].edge_label = rating  # [num_ratings]

    # We also need to make sure to add the reverse edges from movies to users
    # in order to let a GNN be able to pass messages in both directions.
    # We can leverage the `T.ToUndirected()` transform for this from PyG:
    data = T.ToUndirected()(data)

    # With the above transformation we also got reversed labels for the edges.
    # We are going to remove them:
    del data['movie', 'rev_rates', 'user'].edge_label

    assert data['user'].num_nodes == len(unique_user_id)
    assert data['user', 'rates', 'movie'].num_edges == len(ratings_df)

    return data

In [6]:
train_data, val_data, test_data = create_data(ratings_df_train), create_data(ratings_df_val), create_data(ratings_df_test)

In [15]:
class GraphEmbedding(torch.nn.Module):
    def __init__(self, user_embedding_dim):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(len(user_ids_tensor), user_embedding_dim)

    def forward(self, x_dict):
        x_dict['user'] = self.user_embedding(x_dict['user'])
        return x_dict

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.dropout = torch.nn.Dropout(p=0.1)
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)
        self.conv3 = SAGEConv((-1, -1), out_channels)
        self.conv4 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.dropout(x)
        x1 = self.conv1(x, edge_index).relu()
        x2 = x1 + self.conv2(x1, edge_index).relu()
        x3 = x2 + self.conv3(x2, edge_index).relu()
        x4 = x3 + self.conv4(x3, edge_index).relu()
        return x4

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels, user_dim, train_data_metadata):
        super().__init__()
        self.graph_embedding = GraphEmbedding(user_dim)
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, train_data_metadata, aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        x_dict = self.graph_embedding(x_dict)
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
def train_and_test_model(model: Model, train_data, val_data, test_data, verbose=False) -> tuple[Model, dict[str, float]]:
    # Initialize optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    # Define training function
    def train(model):
        model.train()
        optimizer.zero_grad()
        pred = model(train_data.x_dict, train_data.edge_index_dict,
                    train_data['user', 'movie'].edge_index)
        target = train_data['user', 'movie'].edge_label
        loss = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()
        return float(loss)

    # Define test function
    @torch.no_grad()
    def test(model, data):
        data = data.to(device)
        model.eval()
        pred = model(data.x_dict, data.edge_index_dict,
                    data['user', 'movie'].edge_index)
        pred = pred.clamp(min=0, max=5)
        target = data['user', 'movie'].edge_label.float()
        rmse = F.mse_loss(pred, target).sqrt()
        return float(rmse)

    best_val_rmse = float('inf')
    best_epoch = 0

    # Training loop
    for epoch in range(1, 201):
        train_data = train_data.to(device) # type: ignore
        loss = train(model)
        train_rmse = test(model, train_data)
        val_rmse = test(model, val_data)

        # if epoch % 10 == 9:
        #     scheduler.step()

        # Check if the current validation RMSE is the best
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_epoch = epoch
            torch.save(model.state_dict(), model_file_name)
            if verbose:
                print(f'Saving model with val_rmse: {val_rmse:.4f} at epoch {epoch}')

        if verbose:
            print(f'Epoch: {epoch:03d}, LR: {next(iter(optimizer.param_groups))["lr"]}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, Val: {val_rmse:.4f}')

    # Test loop
    if verbose:
        print(f'Loading model with val_rmse: {best_val_rmse:.4f} at epoch {best_epoch}')
    model.load_state_dict(torch.load(model_file_name))
    test_rmse = test(model, test_data)
    if verbose:
        print("Test rmse:", test_rmse)

    return model, {
        'test_rmse': test_rmse,
        'best_val_rmse': best_val_rmse,
        'best_epoch': best_epoch
    }

In [17]:
def shuffle_dataframe(df, random_state=None):
    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)

def kfold_cross_validation(ratings_df, k, random_state=42):
    # Shuffle the DataFrame
    ratings_df = shuffle_dataframe(ratings_df, random_state)

    # Split the data into k folds
    fold_size = len(ratings_df) // k
    folds = [ratings_df.iloc[i * fold_size: (i + 1) * fold_size] for i in range(k)]

    # Iterate over each fold
    for i in range(k):
        ratings_df_train = pd.concat([folds[j] for j in range(k) if j != i])
        ratings_df_test, ratings_df_val = train_test_split(folds[i], test_size=0.5, random_state=42)
        train_data, val_data, test_data = create_data(ratings_df_train), create_data(ratings_df_val), create_data(ratings_df_test)

        model = Model(hidden_channels=64, user_dim=512, train_data_metadata=train_data.metadata()).to(device)

        yield train_and_test_model(model, train_data, val_data, test_data, verbose=False)

In [14]:
scores = []
k = 5
for _, fold_scores in kfold_cross_validation(ratings_df, k):
    print(fold_scores)
    scores.append(fold_scores)
print('Average test rmse:', sum(t['test_rmse'] for t in scores)/k)
print('Average best val rmse:', sum(t['best_val_rmse'] for t in scores)/k)
print('Average best epoch:', sum(t['best_epoch'] for t in scores)/k)

{'test_rmse': 0.9428086876869202, 'best_val_rmse': 0.9540628790855408, 'best_epoch': 188}
{'test_rmse': 0.9406399130821228, 'best_val_rmse': 0.9325417876243591, 'best_epoch': 193}
{'test_rmse': 0.9273231029510498, 'best_val_rmse': 0.9221014976501465, 'best_epoch': 174}
{'test_rmse': 0.9493412375450134, 'best_val_rmse': 0.9396054744720459, 'best_epoch': 197}
{'test_rmse': 0.9480810165405273, 'best_val_rmse': 0.9387470483779907, 'best_epoch': 174}
Average test rmse: 0.9416387915611267
Average best val rmse: 0.9374117374420166
Average best epoch: 185.2
