In [1]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from sklearn.model_selection import train_test_split
import pandas as pd

from IPython import get_ipython # type: ignore
import os

# Fix the PyTorch RNG so weight initialisation is repeatable between runs.
torch.manual_seed(0)

# get the notebook name
# `__vsc_ipynb_file__` is looked up in the IPython user namespace (presumably injected by the
# VS Code Jupyter extension - confirm). It is absent in other front ends, and `get_ipython()`
# returns None outside IPython, so both cases fall back to a default checkpoint name instead
# of crashing on `os.path.basename(None)`.
ip = get_ipython()
path = None
if ip is not None and '__vsc_ipynb_file__' in ip.user_ns: # type: ignore
    path = ip.user_ns['__vsc_ipynb_file__'] # type: ignore

os.makedirs('models/', exist_ok=True)
if path is not None:
    # Drop the directory and the extension (".ipynb") to get the bare notebook name.
    model_stem = os.path.splitext(os.path.basename(path))[0]
else:
    model_stem = 'model'
model_file_name = f"models/{model_stem}.pt"

Data

In [2]:
# Paths of the CSV tables read by this notebook.
movie_path = 'dataset/ml-latest-small/movies_with_wikidata_values.csv'
rating_path = 'dataset/ml-latest-small/ratings.csv'
directed_by_path = 'dataset/ml-latest-small/directed_by.csv'
stars_path = 'dataset/ml-latest-small/stars.csv'
produced_by_path = 'dataset/ml-latest-small/produced_by.csv'
produced_in_path = 'dataset/ml-latest-small/produced_in.csv'


def _read_columns(csv_path, columns):
    """Read ``csv_path`` and return only ``columns``, in the order given."""
    table = pd.read_csv(csv_path)
    return table[columns]


# Movies keep every column and are indexed by their movieId.
movies_df = pd.read_csv(movie_path, index_col='movieId')
ratings_df = _read_columns(rating_path, ["userId", "movieId", "rating"])

# Each relation table pairs a movieId with one related entity.
directed_by_df = _read_columns(directed_by_path, ["movieId", "director"])
stars_df = _read_columns(stars_path, ["movieId", "castMember"])
produced_by_df = _read_columns(produced_by_path, ["movieId", "company"])
produced_in_df = _read_columns(produced_in_path, ["movieId", "country"])

In [3]:
# One consecutive id (0 .. n-1) per distinct entity; these tensors later serve as the
# node "features" that are fed to the embedding tables.
movie_ids_tensor = torch.arange(len(ratings_df['movieId'].unique()))
user_ids_tensor = torch.arange(len(ratings_df['userId'].unique()))

director_ids_tensor = torch.arange(len(directed_by_df['director'].unique()))
star_ids_tensor = torch.arange(len(stars_df['castMember'].unique()))
company_ids_tensor = torch.arange(len(produced_by_df['company'].unique()))
country_ids_tensor = torch.arange(len(produced_in_df['country'].unique()))

# Split the '|'-separated genre string of every movie into (movieId, genre id) pairs,
# assigning genre ids in order of first appearance.
genres_to_id_dict = {}
genres_df_list = []
for movie_id, row in movies_df.iterrows():
    for genre in row['genres'].split('|'):
        # setdefault returns the existing id, or registers the next free one.
        genre_id = genres_to_id_dict.setdefault(genre, len(genres_to_id_dict))
        genres_df_list.append((movie_id, genre_id))

genres_df = pd.DataFrame(genres_df_list, columns=['movieId', 'genre'])
genre_ids_tensor = torch.arange(len(genres_to_id_dict))

In [4]:
def _consecutive_id_frame(values, key_column, mapped_column):
    """Map each distinct entry of ``values`` to a consecutive id in [0, n).

    Returns a two-column frame: the distinct values (in order of first appearance)
    under ``key_column`` and their ids under ``mapped_column``.
    """
    distinct = values.unique()
    return pd.DataFrame(data={
        key_column: distinct,
        mapped_column: pd.RangeIndex(len(distinct)),
    })


# Users and movies are numbered from the ratings table, so only rated movies get an id.
unique_user_id = _consecutive_id_frame(ratings_df['userId'], 'userId', 'mappedUserId')
# Mapping from movieId to a unique consecutive value in the range [0, num_movies):
unique_movie_id = _consecutive_id_frame(ratings_df['movieId'], 'movieId', 'mappedMovieId')

# Metadata entities are numbered from their own relation tables.
unique_director_id = _consecutive_id_frame(directed_by_df['director'], 'director', 'mappedDirector')
unique_star_id = _consecutive_id_frame(stars_df['castMember'], 'castMember', 'mappedCastMember')
unique_company_id = _consecutive_id_frame(produced_by_df['company'], 'company', 'mappedCompany')
unique_country_id = _consecutive_id_frame(produced_in_df['country'], 'country', 'mappedCountry')

# Attach the mapped ids to the original frames (default inner merges):
ratings_df = (
    ratings_df
    .merge(unique_user_id, on='userId')
    .merge(unique_movie_id, on='movieId')
)

stars_df = (
    stars_df
    .merge(unique_movie_id, on='movieId')
    .merge(unique_star_id, on='castMember')
)
directed_by_df = (
    directed_by_df
    .merge(unique_movie_id, on='movieId')
    .merge(unique_director_id, on='director')
)
produced_by_df = (
    produced_by_df
    .merge(unique_movie_id, on='movieId')
    .merge(unique_company_id, on='company')
)
produced_in_df = (
    produced_in_df
    .merge(unique_movie_id, on='movieId')
    .merge(unique_country_id, on='country')
)
genres_df = genres_df.merge(unique_movie_id, on='movieId')

# 80% train; the remaining 20% is halved into test and validation.
ratings_df_train, ratings_df_holdout = train_test_split(ratings_df, test_size=0.2, random_state=42)
ratings_df_test, ratings_df_val = train_test_split(ratings_df_holdout, test_size=0.5, random_state=42)

In [5]:
def create_data(ratings_df):
    """Build the heterogeneous graph for one split of the ratings.

    Args:
        ratings_df: frame with ``mappedUserId``, ``mappedMovieId`` and ``rating``
            columns; every row becomes one ``('user', 'rates', 'movie')`` edge
            whose ``edge_label`` is the rating.

    Returns:
        A ``HeteroData`` object holding all node types, the rating edges of this
        split, the movie metadata edges (read from the module-level relation
        frames) and the reverse edges added by ``T.ToUndirected()``.
    """
    def edge_index_from(frame, src_col, dst_col):
        # Source ids stacked over destination ids -> tensor of shape [2, len(frame)].
        endpoints = [torch.tensor(frame[src_col].values), torch.tensor(frame[dst_col].values)]
        return torch.stack(endpoints, dim=0)

    rates_edges = edge_index_from(ratings_df, 'mappedUserId', 'mappedMovieId')

    # Metadata relations do not depend on the split; only the rating edges do.
    metadata_edges = {
        ('movie', 'directed_by', 'director'): edge_index_from(directed_by_df, 'mappedMovieId', 'mappedDirector'),
        ('movie', 'stars', 'star'): edge_index_from(stars_df, 'mappedMovieId', 'mappedCastMember'),
        ('movie', 'has_genre', 'genre'): edge_index_from(genres_df, 'mappedMovieId', 'genre'),
        ('movie', 'produced_by', 'company'): edge_index_from(produced_by_df, 'mappedMovieId', 'mappedCompany'),
        ('movie', 'produced_in', 'country'): edge_index_from(produced_in_df, 'mappedMovieId', 'mappedCountry'),
    }

    assert rates_edges.shape == (2, len(ratings_df))

    data = HeteroData()

    # Node "features" are just the consecutive ids of each node type.
    node_ids = {
        'user': user_ids_tensor,          # [num_users]
        'movie': movie_ids_tensor,        # [num_movies]
        'director': director_ids_tensor,  # [num_directors]
        'star': star_ids_tensor,          # [num_stars]
        'genre': genre_ids_tensor,        # [num_genres]
        'company': company_ids_tensor,    # [num_companies]
        'country': country_ids_tensor,    # [num_countries]
    }
    for node_type, ids in node_ids.items():
        data[node_type].x = ids

    # Rating edges and their labels.
    rates_store = data['user', 'rates', 'movie']
    rates_store.edge_index = rates_edges  # [2, num_ratings]
    rates_store.edge_label = torch.from_numpy(ratings_df['rating'].values).to(torch.float)  # [num_ratings]

    # Movie metadata edges.
    for edge_type, edges in metadata_edges.items():
        data[edge_type].edge_index = edges

    # Add reverse edges for every relation so a GNN can pass messages in both directions.
    to_undirected = T.ToUndirected()
    data = to_undirected(data)

    # The transform also copied the rating labels onto the reverse edges; drop them.
    del data['movie', 'rev_rates', 'user'].edge_label

    assert data['user'].num_nodes == len(unique_user_id)
    assert data['user', 'rates', 'movie'].num_edges == len(ratings_df)

    return data

In [6]:
# One graph per split; each contains only that split's rating edges.
train_data = create_data(ratings_df_train)
val_data = create_data(ratings_df_val)
test_data = create_data(ratings_df_test)

In [14]:
class GraphEmbedding(torch.nn.Module):
    """Learnable embedding tables, one per node type.

    The number of rows of each table is taken from the module-level id tensors
    (``user_ids_tensor``, ``movie_ids_tensor``, ...).

    Args:
        user_embedding_dim: embedding width for user nodes.
        movie_embedding_dim: embedding width for movie nodes.
        meta_embedding_dim: embedding width shared by director, star, genre,
            company and country nodes.
    """
    def __init__(self, user_embedding_dim, movie_embedding_dim, meta_embedding_dim):
        super().__init__()
        self.user_embedding = torch.nn.Embedding(len(user_ids_tensor), user_embedding_dim)
        self.movie_embedding = torch.nn.Embedding(len(movie_ids_tensor), movie_embedding_dim)
        self.director_embedding = torch.nn.Embedding(len(director_ids_tensor), meta_embedding_dim)
        self.star_embedding = torch.nn.Embedding(len(star_ids_tensor), meta_embedding_dim)
        self.genre_embedding = torch.nn.Embedding(len(genre_ids_tensor), meta_embedding_dim)
        self.company_embedding = torch.nn.Embedding(len(company_ids_tensor), meta_embedding_dim)
        self.country_embedding = torch.nn.Embedding(len(country_ids_tensor), meta_embedding_dim)

    def forward(self, x_dict):
        """Replace the id tensor of every node type by its embedding vectors.

        Args:
            x_dict: mapping from node type to a tensor of integer ids.

        Returns:
            A new dict with the same keys whose values for the seven known node
            types are the looked-up embeddings. The input mapping is left
            untouched, so the same ``x_dict`` can safely be passed again.
        """
        # Shallow copy: writing into the caller's dict would turn its id tensors into
        # float embeddings and break any later lookup that reuses the same dict.
        embedded = dict(x_dict)
        embedded['user'] = self.user_embedding(x_dict['user'])
        embedded['movie'] = self.movie_embedding(x_dict['movie'])
        embedded['director'] = self.director_embedding(x_dict['director'])
        embedded['star'] = self.star_embedding(x_dict['star'])
        embedded['genre'] = self.genre_embedding(x_dict['genre'])
        embedded['company'] = self.company_embedding(x_dict['company'])
        embedded['country'] = self.country_embedding(x_dict['country'])
        return embedded

class GNNEncoder(torch.nn.Module):
    """Two stacked GraphSAGE layers with a ReLU in between.

    Both layers are created with input size ``(-1, -1)``, i.e. their input
    dimensions are left to be inferred lazily.

    Args:
        hidden_channels: output width of the first layer.
        out_channels: output width of the second layer.
    """
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and return the result."""
        hidden = self.conv1(x, edge_index).relu()
        return self.conv2(hidden, edge_index)

class EdgeDecoder(torch.nn.Module):
    """Two-layer MLP that turns a (user, movie) pair of node vectors into one score.

    Args:
        hidden_channels: width of each node vector and of the hidden layer.
    """
    def __init__(self, hidden_channels):
        super().__init__()
        # The input is a user vector concatenated with a movie vector.
        self.lin1 = torch.nn.Linear(in_features=2 * hidden_channels, out_features=hidden_channels)
        self.lin2 = torch.nn.Linear(in_features=hidden_channels, out_features=1)

    def forward(self, z_dict, edge_label_index):
        """Score every edge in ``edge_label_index``.

        Args:
            z_dict: mapping with ``'user'`` and ``'movie'`` node representations.
            edge_label_index: pair (user indices, movie indices) of the edges to score.

        Returns:
            1-D tensor with one predicted value per edge.
        """
        user_idx, movie_idx = edge_label_index
        user_z = z_dict['user'][user_idx]
        movie_z = z_dict['movie'][movie_idx]
        pair = torch.cat((user_z, movie_z), dim=-1)

        hidden = self.lin1(pair).relu()
        return self.lin2(hidden).view(-1)

class Model(torch.nn.Module):
    """Rating predictor: id embeddings -> heterogeneous GNN encoder -> edge decoder.

    The encoder is converted with ``to_hetero`` using the metadata of the
    module-level ``train_data`` graph.

    Args:
        hidden_channels: width of the encoder layers and of the decoder.
        user_dim: embedding width of user nodes.
        movie_dim: embedding width of movie nodes.
        meta_dim: embedding width of the remaining node types.
    """
    def __init__(self, hidden_channels, user_dim, movie_dim, meta_dim):
        super().__init__()
        self.graph_embedding = GraphEmbedding(user_dim, movie_dim, meta_dim)
        homogeneous_encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(homogeneous_encoder, train_data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Predict one value for every edge in ``edge_label_index``."""
        embedded = self.graph_embedding(x_dict)
        encoded = self.encoder(embedded, edge_index_dict)
        return self.decoder(encoded, edge_label_index)

# Use the GPU when CUDA is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

# Build the model with the chosen sizes, then move it to the selected device.
model = Model(hidden_channels=64, user_dim=512, movie_dim=512, meta_dim=512)
model = model.to(device)
print(model)

Model(
  (graph_embedding): GraphEmbedding(
    (user_embedding): Embedding(610, 512)
    (movie_embedding): Embedding(9724, 512)
    (director_embedding): Embedding(4302, 512)
    (star_embedding): Embedding(43410, 512)
    (genre_embedding): Embedding(20, 512)
    (company_embedding): Embedding(1798, 512)
    (country_embedding): Embedding(103, 512)
  )
  (encoder): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv((-1, -1), 64, aggr=mean)
      (movie__directed_by__director): SAGEConv((-1, -1), 64, aggr=mean)
      (movie__stars__star): SAGEConv((-1, -1), 64, aggr=mean)
      (movie__has_genre__genre): SAGEConv((-1, -1), 64, aggr=mean)
      (movie__produced_by__company): SAGEConv((-1, -1), 64, aggr=mean)
      (movie__produced_in__country): SAGEConv((-1, -1), 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv((-1, -1), 64, aggr=mean)
      (director__rev_directed_by__movie): SAGEConv((-1, -1), 64, aggr=mean)
      (star__rev_stars__movie): SAGEConv((-1

In [15]:
# Initialize optimizer
# The SAGEConv layers are declared with lazy input sizes (-1, -1), so their weights only get
# a shape on the first forward pass. Run one gradient-free pass on the training graph first,
# so that the optimizer is created from fully initialised parameters.
train_data = train_data.to(device) # type: ignore
with torch.no_grad():
    model(train_data.x_dict, train_data.edge_index_dict,
          train_data['user', 'movie'].edge_index)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Define training function
def train(model):
    """Run one full-batch optimisation step on the module-level ``train_data``.

    Uses the module-level ``optimizer``. Returns the MSE loss of this step as a float.
    """
    model.train()
    optimizer.zero_grad()

    rating_edges = train_data['user', 'movie']
    prediction = model(
        train_data.x_dict,
        train_data.edge_index_dict,
        rating_edges.edge_index,
    )
    loss = F.mse_loss(prediction, rating_edges.edge_label)

    loss.backward()
    optimizer.step()
    return loss.item()

# Define test function
@torch.no_grad()
def test(model, data):
    """Return the RMSE of ``model`` on the rating edges of ``data``.

    ``data`` is moved to the module-level ``device``; predictions are clamped to
    the interval [0, 5] before the error is computed.
    """
    data = data.to(device)
    model.eval()

    rating_edges = data['user', 'movie']
    prediction = model(data.x_dict, data.edge_index_dict, rating_edges.edge_index)
    prediction = prediction.clamp(min=0, max=5)

    target = rating_edges.edge_label.float()
    return float(F.mse_loss(prediction, target).sqrt())

# Number of full-batch training epochs.
NUM_EPOCHS = 200

best_val_rmse = float('inf')
best_epoch = 0

# The training graph does not change between epochs, so move it to the device once
# instead of on every iteration.
train_data = train_data.to(device) # type: ignore

# Training loop
for epoch in range(1, NUM_EPOCHS + 1):
    loss = train(model)
    train_rmse = test(model, train_data)
    val_rmse = test(model, val_data)

    # Keep a checkpoint of the weights with the lowest validation RMSE seen so far
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        best_epoch = epoch
        torch.save(model.state_dict(), model_file_name)
        print(f'Saving model with val_rmse: {val_rmse:.4f} at epoch {epoch}')

    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, Val: {val_rmse:.4f}')

Saving model with val_rmse: 2.1989 at epoch 1
Epoch: 001, Loss: 11.4809, Train: 2.2084, Val: 2.1989
Saving model with val_rmse: 1.3119 at epoch 2
Epoch: 002, Loss: 4.8769, Train: 1.2950, Val: 1.3119
Epoch: 003, Loss: 1.6771, Train: 1.5840, Val: 1.6204
Epoch: 004, Loss: 3.3690, Train: 1.5791, Val: 1.6152
Epoch: 005, Loss: 3.2266, Train: 1.3049, Val: 1.3448
Saving model with val_rmse: 1.1023 at epoch 6
Epoch: 006, Loss: 1.7444, Train: 1.0757, Val: 1.1023
Epoch: 007, Loss: 1.1571, Train: 1.1894, Val: 1.1964
Epoch: 008, Loss: 1.4146, Train: 1.3287, Val: 1.3274
Epoch: 009, Loss: 1.7655, Train: 1.3522, Val: 1.3494
Epoch: 010, Loss: 1.8284, Train: 1.2676, Val: 1.2679
Epoch: 011, Loss: 1.6069, Train: 1.1281, Val: 1.1361
Saving model with val_rmse: 1.0359 at epoch 12
Epoch: 012, Loss: 1.2725, Train: 1.0152, Val: 1.0359
Epoch: 013, Loss: 1.0322, Train: 1.0099, Val: 1.0433
Epoch: 014, Loss: 1.0288, Train: 1.0833, Val: 1.1230
Epoch: 015, Loss: 1.2059, Train: 1.1235, Val: 1.1652
Epoch: 016, Loss: 1

In [13]:
# NOTE(review): this cell has an earlier execution count than the model/training cells above
# and duplicates the evaluation cell below; its recorded output presumably belongs to the
# configuration in the next comment - confirm before relying on it.
# model = Model(hidden_channels=32, user_dim=150, movie_dim=400, meta_dim=150).to(device) # lr = 0.004
print(f'Loading model with val_rmse: {best_val_rmse:.4f} at epoch {best_epoch}')
# map_location places the stored tensors on the current device, so a checkpoint written on a
# GPU can also be restored on a CPU-only machine.
# torch.load unpickles the file: only load checkpoints written by this notebook.
model.load_state_dict(torch.load(model_file_name, map_location=device))

print("Test rmse:", test(model, test_data))

Loading model with val_rmse: 0.9141 at epoch 72
Test rmse: 0.9046316146850586


In [16]:
# model = Model(hidden_channels=64, user_dim=512, movie_dim=512, meta_dim=512).to(device) # lr = 0.001
# Restore the weights of the best validation epoch and report the error on the test split.
print(f'Loading model with val_rmse: {best_val_rmse:.4f} at epoch {best_epoch}')
# map_location places the stored tensors on the current device, so a checkpoint written on a
# GPU can also be restored on a CPU-only machine.
# torch.load unpickles the file: only load checkpoints written by this notebook.
model.load_state_dict(torch.load(model_file_name, map_location=device))

print("Test rmse:", test(model, test_data))

Loading model with val_rmse: 0.8947 at epoch 82
Test rmse: 0.8835242986679077
