### Recommendation Using Graph Neural Networks

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from sentence_transformers import SentenceTransformer
from torch.nn import Linear
from torch_geometric.data import HeteroData
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.transforms import RandomLinkSplit, ToUndirected

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [2]:
from neo4j import GraphDatabase

# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable so real credentials never have to be saved
# inside the notebook; the fallbacks are the original local defaults.
url = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'password')

# Single driver instance that fetch_data() opens its sessions from.
driver = GraphDatabase.driver(url, auth=(user, password))

def fetch_data(query, params=None):
  """Run a Cypher query and return its result as a pandas DataFrame.

  Args:
    query: Cypher statement to execute.
    params: Optional dict of query parameters; omitted means no parameters.

  Returns:
    A DataFrame with one row per returned record and one column per
    result key.
  """
  # Build a fresh dict per call instead of sharing one mutable default
  # object between all invocations.
  if params is None:
    params = {}
  with driver.session(database = 'neo4j') as session:
    result = session.run(query, params)
    # Collect the rows while the session is still open.
    rows = [record.values() for record in result]
    return pd.DataFrame(rows, columns=result.keys())

In [3]:
# Drop a projection left over from an earlier run first: projecting under a
# graph name that already exists in the catalog fails, which would break
# re-running the notebook. The second argument (failIfMissing = false)
# turns the drop into a no-op when no such projection exists yet.
fetch_data("CALL gds.graph.drop('movies', false)")

# Project Movie and Person nodes with undirected ACTED_IN and DIRECTED
# relationships into the graph catalog under the name 'movies'.
fetch_data("""
CALL gds.graph.project('movies', ['Movie', 'Person'], 
  {ACTED_IN: {orientation:'UNDIRECTED'}, DIRECTED: {orientation:'UNDIRECTED'}})
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'Movie': {'label': 'Movie', 'properties': {}}...","{'ACTED_IN': {'orientation': 'UNDIRECTED', 'ag...",movies,93004,370880,288


In [4]:
# Run FastRP on the 'movies' projection with embeddingDimension 56 and
# write the embeddings to the 'fastrp' node property.
fastrp_write_query = """
CALL gds.fastRP.write('movies', {writeProperty:'fastrp', embeddingDimension:56})
"""
fetch_data(fastrp_write_query)

Unnamed: 0,nodeCount,nodePropertiesWritten,preProcessingMillis,computeMillis,writeMillis,configuration
0,93004,93004,0,94,609,"{'writeConcurrency': 4, 'nodeSelfInfluence': 0..."


In [5]:
def load_node(cypher, index_col, encoders=None, **kwargs):
    """Load one node type from Neo4j as a feature tensor plus an id mapping.

    Args:
        cypher: Cypher query passed to ``fetch_data``.
        index_col: Name of the returned column that identifies each node.
        encoders: Optional dict mapping a column name to a callable that
            takes that column and returns a tensor. The encoder outputs are
            concatenated along the last dimension.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A tuple ``(x, mapping)``. ``x`` is the concatenated feature tensor,
        or ``None`` when no encoders are given. ``mapping`` maps each node id
        to its consecutive row index.
    """
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher).set_index(index_col)
    # Keep only the first row per id. The mapping below assigns one index per
    # distinct id, so duplicated rows would otherwise leave the feature rows
    # out of step with the mapping.
    df = df[~df.index.duplicated(keep='first')]
    # Define node mapping
    mapping = {index: i for i, index in enumerate(df.index)}
    # Define node features (an empty encoder dict means "no features")
    x = None
    if encoders:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping

In [6]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):
    """Load one edge type from Neo4j as an edge index plus edge attributes.

    Args:
        cypher: Cypher query passed to ``fetch_data``.
        src_index_col: Returned column holding the source node ids.
        src_mapping: Dict from source node id to consecutive index.
        dst_index_col: Returned column holding the destination node ids.
        dst_mapping: Dict from destination node id to consecutive index.
        encoders: Optional dict mapping a column name to a callable that
            takes that column and returns a tensor. The encoder outputs are
            concatenated along the last dimension.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A tuple ``(edge_index, edge_attr)``. ``edge_index`` is a long tensor
        with one row of source indices and one row of destination indices.
        ``edge_attr`` is ``None`` when no encoders are given.

    Raises:
        KeyError: If a returned id is missing from the matching mapping.
    """
    # Execute the cypher query and retrieve data from Neo4j
    df = fetch_data(cypher)
    # Define edge index
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    # Force an integer dtype: with zero edges torch.tensor would otherwise
    # infer a float tensor from the two empty lists.
    edge_index = torch.tensor([src, dst], dtype=torch.long)
    # Define edge features (an empty encoder dict means "no features")
    edge_attr = None
    if encoders:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

In [7]:
class SequenceEncoder(object):
    """Callable that turns a column of raw strings into embeddings.

    Wraps a ``SentenceTransformer`` model; calling the instance on a column
    returns the model's encodings as a CPU tensor.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        # Load the model, then remember the device so encode() can reuse it.
        self.model = SentenceTransformer(model_name, device=device)
        self.device = device

    @torch.no_grad()
    def __call__(self, df):
        """Encode ``df.values`` and return the result moved to the CPU."""
        encode_options = dict(
            show_progress_bar=True,
            convert_to_tensor=True,
            device=self.device,
        )
        embeddings = self.model.encode(df.values, **encode_options)
        return embeddings.cpu()

In [8]:
class GenresEncoder(object):
    """Multi-hot encoder for a column of separator-joined genre strings.

    The 'GenresEncoder' splits each raw string by ``sep`` and returns a
    float tensor with one row per input value and one column per distinct
    genre; an entry is 1 when the row contains that genre, else 0.
    """

    def __init__(self, sep='|'):
        self.sep = sep

    def __call__(self, df):
        # Split every value once and reuse the pieces for both passes.
        split_rows = [value.split(self.sep) for value in df.values]
        # Sort the vocabulary so the column order is deterministic. Iterating
        # a plain set of strings can yield a different order on every
        # interpreter run, which would silently permute the feature columns.
        genres = sorted({genre for row in split_rows for genre in row})
        mapping = {genre: i for i, genre in enumerate(genres)}

        x = torch.zeros(len(df), len(mapping))
        for i, row in enumerate(split_rows):
            for genre in row:
                x[i, mapping[genre]] = 1
        return x

In [9]:
class IdentityEncoder(object):
    """Convert raw column values to a PyTorch tensor without re-encoding.

    Args:
        dtype: Optional torch dtype the result is cast to. ``None`` keeps
            the dtype produced by the conversion.
        is_list: When True, every cell is converted with ``torch.tensor``
            and the per-row tensors are stacked. Otherwise the column's
            NumPy values are wrapped directly.
    """

    def __init__(self, dtype=None, is_list=False):
        self.dtype = dtype
        self.is_list = is_list

    def __call__(self, df):
        if self.is_list:
            x = torch.stack([torch.tensor(el) for el in df.values])
        else:
            x = torch.from_numpy(df.values)
        # Honour the requested dtype on both paths; it was previously
        # dropped for list-valued columns. No cast happens when it is unset.
        if self.dtype is None:
            return x
        return x.to(self.dtype)

In [10]:
# Only the user ids are fetched; no feature columns are requested.
user_query = """
MATCH (u:User) RETURN u.userId AS userId
"""

# Without encoders load_node returns None for the feature tensor, so user_x
# is None and only the id -> index mapping is used later on.
user_x, user_mapping = load_node(cypher=user_query, index_col='userId')

In [11]:
# One row per movie: its id, title, '|'-joined genre names and the stored
# 'fastrp' property.
movie_query = """
MATCH (m:Movie)-[:IN_GENRE]->(genre:Genre)
WITH m, collect(genre.name) AS genres_list
RETURN m.movieId AS movieId, m.title AS title, apoc.text.join(genres_list, '|') AS genres, m.fastrp AS fastrp
"""

# One encoder per returned feature column; load_node concatenates their
# outputs in this order.
movie_encoders = {
    'title': SequenceEncoder(),
    'genres': GenresEncoder(),
    'fastrp': IdentityEncoder(is_list=True),
}
movie_x, movie_mapping = load_node(movie_query, index_col='movieId',
                                   encoders=movie_encoders)

Batches:   0%|          | 0/283 [00:00<?, ?it/s]

In [12]:
# Every RATED relationship becomes one (user, movie) edge with its rating.
rating_query = """
MATCH (u:User)-[r:RATED]->(m:Movie) 
RETURN u.userId AS userId, m.movieId AS movieId, r.rating AS rating
"""

# Ratings are cast to torch.long so they can serve as integer labels.
# NOTE(review): the cast would truncate fractional ratings if the graph
# stores any - confirm against the data.
rating_encoders = {'rating': IdentityEncoder(dtype=torch.long)}
edge_index, edge_label = load_edge(
    rating_query,
    src_index_col='userId', src_mapping=user_mapping,
    dst_index_col='movieId', dst_mapping=movie_mapping,
    encoders=rating_encoders,
)

In [13]:
data = HeteroData()

# Users were loaded without feature columns, so each user gets a one-hot row
# (identity matrix) as its input for message passing.
num_user_nodes = len(user_mapping)
data['user'].x = torch.eye(num_user_nodes, device=device)

# Movies use the encoded feature matrix built by load_node.
data['movie'].x = movie_x

# Rating edges from users to movies, labelled with the rating value.
rates_store = data['user', 'rates', 'movie']
rates_store.edge_index = edge_index
rates_store.edge_label = edge_label

# Last expression of the cell, so the moved graph is also displayed.
data.to(device, non_blocking=True)

HeteroData(
  [1muser[0m={ x=[671, 671] },
  [1mmovie[0m={ x=[9047, 460] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 79522],
    edge_label=[79522]
  }
)

In [14]:
# 1. Add the reverse ('movie', 'rev_rates', 'user') edges and drop the label
#    copy that ends up on the reverse edge type.
data = ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label

# 2. Perform a link-level split into training, validation, and test edges.
split_options = dict(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)
transform = RandomLinkSplit(**split_options)
train_data, val_data, test_data = transform(data)

In [15]:
class GNNEncoder(torch.nn.Module):
    """Two stacked SAGEConv layers with a ReLU between them.

    Both layers are created with ``(-1, -1)`` as their input size; only the
    output widths (``hidden_channels`` and ``out_channels``) are fixed here.
    """

    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        lazy_in_channels = (-1, -1)
        self.conv1 = SAGEConv(lazy_in_channels, hidden_channels)
        self.conv2 = SAGEConv(lazy_in_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, ReLU, then conv2 using the same ``edge_index``."""
        hidden = self.conv1(x, edge_index)
        hidden = hidden.relu()
        return self.conv2(hidden, edge_index)


class EdgeDecoder(torch.nn.Module):
    """Two-layer MLP that scores (user, movie) pairs from node embeddings."""

    def __init__(self, hidden_channels):
        super().__init__()
        # The first layer consumes a user embedding concatenated with a
        # movie embedding, hence twice the hidden width as input.
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        """Return one score per column of ``edge_label_index``.

        Row 0 of ``edge_label_index`` indexes ``z_dict['user']`` and row 1
        indexes ``z_dict['movie']``.
        """
        user_idx, movie_idx = edge_label_index
        user_z = z_dict['user'][user_idx]
        movie_z = z_dict['movie'][movie_idx]
        pair = torch.cat((user_z, movie_z), dim=-1)

        hidden = self.lin1(pair).relu()
        scores = self.lin2(hidden)
        # Flatten the trailing size-1 dimension to a 1-D score vector.
        return scores.view(-1)

class Model(torch.nn.Module):
    """Rating predictor: heterogeneous GNN encoder plus an edge decoder.

    Args:
        hidden_channels: Width of the encoder layers and of the decoder's
            hidden layer.
        metadata: Optional graph metadata handed to ``to_hetero``. When
            omitted, ``data.metadata()`` of the notebook-level ``data``
            object is used, which matches the previous behaviour.
    """

    def __init__(self, hidden_channels, metadata=None):
        super().__init__()
        # Fall back to the notebook global only when the caller did not
        # supply metadata, so the class no longer requires that global.
        if metadata is None:
            metadata = data.metadata()
        homogeneous_encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(homogeneous_encoder, metadata, aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        """Encode all nodes, then score the pairs in ``edge_label_index``."""
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [16]:
# Per-rating weights for the loss: count how often each integer rating occurs
# among the training labels, then divide the largest count by each count so
# rarer ratings get larger weights. The edge type is spelled out in full,
# like everywhere else in the notebook, rather than relying on the
# two-element ('user', 'movie') shorthand.
rating_counts = torch.bincount(train_data['user', 'rates', 'movie'].edge_label)
weight = rating_counts.max() / rating_counts

def weighted_mse_loss(pred, target, weight=None):
    """Mean squared error with an optional per-label weight.

    Args:
        pred: Predicted values.
        target: Target labels; also used to index ``weight`` when given.
        weight: Optional tensor holding one weight per label value.

    Returns:
        The mean of the (weighted) squared differences.
    """
    if weight is None:
        scale = 1.
    else:
        # Look up each sample's weight by its label and match pred's dtype.
        scale = weight[target].to(pred.dtype)
    squared_error = (pred - target.to(pred.dtype)).pow(2)
    return (scale * squared_error).mean()

In [17]:
# Build the model with 64 hidden channels, then move it to the selected
# device.
model = Model(hidden_channels=64)
model = model.to(device)

In [18]:
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred. The pass runs under no_grad because only
# the initialisation matters; its output is discarded.
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

# Created after the dry run above so that model.parameters() refers to the
# initialised parameters. Adam with a learning rate of 0.01.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [19]:
def train():
    """Run one full-batch optimisation step on the training split.

    Uses the notebook-level ``model``, ``optimizer``, ``train_data`` and
    ``weight``.

    Returns:
        The weighted MSE loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    rates = train_data['user', 'rates', 'movie']
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 rates.edge_label_index)
    loss = weighted_mse_loss(pred, rates.edge_label, weight)

    loss.backward()
    optimizer.step()
    return float(loss)

In [20]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'rates', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'rates', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [21]:
# Full-batch training. range(1, 300) runs epochs 1 through 299.
for epoch in range(1, 300):
    loss = train()
    # Evaluate all three splits after the update, in train/val/test order.
    train_rmse, val_rmse, test_rmse = (
        test(split) for split in (train_data, val_data, test_data)
    )
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 18.9491, Train: 2.9724, Val: 2.9882, Test: 2.9685
Epoch: 002, Loss: 13.8319, Train: 1.6203, Val: 1.6396, Test: 1.6231
Epoch: 003, Loss: 6.5224, Train: 1.9350, Val: 1.8984, Test: 1.9186
Epoch: 004, Loss: 30.0833, Train: 1.1109, Val: 1.1131, Test: 1.1093
Epoch: 005, Loss: 7.4977, Train: 2.0267, Val: 2.0450, Test: 2.0273
Epoch: 006, Loss: 7.6556, Train: 2.6342, Val: 2.6499, Test: 2.6310
Epoch: 007, Loss: 11.1286, Train: 2.8713, Val: 2.8859, Test: 2.8668
Epoch: 008, Loss: 12.9613, Train: 2.9398, Val: 2.9543, Test: 2.9351
Epoch: 009, Loss: 13.5375, Train: 2.9298, Val: 2.9444, Test: 2.9253
Epoch: 010, Loss: 13.4481, Train: 2.8695, Val: 2.8846, Test: 2.8657
Epoch: 011, Loss: 12.9363, Train: 2.7591, Val: 2.7748, Test: 2.7562
Epoch: 012, Loss: 12.0402, Train: 2.5818, Val: 2.5983, Test: 2.5804
Epoch: 013, Loss: 10.7137, Train: 2.2963, Val: 2.3144, Test: 2.2975
Epoch: 014, Loss: 8.8729, Train: 1.8663, Val: 1.8863, Test: 1.8716
Epoch: 015, Loss: 6.8205, Train: 1.3269, Val: 1.3466

Epoch: 124, Loss: 2.7969, Train: 1.1010, Val: 1.1443, Test: 1.1484
Epoch: 125, Loss: 2.7885, Train: 1.0992, Val: 1.1431, Test: 1.1471
Epoch: 126, Loss: 2.7801, Train: 1.0976, Val: 1.1420, Test: 1.1460
Epoch: 127, Loss: 2.7719, Train: 1.0969, Val: 1.1418, Test: 1.1458
Epoch: 128, Loss: 2.7639, Train: 1.0964, Val: 1.1417, Test: 1.1456
Epoch: 129, Loss: 2.7559, Train: 1.0952, Val: 1.1410, Test: 1.1447
Epoch: 130, Loss: 2.7482, Train: 1.0939, Val: 1.1404, Test: 1.1440
Epoch: 131, Loss: 2.7404, Train: 1.0929, Val: 1.1399, Test: 1.1434
Epoch: 132, Loss: 2.7330, Train: 1.0918, Val: 1.1394, Test: 1.1429
Epoch: 133, Loss: 2.7255, Train: 1.0911, Val: 1.1392, Test: 1.1425
Epoch: 134, Loss: 2.7181, Train: 1.0901, Val: 1.1387, Test: 1.1419
Epoch: 135, Loss: 2.7111, Train: 1.0890, Val: 1.1382, Test: 1.1413
Epoch: 136, Loss: 2.7039, Train: 1.0879, Val: 1.1377, Test: 1.1407
Epoch: 137, Loss: 2.6969, Train: 1.0870, Val: 1.1375, Test: 1.1404
Epoch: 138, Loss: 2.6901, Train: 1.0865, Val: 1.1374, Test: 1.

Epoch: 247, Loss: 2.1220, Train: 1.0117, Val: 1.1264, Test: 1.1280
Epoch: 248, Loss: 2.1288, Train: 0.9460, Val: 1.0615, Test: 1.0647
Epoch: 249, Loss: 2.1221, Train: 1.0200, Val: 1.1231, Test: 1.1285
Epoch: 250, Loss: 2.1818, Train: 0.9596, Val: 1.0770, Test: 1.0793
Epoch: 251, Loss: 2.1960, Train: 1.0466, Val: 1.1597, Test: 1.1616
Epoch: 252, Loss: 2.2348, Train: 0.9422, Val: 1.0593, Test: 1.0610
Epoch: 253, Loss: 2.1808, Train: 1.0341, Val: 1.1319, Test: 1.1376
Epoch: 254, Loss: 2.2712, Train: 0.9596, Val: 1.0759, Test: 1.0780
Epoch: 255, Loss: 2.1053, Train: 0.9846, Val: 1.1017, Test: 1.1036
Epoch: 256, Loss: 2.1356, Train: 1.0288, Val: 1.1426, Test: 1.1452
Epoch: 257, Loss: 2.1451, Train: 0.9319, Val: 1.0492, Test: 1.0524
Epoch: 258, Loss: 2.1635, Train: 1.0220, Val: 1.1236, Test: 1.1289
Epoch: 259, Loss: 2.2336, Train: 0.9815, Val: 1.1010, Test: 1.1010
Epoch: 260, Loss: 2.0880, Train: 0.9623, Val: 1.0818, Test: 1.0841
Epoch: 261, Loss: 2.1994, Train: 1.0732, Val: 1.1847, Test: 1.

In [22]:
num_movies = len(movie_mapping)
num_users = len(user_mapping)

# Invert the id -> index mappings so model indices can be translated back
# into the ids stored in Neo4j.
reverse_movie_mapping = {idx: key for key, idx in movie_mapping.items()}
reverse_user_mapping = {idx: key for key, idx in user_mapping.items()}

MAX_RATING = 5                      # upper clamp bound; also the selection threshold
MAX_RECOMMENDATIONS_PER_USER = 20   # cap on movies stored per user

results = []

# Inference only: switch to eval mode and disable autograd so no graph is
# recorded for the num_users * num_movies scored pairs.
model.eval()
with torch.no_grad():
    # The node embeddings do not depend on which user is being scored, so
    # the GNN encoder runs once here instead of once per user.
    z_dict = model.encoder(data.x_dict, data.edge_index_dict)

    # Candidate movie indices, created on the same device as the graph.
    col = torch.arange(num_movies, device=device)

    for user_id in range(num_users):
        # Pair this user with every movie.
        row = torch.full_like(col, user_id)
        edge_label_index = torch.stack([row, col], dim=0)

        pred = model.decoder(z_dict, edge_label_index)
        pred = pred.clamp(min=0, max=MAX_RATING)

        user_neo4j_id = reverse_user_mapping[user_id]

        # Keep the movies whose clamped prediction reaches the maximum.
        # NOTE(review): nonzero() returns indices in ascending order, so the
        # slice below keeps the first matches by movie index, not the
        # highest raw scores. Users without any such prediction get an
        # empty list.
        mask = (pred == MAX_RATING).nonzero(as_tuple=True)
        candidate_indices = mask[0].tolist()[:MAX_RECOMMENDATIONS_PER_USER]

        top_movies = [reverse_movie_mapping[el] for el in candidate_indices]
        results.append({'user': user_neo4j_id, 'movies': top_movies})
    

In [23]:
# Store the predictions in Neo4j: for every (user, movie) pair in $data,
# MERGE a RECOMMEND relationship unless the user already RATED that movie.
import_predictions_query = """
UNWIND $data AS row
MATCH (u:User {userId: row.user})
WITH u, row
UNWIND row.movies AS movieId
MATCH (m:Movie {movieId: movieId})
WITH u,m
// filter out existing links
WHERE NOT (u)-[:RATED]->(m)
MERGE (u)-[:RECOMMEND]->(m)
"""

fetch_data(import_predictions_query, params={'data': results})

In [29]:
# Fetch up to 25 stored recommendations for the sample user with userId 14.
recommendations_query = """
MATCH (u:User{userId: $user})-[r:RECOMMEND]->(m) RETURN u.userId AS userId,m.movieId AS movieId,m.title AS title LIMIT 25
"""
col = fetch_data(recommendations_query, params={'user': 14})

In [30]:
# Display the recommendations fetched for the sample user.
col

Unnamed: 0,userId,movieId,title
0,14,90061,The Myth of the American Sleepover
1,14,143859,"Hail, Caesar!"
2,14,4584,Dream a Little Dream
3,14,61250,The House Bunny
4,14,114265,Laggies
5,14,33312,The Cocoanuts
6,14,94939,Sound of Noise
7,14,105593,Filth
8,14,91690,Friends with Kids
9,14,67788,Confessions of a Shopaholic


In [26]:
# Load the ratings stored in 'liked_test_set.csv' into a DataFrame.
# NOTE(review): this rebinds the name `test`, which shadows the evaluation
# function test(data) defined earlier. Re-running the training loop after
# this cell would call a DataFrame; restart the kernel first, or rename this
# variable together with the cells below that read it.
test = pd.read_csv('liked_test_set.csv')

In [27]:
# Inner join: keep the fetched recommendations that also appear in the test
# ratings for the same user and movie.
join_keys = ['userId', 'movieId']
result_df = col.merge(test, on=join_keys)
result_df

Unnamed: 0,userId,movieId,title,rating


In [28]:
# Preview the first 30 rows of the test ratings.
test.head(30)

Unnamed: 0,userId,movieId,rating
0,1,1029,3.0
1,2,50,4.0
2,2,551,5.0
3,2,153,4.0
4,2,10,4.0
5,2,349,4.0
6,2,589,5.0
7,2,150,5.0
8,3,5669,3.5
9,3,44191,3.5
