# Context:
### 1) Creating a recommendation system which recommends new investment opportunities to investors based on their previous history of investments that were made.
### 2) Investors and Borrowers are 2 different nodes
### 3) The relationship has a weight parameter (0-100), which tells us how much share one investor had in a particular investment, as one investment might have multiple investments.
### 4) We are making this a regression problem, where we are finding out possible links between investors and borrowers using GNN based on the weight parameter, and only selecting the value where the weight exceeds a certain threshold (say 80)

In [1]:
import torch
import numpy as np
from torch.nn import Linear
import torch.nn.functional as F
from graphdatascience import GraphDataScience
import pandas as pd
from sentence_transformers import SentenceTransformer

import torch_geometric.transforms as T
from torch_geometric.nn import SAGEConv, to_hetero

from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected, RandomLinkSplit

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [4]:
gds = GraphDataScience("bolt://localhost:7687",auth=("neo4j","password"))

## Projecting the graph

In [5]:
node_projection = ["Investor", "Borrower"]
relationship_projection = {"INVESTED": {"orientation": "UNDIRECTED", "properties": "weight"}}

graph, res = gds.graph.project("recommend",node_projection,relationship_projection)

Loading:   0%|          | 0/100 [00:00<?, ?%/s]

## FastRP is used to convert nodes into embeddings using their topology

## Storing data back to the database (optional)

In [7]:
gds.fastRP.mutate(graph, mutateProperty='embedding', embeddingDimension=256, randomSeed=7474, relationshipWeightProperty='weight');

In [8]:
# Saving these embeddings in the db
# gds.graph.writeNodeProperties(g0, ["property_here"], ["name_of_node_here"])

gds.graph.writeNodeProperties(graph, ["embedding"])

writeMillis                 1716
graphName              recommend
nodeProperties       [embedding]
propertiesWritten          11000
Name: 0, dtype: object

In [7]:
gds.run_cypher('MATCH(n:Borrower) RETURN n.id, n.embedding LIMIT 3')

Unnamed: 0,n.id,n.embedding
0,68355089,"[-0.09213639795780182, 0.11805258691310883, 0...."
1,68341763,"[0.0119507210329175, 0.06319787353277206, -0.0..."
2,66310712,"[0.1054248958826065, 0.11477186530828476, -0.0..."


In [8]:
gds.run_cypher('MATCH(n:Investor) RETURN n.id, n.embedding LIMIT 3')

Unnamed: 0,n.id,n.embedding
0,1,"[-0.08319398015737534, -0.19792011380195618, 0..."
1,2,"[0.09166929125785828, -0.06774382293224335, -0..."
2,3,"[0.09203621745109558, -0.031740203499794006, -..."


## Loading nodes and edges

In [6]:
def load_node(cypher, index_col, encoders=None, **kwargs):
    global gds
    df = gds.run_cypher(cypher)
    df.set_index(index_col, inplace=True)
    
    # Define node mapping
    mapping = {index: i for i, index in enumerate(df.index.unique())}
    # Define node features
    x = None
    if encoders is not None:
        xs = [encoder(df[col]) for col, encoder in encoders.items()]
        x = torch.cat(xs, dim=-1)

    return x, mapping

In [7]:
def load_edge(cypher, src_index_col, src_mapping, dst_index_col, dst_mapping,
                  encoders=None, **kwargs):

    df = gds.run_cypher(cypher)
    # Define edge index
    src = [src_mapping[index] for index in df[src_index_col]]
    dst = [dst_mapping[index] for index in df[dst_index_col]]
    edge_index = torch.tensor([src, dst])
    # Define edge features
    edge_attr = None
    if encoders is not None:
        edge_attrs = [encoder(df[col]) for col, encoder in encoders.items()]
        edge_attr = torch.cat(edge_attrs, dim=-1)

    return edge_index, edge_attr

## We need to encode the features, we are using SentenceTransformer library to do so

In [8]:
class SequenceEncoder(object):
    # The 'SequenceEncoder' encodes raw column strings into embeddings.
    def __init__(self, model_name='all-MiniLM-L6-v2', device=None):
        self.device = device
        self.model = SentenceTransformer(model_name, device=device)

    @torch.no_grad()
    def __call__(self, df):
        x = self.model.encode(df.values, show_progress_bar=True,
                              convert_to_tensor=True, device=self.device)
        return x.cpu()

In [10]:
class IdentityEncoder(object):
    # The 'IdentityEncoder' takes the raw column values and converts them to
    # PyTorch tensors.
    def __init__(self, dtype=None, is_list=False):
        self.dtype = dtype
        self.is_list = is_list

    def __call__(self, df):
        if self.is_list:
            return torch.stack([torch.tensor(el) for el in df.values])
        return torch.from_numpy(df.values).to(self.dtype)

# Now defining Nodes and edges

In [11]:
investor_query = """
MATCH (m:Investor)
RETURN m.id AS investorId, m.inv_address AS investor_address, m.embedding AS embedding,
m.inv_experience AS investor_experience, m.inv_profession AS investor_profession
"""

investor_x, investor_mapping = load_node(
    investor_query, 
    index_col='investorId', encoders={
        'investor_address': SequenceEncoder(),
        'investor_experience': SequenceEncoder(),
        'investor_profession': SequenceEncoder(),
        'embedding': IdentityEncoder(is_list=True)
    })

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:01<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [12]:
borrower_query = """
MATCH (m:Borrower)
RETURN m.id AS borrowerId, m.addr_state AS borrower_address, m.embedding AS embedding,
m.emp_length AS borrower_experience, m.final_profession AS borrower_profession,
m.grade AS borrower_grade
"""

borrower_x, borrower_mapping = load_node(
    borrower_query, 
    index_col='borrowerId', encoders={
        'borrower_address': SequenceEncoder(),
        'borrower_experience': SequenceEncoder(),
        'borrower_profession': SequenceEncoder(),
        'borrower_grade': SequenceEncoder(),
        'embedding': IdentityEncoder(is_list=True)
    })

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

## Now loading invested_weight relationship

In [13]:
rating_query = """
MATCH (i:Investor)-[r:INVESTED]->(b:Borrower) 
RETURN i.id AS investorId, b.id AS borrowerId, r.weight*100 AS weight
"""

edge_index, edge_label = load_edge(
    rating_query,
    src_index_col='investorId',
    src_mapping=investor_mapping,
    dst_index_col='borrowerId',
    dst_mapping=borrower_mapping,
    encoders={'weight': IdentityEncoder(dtype=torch.long)},
)

In [60]:
edge_index

tensor([[   0,    0,    0,  ...,  999,  999,  999],
        [9436, 9238, 9216,  ...,  870,  655,  193]])

In [61]:
edge_label

tensor([20, 73,  3,  ..., 24,  1, 28])

# We have all the required information, now we can create PyG Graph

## In pyg it is necessary to create this mapping because it is 0 indexed, that's why every ID is mapped to corresponding 0 indexes

In [85]:
data = HeteroData()
# Add user node features for message passing:
data['investor'].x = investor_x
data['borrower'].x = borrower_x
data['investor', 'invests', 'borrower'].edge_index = edge_index
data['investor', 'invests', 'borrower'].edge_label = edge_label
data.to(device, non_blocking=True)
print("DONE")

DONE


### If we do not define num_nodes, it will give warning and throw error

The code data = ToUndirected()(data) converts the directed edges in the heterogeneous graph stored in the data object to undirected edges. This means that each edge will now be represented as a single connection between two nodes, without any directionality.

The code del data['borrower', 'rev_invests', 'investor'].edge_label deletes the 'edge_label' property of the edges connecting the 'borrower' and 'investor' nodes in the graph. This likely means that the original edge labels were no longer needed, or that the code no longer requires the information contained in the 'edge_label' property.

Finally, the code data['borrower'].num_nodes = 10000 sets the number of nodes in the 'borrower' node type to 10,000. This could be useful for tasks that require a fixed number of nodes, such as when using a neural network that expects a fixed input size.

## We have a total of 10000 borrowers and 1000 investors

In [86]:
data = ToUndirected()(data)
del data['borrower', 'rev_invests', 'investor'].edge_label  # Remove "reverse" label.
data['borrower'].num_nodes = 10000
data['investor'].num_nodes = 1000

In [87]:
# 2. Perform a link-level split into training, validation, and test edges.
transform = RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('investor', 'invests', 'borrower')],
    rev_edge_types=[('borrower', 'rev_invests', 'investor')],
)
train_data, val_data, test_data = transform(data)

# Data is preprared, now we can train the model

### SAGEConv((-1, -1), hidden_channels) creates a SAGEConv layer with variable-length input size, where the number of input channels is inferred at runtime based on the input data, and the number of output channels is specified by the hidden_channels argument.

In [75]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['investor'][row], z_dict['borrower'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder1 = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder1, data.metadata())
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [64]:
weight = torch.bincount(train_data['investor','borrower'].edge_label)

weight = weight.max() / weight

def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()

In [89]:
# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

In [95]:
model = Model(hidden_channels=64).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [90]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['investor', 'invests', 'borrower'].edge_label_index)
    target = train_data['investor', 'invests', 'borrower'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)

In [94]:
@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['investor', 'invests', 'borrower'].edge_label_index)
    pred = pred.clamp(min=0, max=100)
    target = data['investor', 'invests', 'borrower'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [96]:
from tqdm import tqdm
EPOCHS = 500
for epoch in tqdm(range(1, EPOCHS),total=len(range(1,EPOCHS))):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    if epoch%100 == 0 or epoch==EPOCHS-1:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

 20%|████████████████▏                                                               | 101/499 [00:16<00:52,  7.56it/s]

Epoch: 100, Loss: 3043.3687, Train: 32.5433, Val: 34.3241, Test: 34.3083


 40%|████████████████████████████████▏                                               | 201/499 [00:29<00:39,  7.55it/s]

Epoch: 200, Loss: 1309.0486, Train: 23.8041, Val: 28.5896, Test: 28.7575


 60%|████████████████████████████████████████████████▎                               | 301/499 [00:42<00:26,  7.56it/s]

Epoch: 300, Loss: 991.1718, Train: 19.8463, Val: 25.4334, Test: 26.0540


 80%|████████████████████████████████████████████████████████████████▎               | 401/499 [00:56<00:13,  7.52it/s]

Epoch: 400, Loss: 846.5872, Train: 21.3018, Val: 27.5374, Test: 28.3031


100%|████████████████████████████████████████████████████████████████████████████████| 499/499 [01:09<00:00,  7.21it/s]

Epoch: 499, Loss: 721.3732, Train: 19.2435, Val: 27.1072, Test: 27.7983





## We are taking 1 investor id and creating fake edges with every borrower that is present (investor id is from the mapping we have, the 0 indexed thing), and we are providing the edge label index that we created (the one with 1 investor and every borrower). The values that we get we are clamping them between 0 and 100 and then creating a mask out of the values and getting all the indexes from our edge index dict. These indexes can now be used to get values from our borrower index mapping

In [101]:
num_borrowers = len(borrower_mapping)
num_investors = len(investor_mapping)

reverse_borrower_mapping = dict(zip(borrower_mapping.values(),borrower_mapping.keys()))
reverse_investor_mapping = dict(zip(investor_mapping.values(),investor_mapping.keys()))

results = []
model.eval()
for investor_id in tqdm(range(0,num_investors),total=len(range(0,num_investors))): 

    row = torch.tensor([investor_id] * num_borrowers)
    col = torch.arange(num_borrowers)
    edge_label_index = torch.stack([row, col], dim=0)

    pred = model(data.x_dict, data.edge_index_dict,
                 edge_label_index)
    
    #any value less than 0 in pred will be set to 0, and any value greater than 100 will be set to 100.
    pred = pred.clamp(min=0, max=100)

    investor_neo4j_id = reverse_investor_mapping[investor_id]
    
    """
    This line of code creates a boolean mask mask for a PyTorch tensor pred, which is True 
    for all elements in pred that are greater than or equal to 80, and False otherwise"""
    
    """
    The nonzero() method then returns a tuple containing a tensor with the indices of the non-zero values,
    """
    mask = (pred >= 80).nonzero(as_tuple=True)

    ten_predictions = [reverse_borrower_mapping[el] for el in  mask[0].tolist()[:10]]
    results.append({'investor': investor_neo4j_id, 'borrower': ten_predictions})

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:32<00:00, 30.54it/s]


## Investor with id=1 would invest in borrowers having id 68341763,66310712 etc

In [102]:
results

[{'investor': 1,
  'borrower': [68341763,
   66310712,
   68466961,
   68495092,
   68607141,
   68533595,
   68407301,
   68615044,
   68387134,
   68584507]},
 {'investor': 2,
  'borrower': [66310712,
   68466961,
   68495092,
   68407301,
   68615044,
   68387134,
   68584507,
   68617057,
   68426699,
   68537564]},
 {'investor': 3,
  'borrower': [66310712,
   68466961,
   68495092,
   68407301,
   68615044,
   68387134,
   68584507,
   68426699,
   68537564,
   68585839]},
 {'investor': 4,
  'borrower': [68341763,
   66310712,
   68466961,
   68495092,
   68607141,
   68341789,
   68533595,
   68407301,
   68615044,
   68387134]},
 {'investor': 5,
  'borrower': [68466961,
   68495092,
   68407301,
   68615044,
   68584507,
   68426699,
   68537564,
   68585839,
   68476511,
   68426681]},
 {'investor': 6,
  'borrower': [68341763,
   66310712,
   68466961,
   68495092,
   68607141,
   68533595,
   68407301,
   68615044,
   68387134,
   68584507]},
 {'investor': 7,
  'borrower': [68