In [1]:
#Import the right libraries
import pandas as pd
import dgl
import numpy as np
import networkx as nx
# from pyvis.network import Network
import torch
# import pygraphviz as pgv
from dgl.data.utils import save_graphs
from dgl.data.utils import load_graphs
import umap.umap_ as umap
import matplotlib.pyplot as plt
import json
import pickle

Using backend: pytorch


In [2]:
# Retrieve model
trained_model = torch.load('model.pth')

In [3]:
# Retrieve graph
g = load_graphs("dgl_graph")
G = g[0][0]

# Start of the github file

In [4]:
import scipy.io
import urllib.request
import dgl
import math
import numpy as np
from hgt_model import *
import argparse

In [5]:
torch.manual_seed(0)

# Retrieve embeddings values
n_epoch = 1
n_hid = 256
n_inp = 256
clip = 1.0
max_lr = 1e-3

In [6]:
def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

In [7]:
def train(model, G):
    best_val_acc = torch.tensor(0)
    best_test_acc = torch.tensor(0)
    train_step = torch.tensor(0)
    for epoch in np.arange(n_epoch) + 1:
        model.train()
        logits, h = model(G, 'investor/company')
        
        company_embeddings = h['company']
        investor_company_embeddings = h['investor/company']

        # The loss is computed only for labeled nodes.
#         loss = F.cross_entropy(logits[train_idx], labels[train_idx].to(device))
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        train_step += 1
        scheduler.step(train_step)
        if epoch % 5 == 0:
            model.eval()
            logits, h = model(G, 'investor/company')
            pred   = logits.argmax(1).cpu()
            train_acc = (pred[train_idx] == labels[train_idx]).float().mean()
            val_acc   = (pred[val_idx]   == labels[val_idx]).float().mean()
            test_acc  = (pred[test_idx]  == labels[test_idx]).float().mean()
            if best_val_acc < val_acc:
                best_val_acc = val_acc
                best_test_acc = test_acc
            print('Epoch: %d LR: %.5f Loss %.4f, Train Acc %.4f, Val Acc %.4f (Best %.4f), Test Acc %.4f (Best %.4f)' % (
                epoch,
                optimizer.param_groups[0]['lr'], 
                loss.item(),
                train_acc.item(),
                val_acc.item(),
                best_val_acc.item(),
                test_acc.item(),
                best_test_acc.item(),
            ))
            
    return company_embeddings, investor_company_embeddings

In [8]:
# Our way
different_edge = ('investor/company', 'different_invests_in', 'company')
same_edge = ('investor/company', 'same_invests_in', 'investor/company')

different_labels = G[different_edge].edges()
same_labels = G[same_edge].edges()

pid = G[different_edge].edges()[0]
labels = G[different_edge].edges()[1]

print(pid)
print(labels)

tensor([   0,    0,    0,  ..., 7882, 7882, 7882])
tensor([9181, 9701, 9676,  ..., 7817, 7818, 7819])


In [9]:
# generate train/val/test split
shuffle = np.random.permutation(pid)
train_idx = torch.tensor(shuffle[0:800]).long()
val_idx = torch.tensor(shuffle[800:900]).long()
test_idx = torch.tensor(shuffle[900:]).long()

In [10]:
node_dict = {}
edge_dict = {}

for ntype in G.ntypes:
    node_dict[ntype] = len(node_dict)
for etype in G.etypes:
    edge_dict[etype] = len(edge_dict)
    G.edges[etype].data['id'] = torch.ones(G.number_of_edges(etype), dtype=torch.long) * edge_dict[etype] 

#     Random initialize input feature
for ntype in G.ntypes:
    emb = nn.Parameter(torch.Tensor(G.number_of_nodes(ntype), 256), requires_grad = False)
    nn.init.xavier_uniform_(emb)
    G.nodes[ntype].data['inp'] = emb
    

# G = G.to(device)

In [11]:
model = HGT(G,
            node_dict, edge_dict,
            n_inp=n_inp,
            n_hid=n_hid,
            n_out=labels.max().item()+1,
            n_layers=2,
            n_heads=4,
            use_norm = True)#.to(device)
optimizer = torch.optim.AdamW(model.parameters())

checkpoint = torch.load("model.pth")
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, total_steps=n_epoch, max_lr = max_lr)

print('Training HGT with #param: %d' % (get_n_params(model)))

company_embeddings, investor_company_embeddings = train(model, G)

Training HGT with #param: 3961687




In [12]:
print(company_embeddings.shape)
print(investor_company_embeddings.shape)

torch.Size([9779, 256])
torch.Size([7883, 256])


In [13]:
import sklearn.neighbors
from lshashpy3 import LSHash
from tqdm import tqdm

In [14]:
reducer = umap.UMAP()
company_embeddings_umap = company_embeddings.detach().numpy()
investor_company_embeddings_umap = investor_company_embeddings.detach().numpy()

company_embeddings_umap = reducer.fit_transform(company_embeddings_umap)
investor_company_embeddings_umap = reducer.fit_transform(investor_company_embeddings_umap)

print(company_embeddings_umap)
print(investor_company_embeddings_umap)

[[ 1.6055737  5.243405 ]
 [11.974513   3.8144424]
 [-4.1740546  1.1391181]
 ...
 [ 1.4370649  5.262042 ]
 [11.248003   2.733968 ]
 [ 0.7707422  6.0300183]]
[[13.8582945 21.157558 ]
 [12.297024  22.095522 ]
 [14.996505  22.570297 ]
 ...
 [12.895972  23.39703  ]
 [12.832226  23.528786 ]
 [13.757732  20.32839  ]]


# Apply LSH before UMAP

In [25]:
lsh = LSHash(6, 256)

company_embeddings_list = company_embeddings.detach().numpy()

for com in company_embeddings_list:
    lsh.index(com)

In [26]:
knn_before_umap = []
for i in tqdm(range(0,company_embeddings_list.shape[0])):
    nn = lsh.query(company_embeddings_list[i], num_results=10, distance_func="euclidean")
    for ((vec,_),distance) in nn:
        knn_before_umap.append( (i, np.where(company_embeddings_list == vec)[0][0]) )
        
print(knn_before_umap)

100%|██████████| 9779/9779 [1:00:22<00:00,  2.70it/s]

[(0, 0), (0, 8884), (0, 3929), (0, 3762), (0, 1812), (0, 2206), (0, 1210), (0, 721), (0, 2314), (0, 7737), (1, 1), (1, 6385), (1, 6250), (1, 1639), (1, 1390), (1, 7338), (1, 5106), (1, 7385), (1, 3975), (1, 2123), (2, 2), (2, 6555), (2, 8125), (2, 1843), (2, 7401), (2, 616), (2, 1070), (2, 1175), (2, 9546), (2, 7905), (3, 3), (3, 7269), (3, 669), (3, 885), (3, 6344), (3, 1069), (3, 3318), (3, 2709), (3, 2742), (3, 2778), (4, 4), (4, 897), (4, 1570), (4, 2290), (4, 5007), (4, 3264), (4, 5691), (4, 3122), (4, 6799), (4, 8320), (5, 5), (5, 5331), (5, 1175), (5, 369), (5, 7410), (5, 9453), (5, 2941), (5, 6790), (5, 5007), (5, 2411), (6, 6), (6, 7824), (6, 8879), (6, 9240), (6, 1450), (6, 4096), (6, 5), (6, 5572), (6, 9269), (6, 7786), (7, 7), (7, 5493), (7, 2287), (7, 5512), (7, 6324), (7, 3351), (7, 6680), (7, 7039), (7, 5576), (7, 5301), (8, 8), (8, 2911), (8, 7837), (8, 6057), (8, 1463), (8, 7387), (8, 77), (8, 6876), (8, 2698), (8, 1981), (9, 9), (9, 5802), (9, 3347), (9, 2337), (9, 80




In [27]:
# company_embeddings_list = company_embeddings.detach().numpy()
# neigh1 = sklearn.neighbors.kneighbors_graph(company_embeddings_list, n_neighbors=5, metric='cosine')

In [28]:
# knn_before_umap = []
# for i in tqdm(range(len(neigh1.nonzero()[0]))):
#      knn_before_umap.append( (neigh1.nonzero()[0][i],neigh1.nonzero()[1][i]) )
        
# print(knn_before_umap)

# Apply KNN after UMAP

In [29]:
# neigh2 = sklearn.neighbors.kneighbors_graph(company_embeddings_umap, n_neighbors=5, metric='euclidean')
neigh2 = sklearn.neighbors.kneighbors_graph(company_embeddings_umap, n_neighbors=10, metric='euclidean', include_self=True)

In [30]:
knn_after_umap = []
for i in tqdm(range(len(neigh2.nonzero()[0]))):
     knn_after_umap.append( (neigh2.nonzero()[0][i],neigh2.nonzero()[1][i]) )
        
print(knn_after_umap)

100%|██████████| 97790/97790 [03:09<00:00, 515.87it/s]

[(0, 0), (0, 1797), (0, 1776), (0, 7440), (0, 4825), (0, 7087), (0, 9714), (0, 8252), (0, 5987), (0, 7485), (1, 1), (1, 8867), (1, 4458), (1, 3569), (1, 5236), (1, 2380), (1, 7630), (1, 6909), (1, 8402), (1, 4828), (2, 2), (2, 6555), (2, 6834), (2, 7303), (2, 4615), (2, 1843), (2, 846), (2, 2580), (2, 8552), (2, 9079), (3, 3), (3, 7145), (3, 3336), (3, 7538), (3, 7625), (3, 2791), (3, 4319), (3, 7465), (3, 1261), (3, 5643), (4, 4), (4, 8132), (4, 7018), (4, 1570), (4, 4022), (4, 5438), (4, 8649), (4, 7127), (4, 3494), (4, 3980), (5, 5), (5, 9043), (5, 4628), (5, 2993), (5, 7558), (5, 819), (5, 6940), (5, 5900), (5, 3700), (5, 6447), (6, 6), (6, 4887), (6, 312), (6, 6701), (6, 3049), (6, 7070), (6, 1686), (6, 4072), (6, 3982), (6, 7824), (7, 7), (7, 1030), (7, 5493), (7, 1200), (7, 5392), (7, 293), (7, 9011), (7, 82), (7, 8977), (7, 8263), (8, 8), (8, 5023), (8, 5138), (8, 2911), (8, 8676), (8, 4001), (8, 2731), (8, 4250), (8, 123), (8, 2641), (9, 9), (9, 6662), (9, 9720), (9, 6485), (9




In [31]:
overlap = set(knn_before_umap) & set(knn_after_umap)
result = float(len(overlap)) / len(knn_before_umap) * 100
print(result)

13.51119112893326
