In [1]:
# Welcome to Parallel Compute Hackathon
# =====================================
# This notebook uses the TigerGraph Python interface (pyTigerGraph) to
# calculate similarity on a Xilinx FPGA in these steps:
# 1. Load graph
# 2. Create Embeddings
# 3. Send embeddings to FPGA
# 4. Compute Cosine Similarity

In [2]:
# import modules
import sys
import os
from pathlib import Path
import pyTigerGraph as tg
import subprocess as sp
import random as rand
import concurrent.futures as cf
import time
import threading as th

In [3]:
# Login setup
# Every connection/path setting below can be overridden with an environment
# variable, so the notebook can be pointed at another server or account
# without editing this cell. The literals are the original defaults.
hostName = os.environ.get("TG_HOST", "xsjfislx14")          # TG server hostname
userName = os.environ.get("TG_USER", "sachink")             # TG user name
# SECURITY: a password literal in a shared notebook leaks the credential.
# The default is kept only so existing runs keep working; prefer exporting
# TG_PASSWORD and removing the literal.
passWord = os.environ.get("TG_PASSWORD", "Xilinx123")       # TG user password

# Query variables
populationSize = 1000                               # Size of the total patient population data
topK = 10                                           # Number of highest scoring patient matches
numDevices = 1                                      # Number of FPGA devices to distribute the queries to

# Path setup
# Directory holding the .gsql files (schema, loading job, queries) read by the
# graph-loading step. The default is a per-user absolute path; set
# TG_QUERY_LOCATION when running from another checkout.
queryLocation = Path(os.environ.get(
    "TG_QUERY_LOCATION",
    "C:/Users/sachink/ghe/graphanalytics/plugin/tigergraph/tests/cosine_nbor_ss_dense_int"))
# Location of synthea generated data (substituted for $sys.data_root in the loading job)
hostDataLocation = os.environ.get(
    "TG_DATA_ROOT",
    f"/proj/gdba/datasets/synthea/{populationSize}_patients/csv")

graphName = f'xgraph_{userName}_{populationSize}'   # TG graph name

In [4]:
# Utility Methods

def getPatient(id):
    """Look up the 'patients' vertex with the given id via the global `conn`.

    Returns the first entry of the list produced by `conn.getVerticesById`,
    or an empty list when that list has no entries.
    """
    found = conn.getVerticesById('patients', id)
    if len(found) == 0:
        return []
    return found[0]

def getPatientName(patient):
    """Return "<FIRST_NAME> <LAST_NAME>" built from patient['attributes']."""
    attributes = patient['attributes']
    firstName = attributes['FIRST_NAME']
    lastName = attributes['LAST_NAME']
    return ' '.join((firstName, lastName))

def printResults(result, newPatient):
    """Print the similarity matches in `result` for `newPatient`.

    The match list is read from result[0]['Matches']; every match is accessed
    through its 'Id' and 'score' entries. Output is one header line naming
    `newPatient`, followed by one "<score> <name>" line per match.

    getPatient returns an empty list when a vertex cannot be found. Such a
    match is printed with an "<unknown patient ...>" placeholder instead of
    being handed to getPatientName, which would raise on it.
    """
    matches = result[0]['Matches']
    print(f'Matches for patient {getPatientName(newPatient)}')
    for m in matches:
        matchingPatient = getPatient(m['Id'])
        if matchingPatient:
            label = getPatientName(matchingPatient)
        else:
            # Vertex lookup came back empty: keep reporting the remaining matches.
            label = f'<unknown patient {m["Id"]}>'
        print(f'{m["score"]} {label}')

In [5]:
# 1. Load Graph (only needs to be run once)
# =============

def _renderGsql(template, targetGraph, escapeQuotes=True, dataRoot=None):
    """Fill in the placeholders of a GSQL script that was read from disk.

    When escapeQuotes is true every double quote is doubled first
    (NOTE(review): presumably needed by the way conn.gsql forwards the
    text - confirm). Then '@graph' is replaced by targetGraph and, if
    dataRoot is given, '$sys.data_root' is replaced by dataRoot.
    """
    text = template.replace('"', '""') if escapeQuotes else template
    text = text.replace('@graph', targetGraph)
    if dataRoot is not None:
        text = text.replace('$sys.data_root', dataRoot)
    return text


# 1.1 Connect to the TG server through the dummy graph
#     (NOTE: 'xgraph_dummy' must already exist for the connection to work)
tgHost = 'http://' + hostName
conn = tg.TigerGraphConnection(host=tgHost, graphname='xgraph_dummy', username=userName, password=passWord)
print("\n--------- Creating New graph ----------")
createReply = conn.gsql(f'create graph {graphName}()', options=[])
print(createReply)

# Reconnect, this time bound to the newly created graph
print(f'Using graph {graphName}')
conn = tg.TigerGraphConnection(host=tgHost, graphname=graphName, username=userName, password=passWord)

# 1.2 Create the graph schema (this script is sent without quote doubling)
print("\n--------- Creating New Schema ----------")
schemaFile = queryLocation / "schema_xgraph.gsql"
with open(schemaFile) as schemaFh:
    qStr = _renderGsql(schemaFh.read(), graphName, escapeQuotes=False)
    print(conn.gsql(qStr, options=[]))

# 1.3 Load graph data: define the loading job, run it, then drop it
print("\n--------- Loading data into graph ----------")
loadFile = queryLocation / "load_xgraph.gsql"
with open(loadFile) as loadFh:
    qStr = _renderGsql(loadFh.read(), graphName, dataRoot=hostDataLocation)
    print(conn.gsql(qStr, options=[]))
    for jobCommand in ("RUN LOADING JOB load_xgraph", "DROP JOB load_xgraph"):
        print(conn.gsql(jobCommand, options=['-g', graphName]))

# 1.4 Install queries (both files are opened before either one is submitted)
print("\n--------- Installing Queries ----------")
baseQFile = queryLocation / "base.gsql"
clientQFile = queryLocation / "client.gsql"
with open(baseQFile) as bfh, open(clientQFile) as cfh:
    installSteps = (
        ("installing base queries ...", bfh),
        ("\ninstalling client queries ...", cfh),
    )
    for banner, queryFh in installSteps:
        print(banner)
        qStr = _renderGsql(queryFh.read(), graphName)
        print(conn.gsql(qStr, options=[]))


--------- Creating New graph ----------
Trying version: v3_1_0
Adding gsql-server host xsjfislx14:14240
If there is any relative path, it is relative to <System.AppRoot>/dev/gdk/gsql
The graph xgraph_sachink_1000 is created.

Using graph xgraph_sachink_1000

--------- Creating New Schema ----------
Trying version: v3_1_0
Adding gsql-server host xsjfislx14:14240
If there is any relative path, it is relative to <System.AppRoot>/dev/gdk/gsql
The tuple testResults is added.
Using graph 'xgraph_sachink_1000'
All queries are dropped.
The graph xgraph_sachink_1000 is dropped.
The graph xgraph_sachink_1000 is created.
The job job_schema_change_local is created.

Current graph version 0
Trying to add vertex immunizations.
Trying to add vertex allergies.
Trying to add vertex conditions.
Trying to add vertex imaging_studies.
Trying to add vertex procedures.
Trying to add vertex careplans.
Trying to add vertex patients.
Trying to add edge patient_HAD_immunization.
Trying to add edge patient_HAD_a

In [6]:
# 2. Create Embeddings
# ====================

# Run the installed caching query and report its wall-clock duration.
print('SW vector caching...')
cacheStart = time.perf_counter()
resultSwCache = conn.runInstalledQuery('client_cosinesim_load_cache', timeout=240_000_000)
cacheElapsedSec = time.perf_counter() - cacheStart
print(f'completed in {cacheElapsedSec:.4f} sec')

SW vector caching...
completed in 0.9269 sec


In [7]:
# 3. Send embeddings to FPGA
# ==========================

# Run the installed load query for `numDevices` devices and report its
# wall-clock duration.
print('Hw data load...')
hwLoadParams = {'numDevices': numDevices}
hwLoadStart = time.perf_counter()
resultHwLoad = conn.runInstalledQuery('client_cosinesim_load_alveo', hwLoadParams, timeout=240_000_000)
hwLoadElapsedSec = time.perf_counter() - hwLoadStart
print(f'completed in {hwLoadElapsedSec:.4f} sec\n')

Hw data load...
completed in 0.3600 sec



In [8]:
# 4. Compute Cosine Similarity
# ============================

print('Running Query...')
# Fetch up to 100 patient vertices and use one of them as the query target.
targetPatients = conn.getVertices('patients', limit=100)
if not targetPatients:
    raise RuntimeError("No 'patients' vertices were returned; load the graph data first.")
# rand.choice draws uniformly from whatever was returned. Indexing with
# rand.randint(1, 100) is wrong here: randint includes both endpoints, so it
# never selects element 0 and overruns the list (IndexError) for index 100 or
# whenever fewer than 101 vertices come back.
targetPatient = rand.choice(targetPatients)
queryParams = {'newPatient': targetPatient['v_id'], 'topK': topK, 'numDevices': numDevices}
result = conn.runInstalledQuery('client_cosinesim_match_alveo', queryParams, timeout=240000000)
printResults(result, targetPatient)
# Execution time as reported by the query itself in its 'ExecTimeInMs' field.
resTime = result[0]["ExecTimeInMs"]
print(f'completed in {resTime:.2f} msec')

Running Query...
Matches for patient Emory494 Ruecker817
1 Emory494 Ruecker817
0.92693 Oliver401 Hettinger594
0.92667 Jarrett354 Kuhlman484
0.90119 Noel608 Hegmann834
0.89546 Richard937 Lowe577
0.89246 Tamar361 Cremin516
0.89013 Merlin721 Franecki195
0.887 Abdul218 Abshire638
0.88639 Adalberto916 Gaylord332
0.88515 Arlen68 Robel940
completed in 7.08 msec
