In [None]:
import time
import os
from itertools import product
import math
from collections import Counter

from lark import Lark, Transformer, v_args

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity


import networkx as nx
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from stellargraph.data import UniformRandomWalk, BiasedRandomWalk, UniformRandomMetaPathWalk
from stellargraph import StellarGraph, StellarDiGraph, datasets
from neo4j import GraphDatabase

import matplotlib.pyplot as plt
from IPython.display import display, HTML

A regular-expression-like grammar for writing succinct path patterns through a graph. Every expression must start with a node identifier, which should correspond to a node label in the Neo4j database (or `?` to match a node with any label).

Examples of valid grammar statements:

<i>"chemical_substance treats> disease"</i> -> this would search for any node with the label <b>chemical_substance</b> connected to nodes of type <b>disease</b> with an edge of type <i>treats</i>.

<i>"chemical_substance ? disease"</i> -> this would search for any node with the label <b>chemical_substance</b> connected to nodes of type <b>disease</b> by an edge of any type.


In [None]:
# Lark grammar for path patterns. A pattern is a node, optionally followed by edge/node pairs.
#   - node / edge labels: runs of letters, digits, "_" or "-" (LABEL_STRING)
#   - "?"                : a node or edge without a label constraint (NULL)
#   - ">" / "<" after an edge label mark its direction (edge_right / edge_left)
#   - "|"                : alternation between nodes, edges or sub-paths (node_or / edge_or / path_or)
#   - "*" / "+" after a node label: repetition (rep_from_0 / rep_from_1)
# The names after "->" are aliases; CalculateTree defines a method of the same name for each one.
# NOTE(review): common.CNAME is imported as NAME, but no rule in this grammar references NAME.
# NOTE(review): the second alternative of LABEL_STRING ("_") is already covered by LETTER+.
regex_grammar = """
    start: node 
         | node (edge node)+
         | node "(" path ")"

       
    ?path: edge_node
         | path "|" edge_node   -> path_or 
    
    ?edge_node: edge node
         | "(" edge_node+ ")" 
         

         
    ?edge: attm
         | edge "|" attm        -> edge_or
         
    ?attm: EDGE_LABEL           -> edge
         | attm ">"             -> edge_right
         | attm "<"             -> edge_left
         | NULL                 -> edge_no_label
         | "(" edge ")"     
    
    ?node: atom
        | node "|" atom         -> node_or

    ?atom: NODE_LABEL           -> node
         | NODE_LABEL "*"       -> rep_from_0
         | NODE_LABEL "+"       -> rep_from_1
         | NULL                 -> node_no_label
         | "(" node ")"

    EDGE_LABEL: LABEL_STRING
    NODE_LABEL: LABEL_STRING
    NULL: "?"
    LCASE_LETTER: "a".."z"
    UCASE_LETTER: "A".."Z"
    DIGIT: "0".."9"

    LETTER: UCASE_LETTER | LCASE_LETTER | DIGIT | "_" | "-"
    LABEL_STRING: LETTER+ | "_" 

    %import common.CNAME -> NAME
    %import common.WS_INLINE
    %ignore WS_INLINE
"""


Lark transformer. Converts a parsed subgraph regular expression into Cypher path fragments. Each method below is invoked for the grammar rule or alias (the name after `->`) of the same name.

In [None]:
def _flatten_alternatives(left, right):
    """Merge two alternation operands into one flat list of candidate fragments.

    Either operand may already be a list (an earlier alternation or a
    repetition), so chained patterns such as ``a|b|c`` yield ``[a, b, c]``
    instead of the nested ``[[a, b], c]``.
    """
    merged = []
    for operand in (left, right):
        if isinstance(operand, list):
            merged.extend(operand)
        else:
            merged.append(operand)
    return merged


def _wrap_fragment(fragment, prefix, suffix):
    """Add ``prefix``/``suffix`` to a fragment, or to every fragment of a list of alternatives."""
    if isinstance(fragment, list):
        return [prefix + item + suffix for item in fragment]
    return prefix + fragment + suffix


def _repetition_fragments(first_idx, label):
    """Return the one-node and two-node Cypher fragments used for ``label*`` / ``label+``."""
    single = "(n{}:".format(first_idx) + label + ")"
    double = single + "--" + "(n{}:".format(first_idx + 1) + label + ")"
    return single, double


@v_args(inline=True)    # Affects the signatures of the methods
class CalculateTree(Transformer):
    """Turn the parse tree of a path pattern into Cypher path fragments.

    Each method is the callback for the grammar rule/alias of the same name and
    returns either one fragment (a string) or a list of alternative fragments.
    Nodes are numbered n1, n2, ... and edges r1, r2, ... in the order they are
    reduced by the parser.
    """

    # Class-level starting values; ``self.x += 1`` creates a per-instance counter on first use.
    node_idx = 0
    edge_idx = 0

    def node(self, name):
        """Labelled node, e.g. ``(n1:gene)``."""
        self.node_idx += 1
        return "(n{}:".format(self.node_idx) + str(name) + ")"

    def edge(self, name):
        """Labelled, undirected edge, e.g. ``-[r1:treats]-``."""
        self.edge_idx += 1
        return "-[r{}:".format(self.edge_idx) + str(name) + "]-"

    def node_no_label(self, name):
        """Node without a label constraint (pattern ``?``); ``name`` is the ``?`` token and is ignored."""
        self.node_idx += 1
        return "(n{}".format(self.node_idx) + ")"

    def edge_no_label(self, name):
        """Edge without a type constraint (pattern ``?``); ``name`` is the ``?`` token and is ignored."""
        self.edge_idx += 1
        return "-[r{}".format(self.edge_idx) + "]-"

    def edge_node(self, name1, name2):
        """Concatenate an edge fragment with the node fragment that follows it."""
        return name1 + name2

    def edge_right(self, name):
        """Mark an edge (or every edge of an alternation) as pointing right: ``-[..]->``."""
        return _wrap_fragment(name, "", ">")

    def edge_left(self, name):
        """Mark an edge (or every edge of an alternation) as pointing left: ``<-[..]-``."""
        return _wrap_fragment(name, "<", "")

    def rep_from_0(self, name):
        """``label*``: alternatives for zero, one or two consecutive nodes of this label.

        The empty string stands for "zero occurrences"; extractPathways removes it later.
        """
        self.node_idx += 1
        single, double = _repetition_fragments(self.node_idx, str(name))
        self.node_idx += 1  # the two-node alternative consumes a second node number
        return ['', single, double]

    def rep_from_1(self, name):
        """``label+``: alternatives for one or two consecutive nodes of this label."""
        self.node_idx += 1
        single, double = _repetition_fragments(self.node_idx, str(name))
        self.node_idx += 1  # the two-node alternative consumes a second node number
        return [single, double]

    def node_or(self, name1, name2):
        """Alternation between nodes; returns a flat list of candidate fragments."""
        return _flatten_alternatives(name1, name2)

    def edge_or(self, name1, name2):
        """Alternation between edges; returns a flat list of candidate fragments."""
        return _flatten_alternatives(name1, name2)

    def path_or(self, name1, name2):
        """Alternation between sub-paths; returns a flat list of candidate fragments."""
        return _flatten_alternatives(name1, name2)

In [None]:
def extractPathways(parsed):
    """Expand a transformed parse tree into every concrete path it can match.

    Parameters
    ----------
    parsed : object with a ``children`` sequence
        Each child is either a single fragment (a string) or a list of
        alternative fragments for that position.

    Returns
    -------
    list of list of str
        One list of fragments per combination of alternatives. An empty
        string (an optional node that is absent) is removed together with the
        edge leading to it; when the absent node is the first element, the
        edge that follows it is removed instead.
    """
    # One list of candidates per position. isinstance (not type ==) so that str
    # subclasses are treated as a single fragment rather than iterated per character.
    choices = [
        [child] if isinstance(child, str) else child
        for child in parsed.children
    ]

    pathways = []
    for combination in product(*choices):
        path = list(combination)
        # Remove every absent optional node, not only the first one.
        while '' in path:
            idx = path.index('')
            path.pop(idx)
            if idx > 0:
                path.pop(idx - 1)   # edge that led to the absent node
            elif path:
                path.pop(0)         # no preceding edge: drop the edge that followed it
        pathways.append(path)
    return pathways

def getQueries(source_node_name, regexes):
    """Build the Cypher query that retrieves every path matching a pattern.

    Parameters
    ----------
    source_node_name : str
        Value the ``name`` property of the first node of each path must equal.
    regexes : str
        Path pattern written in ``regex_grammar``.

    Returns
    -------
    str
        One ``MATCH p1=... WHERE ... RETURN c`` query per concrete pathway,
        joined with ``UNION``; an empty string when there is no pathway.
    """
    regex_parser = Lark(regex_grammar, parser='lalr', transformer=CalculateTree())
    parsed = regex_parser.parse(regexes)
    all_pathways = extractPathways(parsed)

    # Escape backslashes and single quotes so a name such as "St. John's wort"
    # stays inside the single-quoted Cypher string literal.
    safe_name = str(source_node_name).replace("\\", "\\\\").replace("'", "\\'")

    queries = []
    for path in all_pathways:
        if not path:
            # Every element was an absent optional node: nothing to match.
            continue

        # "(n1:label)" or "(n1)" -> "n1"
        start_node_num = path[0].split(":")[0].split("(")[1].split(")")[0]
        query = "MATCH p1=" + ''.join(str(elem) for elem in path)
        query += " WHERE %s.name = '%s'" % (start_node_num, safe_name)

        # When a node type occurs more than once in the path, require the later
        # node to differ from the first node of that type.
        # seen_types stays index-aligned with path (one entry per element).
        seen_types = []
        for elem in path:
            if ":" in elem:  # a type is given
                type_part = elem.split(":")[1]
                if type_part in seen_types and ")" in elem:  # repeated type on a node
                    prev = path[seen_types.index(type_part)].split(":")[0].split("(")[1]
                    current = elem.split(":")[0].split("(")[1]
                    query += " AND %s <> %s" % (prev, current)
                seen_types.append(type_part)
            else:
                seen_types.append("?")

        query += " WITH collect(p1) as nodez UNWIND nodez as c RETURN c"
        queries.append(query)

    return " UNION ".join(queries)

In [None]:
def parsing(regexes):
    """Return the (node, edge, node) label triples described by a path pattern.

    The pattern is parsed and expanded into concrete pathways; for each pathway
    the label of every fragment is collected ('?' when the fragment carries no
    label) and the labels are cut into overlapping windows of three that start
    at every second position.
    """
    parser = Lark(regex_grammar, parser='lalr', transformer=CalculateTree())
    pathways = extractPathways(parser.parse(regexes))

    triples = []
    for pathway in pathways:
        labels = []
        for fragment in pathway:
            if ':' not in fragment:
                # Unlabelled node or edge.
                labels.append('?')
                continue
            after_colon = fragment.split(':')[1]
            if '(' in fragment:
                labels.append(after_colon.split(')')[0])
            if '[' in fragment:
                labels.append(after_colon.split(']')[0])

        # Windows [0:3], [2:5], [4:7], ... -> node, edge, node.
        for window in range(len(labels) // 2):
            begin = window * 2
            triples.append(labels[begin:begin + 3])

    return triples


def getSubgraph_neo4j(uri, source_node_name, regexes, compared_labels = None):
    """Fetch the subgraph matching a path pattern around one node as a StellarDiGraph.

    Parameters
    ----------
    uri : str
        Address passed to ``GraphDatabase.driver``.
    source_node_name : str
        ``name`` of the node every matched path starts from.
    regexes : str
        Path pattern written in ``regex_grammar``.
    compared_labels : iterable of str, optional
        Fallback labels used to type a node when none of its labels appears
        in the pattern.

    Returns
    -------
    StellarDiGraph
        Nodes are indexed by their ``name`` property and grouped by the chosen
        node type; edges carry the relationship type in the ``label`` column.
    """
    queryStr = getQueries(source_node_name, regexes)

    # Every node and edge label mentioned in the pattern.
    user_labels = set()
    for triple in parsing(regexes):
        user_labels.update(triple)

    nodes_by_type = {}
    rels = set()

    driver = GraphDatabase.driver(uri)
    try:
        with driver.session() as session:
            graph = session.run(queryStr).graph()

            seen_names = set()
            for node in graph.nodes:
                node_name = node['name']
                if node_name in seen_names:
                    continue
                seen_names.add(node_name)

                # Sorted so the chosen type does not depend on set iteration order.
                labels = sorted(node.labels)
                # Preference: a label from the pattern, then a label from
                # compared_labels, then the first label of the node.
                candidates = [m for m in labels if m in user_labels]
                if not candidates and compared_labels is not None:
                    candidates = [m for m in labels if m in compared_labels]
                node_type = candidates[0] if candidates else labels[0]

                nodes_by_type.setdefault(node_type, set()).add(node_name)

            for rel in graph.relationships:
                rels.add((rel.start_node["name"], rel.end_node["name"], rel.type))
    finally:
        # Release the connection pool even when the query fails.
        driver.close()

    edges = pd.DataFrame.from_records(list(rels), columns=["source", "target", "label"])

    data_frames = {}
    for node_type, names in nodes_by_type.items():
        data_frames[node_type] = pd.DataFrame({"name": list(names)}).set_index("name")

    return StellarDiGraph(data_frames, edges=edges, edge_type_column="label")

In [None]:
#Returns counts of all node by labels in graph and all relationships by types.
def infoDict(subG):
    """Collect the bracketed counts printed by ``subG.info()``.

    Every line of the report that contains '[' contributes one entry: the key
    is the text before the first ':' (stripped) and the value is the text
    between the brackets that follow it, kept as a string.
    """
    counts = {}
    report_lines = subG.info().split('\n')
    for line in report_lines:
        if '[' not in line:
            continue
        pieces = line.split(':')
        label = pieces[0].strip()
        inside_brackets = pieces[1].split('[')[1]
        counts[label] = inside_brackets.split(']')[0]
    return counts

In [None]:
# find union subgraph for two drugs

def querySubgraph(G, regexes, queryStr, compared_labels = None):
    """Run a ready-made Cypher query and return its result as a StellarDiGraph.

    Parameters
    ----------
    G : str
        Address passed to ``GraphDatabase.driver``.
    regexes : str
        Path pattern written in ``regex_grammar``; only used to decide which
        label becomes the type of a node that has several labels.
    queryStr : str
        Cypher query to execute.
    compared_labels : iterable of str, optional
        Fallback labels used to type a node when none of its labels appears
        in the pattern.

    Returns
    -------
    StellarDiGraph
        Nodes are indexed by their ``name`` property and grouped by the chosen
        node type; edges carry the relationship type in the ``label`` column.
    """
    # Every node and edge label mentioned in the pattern.
    user_labels = set()
    for triple in parsing(regexes):
        user_labels.update(triple)

    nodes_by_type = {}
    rels = set()

    driver = GraphDatabase.driver(G)
    try:
        with driver.session() as session:
            graph = session.run(queryStr).graph()

            seen_names = set()
            for node in graph.nodes:
                node_name = node['name']
                if node_name in seen_names:
                    continue
                seen_names.add(node_name)

                # Sorted so the chosen type does not depend on set iteration order.
                labels = sorted(node.labels)
                # Preference: a label from the pattern, then a label from
                # compared_labels, then the first label of the node.
                candidates = [m for m in labels if m in user_labels]
                if not candidates and compared_labels is not None:
                    candidates = [m for m in labels if m in compared_labels]
                node_type = candidates[0] if candidates else labels[0]

                nodes_by_type.setdefault(node_type, set()).add(node_name)

            for rel in graph.relationships:
                rels.add((rel.start_node["name"], rel.end_node["name"], rel.type))
    finally:
        # Release the connection pool even when the query fails.
        driver.close()

    edges = pd.DataFrame.from_records(list(rels), columns=["source", "target", "label"])

    data_frames = {}
    for node_type, names in nodes_by_type.items():
        data_frames[node_type] = pd.DataFrame({"name": list(names)}).set_index("name")

    return StellarDiGraph(data_frames, edges=edges, edge_type_column="label")

In [None]:
# find semantic ratio for walks
        
def _first_edge_types(graph):
    """Map each directed (source, target) pair of ``graph`` to the type of its first listed edge."""
    edge_types = {}
    # A truthy first positional argument asks edges() for (source, target, type) triples.
    for source, target, edge_type in graph.edges(True):
        edge_types.setdefault((source, target), edge_type)
    return edge_types


def semanticRatio_walks(regexes, Walks, subGs):
    """Fraction of walk elements (nodes and edges) whose type appears in the pattern.

    Parameters
    ----------
    regexes : str
        Path pattern written in ``regex_grammar``.
    Walks : iterable of list
        Random walks, each a list of node ids.
    subGs : StellarGraph or dict of StellarGraph
        The graph the walks were taken on, or a dict of such graphs; a node or
        edge is looked up in each graph in turn and the first hit is used.

    Returns
    -------
    float
        ``matches / elements`` rounded to 4 decimals, where every visited node
        and every step between consecutive nodes counts as one element.
        Returns 0.0 when there is nothing to count.

    Notes
    -----
    A node or edge that cannot be found in any graph is reported and counted
    as a non-match.
    NOTE(review): a '?' in the pattern only matches a type literally named
    '?'; confirm whether wildcards are meant to count as matches.
    """
    llink = parsing(regexes)
    print(llink)

    pattern_labels = set()
    for triple in llink:
        pattern_labels.update(triple)

    graphs = list(subGs.values()) if isinstance(subGs, dict) else [subGs]
    # Built once instead of re-listing nodes/edges for every walk step.
    node_sets = [set(graph.nodes()) for graph in graphs]
    edge_lookups = [_first_edge_types(graph) for graph in graphs]
    node_label_cache = {}

    def node_label_of(node):
        if node not in node_label_cache:
            label = None
            for graph, members in zip(graphs, node_sets):
                if node in members:
                    label = graph.node_type(node)
                    break
            else:
                print('WEIRD. Node not in subG1 and subG2.')
            node_label_cache[node] = label
        return node_label_cache[node]

    def edge_label_of(node1, node2):
        # Per graph: try the walk direction first, then the reverse direction.
        for lookup in edge_lookups:
            for pair in ((node1, node2), (node2, node1)):
                if pair in lookup:
                    return lookup[pair]
        print("WEIRD. Edge not in subG1 and subG2.")
        return None

    num = 0
    den = 0
    for walk in Walks:
        # matching nodes
        for node in walk:
            if node_label_of(node) in pattern_labels:
                num += 1
            den += 1

        # matching edges
        for step in range(len(walk) - 1):
            if edge_label_of(walk[step], walk[step + 1]) in pattern_labels:
                num += 1
            den += 1

    print(num, den)

    if den == 0:
        return 0.0
    return round(num/den,4)

In [None]:
# Build one subgraph per node in node_list and record its semantic ratio.
# NOTE(review): node_list, G and user_input are not assigned in any earlier cell of the
# visible notebook (G and user_input are first assigned in a later cell), so this cell
# only runs with leftover kernel state.
# NOTE(review): semanticRatio is not defined in the visible notebook -- confirm where it comes from.
subGs = {}
SRdict = {}
for i in node_list:
    print('building subgraph for ',i)
    subG = getSubgraph_neo4j(G, i, user_input)
    subGs[i] = subG
    SR = semanticRatio(user_input, subG)
    SRdict[i] = SR
# A bare expression in the middle of a cell is not rendered; only the last one (SRdict) is.
subGs
SRdict

# Generate Random Walks 

In [None]:
# Configuration for the semantic experiment.
# G is the address handed to GraphDatabase.driver; user_input is the path pattern;
# compared_labels are the fallback labels used to type multi-labelled nodes.
G = "bolt://robokopkg.renci.org"
user_input = "chemical_substance decreases_activity_of> gene causes> disease"
compared_labels = ['chemical_substance', 'gene', 'disease']

# Drug pairs to compare, one [first, second] pair per entry.
Node_List = [
    ['dexamethasone', 'canagliflozin'],
    ['afatinib', 'captopril'],
    ['escitalopram', 'losartan'],
    ['betamethasone', 'enalapril'],
    ['dapagliflozin', 'nifedipine'],
    ['escitalopram', 'felodipine'],
]
#['simvastatin', 'alendronate sodium trihydrate'],
    
def buildAndWalkSubgraphs(G, node_list, user_input, compared_labels, walk_lengths=(10, 20, 40, 60, 80)):
    """Compare two nodes through random-walk embeddings of their pattern subgraphs.

    For every node in ``node_list`` a subgraph is fetched with
    ``getSubgraph_neo4j``. Then, for every walk length, uniform random walks
    are started from each node, a Word2Vec model is trained on all walks, and
    the first two nodes of ``node_list`` are compared.

    Parameters
    ----------
    G : str
        Address of the Neo4j server.
    node_list : list of str
        Node names; the first two entries are the pair being compared.
    user_input : str
        Path pattern written in ``regex_grammar``.
    compared_labels : iterable of str or None
        Fallback labels used to type multi-labelled nodes.
    walk_lengths : iterable of int
        Maximum walk lengths to evaluate, one result row each.

    Returns
    -------
    list of list
        One row per walk length:
        ``[node_list, SRdict, walk length, cosine similarity, rank, number of
        ranked nodes, rank / number, rank among nodes of the same type,
        number of nodes of the same type, semantic ratio of the walks]``.
    """
    Results = []
    subGs = {}
    SRdict = {}
    for node in node_list:
        print('building subgraph for ',node)
        subG = getSubgraph_neo4j(G, node, user_input, compared_labels)
        subGs[node] = subG
        # NOTE(review): semanticRatio is not defined in the visible notebook cells -- confirm where it comes from.
        SRdict[node] = semanticRatio(user_input, subG)

    n2 = node_list[0]
    n3 = node_list[1]

    def node_type_of(node_id):
        # Look the node up in each subgraph in node_list order; first hit wins.
        for graph in subGs.values():
            if node_id in graph.nodes():
                return graph.node_type(node_id)
        print('Not in n2 and n3')
        return 'none'

    for l in walk_lengths:
        print('** walk length = ', l, '**')

        Walks = []
        for node in node_list:
            print('===',node, '===')
            start = time.time()
            subG = subGs[node]

            # adj_l is only reported; the walks below use the requested length l.
            graph_size = len(list(subG.nodes()))
            adj_l = min(l, graph_size)
            print('walk length = ', adj_l)

            # DeepWalk
            rw = UniformRandomWalk(subG)
            walks = rw.run(
                nodes=[node],   # root nodes
                length=l,       # maximum length of a random walk
                n=5             # number of random walks per root node
            )
            print("DeepWalk:")
            print("Number of random walks: {}".format(len(walks)))
            print(len(walks[0]))

            Walks.extend(walks)

            # count the time
            end = time.time()
            print("Time spend: ", end - start)

        # Generate embeddings
        str_walks = [[str(n) for n in walk] for walk in Walks]
        model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=5)

        # Retrieve node embeddings and the node ids they belong to
        node_ids = model.wv.index2word  # list of node IDs
        node_embeddings = model.wv.vectors

        # Check similarity from embedding vectors
        print(n2, n3)
        print(node_ids.index(n2))
        print(node_ids.index(n3))
        n2_embeddings = node_embeddings[node_ids.index(n2)]
        n3_embeddings = node_embeddings[node_ids.index(n3)]
        sim = cosine_similarity([n2_embeddings], [n3_embeddings])

        # semantic ratio for walks
        SR_walks = semanticRatio_walks(user_input, Walks, subGs)
        print(SR_walks)

        # Nearest neighbours of n2, computed once and used for both rankings.
        neighbours = model.wv.most_similar(n2, topn=2000)

        # check rank for all nodes
        num = len(neighbours)
        rank = 0
        for position, entry in enumerate(neighbours, start=1):
            if entry[0] == n3:
                print(position, entry)
                rank = position

        print('rank = ', rank,"/",num)
        # Guard against an empty neighbour list (vocabulary of a single node).
        prop = round(rank/num, 4) if num else 0.0

        # rank among nodes that have the same type as n2
        n2_type = subGs[n2].node_type(n2)
        num_ex = 0
        n_all = 0
        rank_ex = 0
        for entry in neighbours:
            n_all += 1
            if node_type_of(entry[0]) == n2_type:
                num_ex += 1
                if entry[0] == n3:
                    print(num_ex, entry)
                    rank_ex = num_ex

        print('rank exclude not Compound = ', rank_ex, '/', num_ex)
        print('number of drugs', num_ex)
        print('number of nodes', n_all)

        print(node_list, SRdict, l, sim.tolist()[0][0], rank, num, prop, rank_ex, num_ex, SR_walks)
        result_row = [node_list, SRdict, l, sim.tolist()[0][0], rank, num, prop, rank_ex, num_ex, SR_walks]
        Results.append(result_row)
    return Results

In [None]:
# Run the semantic pipeline for every drug pair. buildAndWalkSubgraphs returns one
# row per walk length, so the rows of all pairs are concatenated before tabulating.
Results = []
for node_list in Node_List:
    Results.extend(buildAndWalkSubgraphs(G, node_list, user_input, compared_labels))

df = pd.DataFrame(Results, columns=['drug_pairs', 'SR_subgraphs', 'walk_length', 'cos_sim', 'rank', 'num_nodes', 'prop', 'rank_ex', 'num_nodes_ex', 'SR_walks'])
df.to_csv('robokop_neg_semantic.csv', index=False)
df

In [None]:
### Check common nodes in subgraphs


#node_list = ['Canagliflozin', 'Dapagliflozin'] #0.31
#node_list = ['Dexamethasone', 'Betamethasone'] #0.479
#node_list = ['Lapatinib', 'Afatinib'] #0.156
#node_list = ['Captopril', 'Enalapril'] #0.346
#node_list = ['Losartan', 'Valsartan'] #0.322
#node_list = ['Nifedipine', 'Felodipine'] #0.305
#node_list = ['Simvastatin', 'Atorvastatin'] #0.216
#node_list = ['Alendronate', 'Incadronate'] #0.041 @@
#node_list = ['Citalopram', 'Escitalopram'] #0.412

# negative
#node_list =['Dexamethasone', 'Voglibose'] # 0.006
#node_list =['Lapatinib', 'Voglibose'] # 0

#node_list =['Dexamethasone', 'Canagliflozin'] # 0.118
#node_list = ['Afatinib', 'Captopril'] # 0.168
#node_list = ['Simvastatin', 'Alendronate'] # 0.104
#node_list = ['Escitalopram', 'Losartan'] # 0.175
#node_list = ['Betamethasone', 'Enalapril'] # 0.21
#node_list = ['Dapagliflozin', 'Nifedipine'] # 0.294
#node_list = ['Atorvastatin', 'Incadronate'] # 0.012
node_list = ['Citalopram', 'Felodipine'] # 0.284



#'Methylphenobarbital', 'Talbutal', 'Amobarbital', 'Etidronic acid'
list_compare = []
for i in node_list:

    # getSubgraph_neo4j is the pattern-based subgraph builder defined in this notebook.
    subG = getSubgraph_neo4j(G, i, user_input)
    list_subG = list(subG.nodes())
    print('# total nodes = ', len(list_subG))
    list_compare.append(list_subG)

# Shared nodes relative to the summed sizes of the two subgraphs.
list_u = set(list_compare[0]).intersection(set(list_compare[1]))
print(len(list_u))
total_nodes = len(list_compare[0])+len(list_compare[1])
print(total_nodes)
# Two empty subgraphs share nothing; avoid dividing by zero.
ratio = len(list_u)/total_nodes if total_nodes else 0.0
print(ratio)

# Non-Semantic

In [None]:
#user_input = "Compound ? (Gene|Disease) ASSOCIATES_DaG< Disease"

# Drug pairs for the non-semantic run, one [first, second] pair per entry.
Node_List = [
    ['Dexamethasone', 'Betamethasone'],
    ['Lapatinib', 'Afatinib'],
    ['Captopril', 'Enalapril'],
    ['Losartan', 'Valsartan'],
    ['Nifedipine', 'Felodipine'],
    ['Simvastatin', 'Atorvastatin'],
    ['Citalopram', 'Escitalopram'],
]

#['Canagliflozin', 'Dapagliflozin'],['Alendronate', 'Incadronate'],
# Node_List = [
#             ['Afatinib', 'Captopril'],['Simvastatin', 'Alendronate'],['Escitalopram', 'Losartan'],
#             ['Betamethasone', 'Enalapril'],['Dapagliflozin', 'Nifedipine'],
#             ['Citalopram', 'Felodipine']]

#['Dexamethasone', 'Voglibose'],['Lapatinib', 'Voglibose'],['Atorvastatin', 'Incadronate'],['Dexamethasone', 'Canagliflozin'],

# Node_List = [['dexamethasone', 'betamethasone'],
#  ['canagliflozin', 'dapagliflozin'],
#  ['lapatinib', 'afatinib'],
#  ['captopril', 'enalapril'],
#  ['losartan', 'valsartan'],
#  ['nifedipine', 'felodipine'],
#  ['simvastatin', 'atorvastatin'],
# ['fluconazole', 'voriconazole']]

In [None]:
# Display the fallback labels used to type multi-labelled nodes (assigned in the configuration cell).
compared_labels

In [None]:
# Non-semantic baseline: the subgraph of each drug is every 2-hop outgoing path
# (excluding gene nodes) instead of the paths matching the pattern in user_input.
Results = []
walk_length = [10, 20, 40, 60, 80]

for node_list in Node_List:
    subGs = {}
    SRdict = {}
    for i in node_list:
        print('building subgraph for ',i)
        query = "Match (n0)-[r1]->(n1)-[r2]->(n2) Where n0.name = '%s' AND NOT n1:gene AND NOT n2:gene Return * limit 1000"% i
        
        #query = "Match (n0)-[r1]->(n1)-[r2]->(n2) Where n0.name = '%s' Return * limit 1000"% i
        print(query)
        subG = querySubgraph(G, user_input, query, compared_labels)#getSubgraph_neo4j(G, i, user_input)
        subGs[i] = subG
        # NOTE(review): semanticRatio is not defined in the visible notebook cells -- confirm where it comes from.
        SR = semanticRatio(user_input, subG)
        SRdict[i] = SR

    cosineSim = []
    for l in walk_length:
        print('** walk length = ', l, '**')
        Walks = []
        nodeDict = {}
        for i in node_list:
            print('===',i, '===')
            subG = subGs[i]
            
            # appropriate size of walk length (Mar 14)
            # adj_l is only reported; the walks below use the requested length l.
            graph_size = len(list(subG.nodes()))
            adj_l = min(l, graph_size)
            print('walk length = ', adj_l)


            # appropriate size of walk length (Mar 14)
            #graph_size = len(list(subG.nodes()))
            #adj_wlength = min(l, round(2 * graph_size))


            # DeepWalk
            rw = UniformRandomWalk(subG) #BiasedRandomWalk(G)
            walks = rw.run(
                nodes= [i], #list(G.nodes()),  # root nodes
                length = l,#adj_wlength,  # maximum length of a random walk
                n = 5 #,  # number of random walks per root node
                #seed = 1
            )
            print("DeepWalk:")
            print("Number of random walks: {}".format(len(walks)))
            #print(len(walks[0]))

            # Node2Vec
    #         rw = BiasedRandomWalk(subG)
    #         walks = rw.run(
    #             nodes= [i],  # root nodes
    #             length = adj_l,  # maximum length of a random walk
    #             n = 5,  # number of random walks per root node
    #             p = 2,  # Defines (unormalised) probability, 1/p, of returning to source node
    #             q = 0.5#,  # Defines (unormalised) probability, 1/q, for moving away from source node
    #             #seed = 5
    #         )

    #         print("DeepWalk:")
    #         print("Number of random walks: {}".format(len(walks)))
    #         print(len(walks[0]))
    #         #print(walks)


    #         # Metapath2vec
    #         metapath = [['Compound','Gene','Disease', 'Gene', 'Compound']]#,['Compound','Disease','Compound']]
    #         rw = UniformRandomMetaPathWalk(subG)

    #         walks = rw.run(
    #             nodes= [i],#list(G.nodes()),  # root nodes
    #             length = l,  # maximum length of a random walk
    #             n = 5,  # number of random walks per root node
    #             metapaths = metapath#,
    #             #seed = 5
    #         )
    #         print("Metapath2vec:")
    #         print("Number of random walks: {}".format(len(walks)))
    #         print(len(walks[0]))
    #         #print(walks)


            # append walks
            for w in walks:
                Walks.append(w)

                for node_id in w:
                    if len(w)==1:
                        # A walk of length one holds only its root node i.
                        nodeDict[node_id] = subG.node_type(i) 
                    else:
                        nodeDict[node_id] = subG.node_type(node_id) 


        # Generate embeddings

        str_walks = [[str(n) for n in walk] for walk in Walks]
        model = Word2Vec(str_walks, size=128, window=5, min_count=0, sg=1, workers=2, iter=5)


        # Retrieve node embeddings and corresponding subjects
        node_ids = model.wv.index2word  # list of node IDs
        node_embeddings = (
            model.wv.vectors
        )
        # Check similarity from embedding vectors
        n2 = node_list[0]
        n3 = node_list[1]
        print(n2, n3)
        print(node_ids.index(n2))
        print(node_ids.index(n3))
        n2_embeddings = node_embeddings[node_ids.index(n2)]
        n3_embeddings = node_embeddings[node_ids.index(n3)]

        sim = cosine_similarity([n2_embeddings], [n3_embeddings])
        cosineSim.append(sim)


        # semantic ratio for walks
        SR_walks = semanticRatio_walks(user_input, Walks, subGs)
        print(SR_walks)

        # check rank for all nodes
        num = 0
        rank = 0
        for i in model.wv.most_similar(n2,topn=2000):
            num += 1
            if i[0] == n3:
                print(num, i)
                rank = num

        print('rank = ', rank,"/",num)
        # Guard against an empty neighbour list (vocabulary of a single node).
        prop = round(rank/num, 4) if num else 0.0

        # rank include only Compound
        num_ex = 0
        n_all = 0
        rank_ex = 0
        for i in model.wv.most_similar(n2, topn = 2000):
            n_all += 1
            #print(i)
            if i[0] in subGs[n2].nodes():
                #print('in n2')
                nodeType = subGs[n2].node_type(i[0])
            elif i[0] in subGs[n3].nodes():
                #print('in n3')
                nodeType = subGs[n3].node_type(i[0])
            else:
                print('Not in n2 and n3')
                nodeType = 'none'

            if nodeType == subGs[n2].node_type(n2):
                num_ex += 1
                if i[0] == n3:
                    print(num_ex, i)
                    rank_ex = num_ex

        print('rank exclude not Compound = ', rank_ex, '/', num_ex)
        print('number of drugs', num_ex)
        print('number of nodes', n_all)

        print(node_list, SRdict, l, sim.tolist()[0][0], rank, num, prop, rank_ex, num_ex, SR_walks)
        result_row = [node_list, SRdict, l, sim.tolist()[0][0], rank, num, prop, rank_ex, num_ex, SR_walks]
        Results.append(result_row)

Results

In [None]:
# Tabulate the rows collected in Results (one per drug pair and walk length),
# save them as CSV and display the table.
df = pd.DataFrame(
    Results,
    columns=[
        'drug_pairs',
        'SR_subgraphs',
        'walk_length',
        'cos_sim',
        'rank',
        'num_nodes',
        'prop',
        'rank_ex',
        'num_nodes_ex',
        'SR_walks',
    ],
)
df.to_csv('robokop_neg_nonsemantic_lower.csv', index=False)
df

In [None]:
# Semantic ratio of the walks left over from the last run above (last drug pair, last walk length).
# Note: semanticRatio_walks counts both the visited nodes and the edges between them.
semanticRatio_walks(user_input, Walks, subGs)

In [None]:

# Number of distinct walked nodes per node type. nodeDict maps node id -> node type and
# holds the entries of the last walk length of the last drug pair processed above.
print(Counter(nodeDict.values()))

In [None]:
# Node type of n3 (second drug of the last pair) in subG, i.e. whichever subgraph
# was assigned last by the cells above.
subG.node_type(n3)

In [None]:
# positive pairs
#node_list = ['Canagliflozin', 'Dapagliflozin'] #0.271
#node_list = ['Dexamethasone', 'Betamethasone'] #0.0981
#node_list = ['Lapatinib', 'Afatinib'] #0.0903
#node_list = ['Captopril', 'Enalapril'] #0.1106
node_list = ['Losartan', 'Valsartan'] #?
#node_list = ['Nifedipine', 'Felodipine'] #0.0628
#node_list = ['Simvastatin', 'Atorvastatin'] #0.0467
#node_list = ['Alendronate', 'Incadronate'] #0.1075
#node_list = ['Citalopram', 'Escitalopram'] #0.3623

# negative pairs
#node_list =['Dexamethasone', 'Voglibose'] # 0.0508
#node_list =['Lapatinib', 'Voglibose'] # 0.0447

#node_list =['Dexamethasone', 'Canagliflozin'] # 0.066
#node_list = ['Afatinib', 'Captopril'] # 0.104
#node_list = ['Simvastatin', 'Alendronate'] # 0.105
#node_list = ['Escitalopram', 'Losartan'] # 0.0807
#node_list = ['Betamethasone', 'Enalapril'] # 0.078
#node_list = ['Dapagliflozin', 'Nifedipine'] # 0.0775
#node_list = ['Atorvastatin', 'Incadronate'] # 0.0194
#node_list = ['Citalopram', 'Felodipine'] # 0.0913


list_compare = []
for i in node_list:
    #list_subG = []
    
    #query = "Match (n0)-[r1]->(n1)-[r2]->(n2)-[r3]-(n3) Where n0.name = '%s' Return * limit 200"% i
    query = "Match (n0)-[r1]->(n1)-[r2]->(n2) Where n0.name = '%s' Return * limit 1000"% i
    
    print(query)
    # querySubgraph is the raw-query subgraph builder defined in this notebook;
    # user_input and compared_labels only steer how multi-labelled nodes are typed.
    subG = querySubgraph(G, user_input, query, compared_labels)
    
    
    list_subG = list(subG.nodes())
    print('# total nodes = ', len(list_subG))
    list_compare.append(list_subG)


# Shared nodes relative to the summed sizes of the two subgraphs.
list_u = set(list_compare[0]).intersection(set(list_compare[1]))
print('# common nodes = ',len(list_u))
total_nodes = len(list_compare[0])+len(list_compare[1])
print(total_nodes)
# Two empty subgraphs share nothing; avoid dividing by zero.
ratio = len(list_u)/total_nodes if total_nodes else 0.0
print(ratio)

<h1>Examples</h1>

In [None]:
# Example: build the pattern subgraph around one compound and time the round trip.
# NOTE(review): this cell queries 'bolt://neo4j.het.io/' with capitalised labels
# (Compound, Gene, Disease) rather than the endpoint stored in G -- presumably the
# public Hetionet instance; confirm it is still reachable.
start = time.time()
compound = 'Simvastatin'
example_expression = "Compound ? (Gene|Disease) ASSOCIATES_DaG< Disease"
sg = getSubgraph_neo4j('bolt://neo4j.het.io/', compound, example_expression)
end = time.time()
print(sg.info())
# The elapsed time covers only the getSubgraph_neo4j call, not the info() printout.
print("Time spent: ", end - start)

In [None]:
# Bracketed counts from sg.info() for the example subgraph, as a dict of strings.
infoDict(sg)