In [1]:
# @author: Núria Queralt Rosinach
# @date: 04-26-2018
# @version: v3

# Notebook goal

This notebook is intended to extract connectivity from Monarch for the NGLY1 graph with added regulatory edges (v3).
- seed nodes: NGLY1 network nodes (v3)
- output: /get-monarch-connections/monarch_connections_regulation_graph.tsv 

In [2]:
import os, sys
import requests
import json
import pandas as pd

In [3]:
# path to data
path = os.getcwd() + '/get-monarch-connections'
if not os.path.isdir(path): os.makedirs(path)

# read data
sys.path.insert(0, './get-monarch-connections/')

## Functions

In [4]:
# Get associations

# function get monarch associations
def hitMonarchApi(node = 'HGNC:17646', rows = 2000):
    '''This function performs api calls to Monarch for in and out edges. 
        It returns associated nodes'''
    
    # api address
    biolink = 'https://api.monarchinitiative.org/api/association'
    
    # parameters
    parameters = {'fl_excludes_evidence': False, 'rows': rows}
    
    # out edges: from/
    r_out = requests.get('{}/from/{}'.format(biolink,node),params=parameters)

    # in edges: to/
    r_in = requests.get('{}/to/{}'.format(biolink,node),params=parameters)

    return r_out, r_in 

# prepare monarch api response as a tuple but with the intemediate step to keep all node information
def getEdgesObjects(r_out, r_in):
    '''This function prepare the api object response from monarch to a tuple. 
       It returns three lists, one for subjects, relations, and objects, 
       where each list is a list of dictionaries where each dict is a node.
    '''
    
    # variables
    sub_l = list()
    rel_l = list()
    obj_l = list()
    ref_l = list()

    # compose list of dictionaries
    for associations in [r_out.json()['associations'], r_in.json()['associations']]:
        for association in associations:
            pub_l = list()
            sub_l.append(association['subject'])
            rel_l.append(association['relation'])
            obj_l.append(association['object'])  
            # add references to each association as a list of strings
            if association['publications']:
                for publication in association['publications']:
                    pub_l.append(publication['id'])
            else:
                pub_l.append('NA')
            ref_l.append('|'.join(pub_l))
    
    return sub_l, rel_l, obj_l, ref_l

def getEdges(sub_l, rel_l, obj_l, ref_l, attribute = 'id'):
    '''
       Function that builds triples with an attribute for each node. it returns a tuple with the edges
    '''
    edges = set()
    # compose tuple
    for i in range(len(sub_l)):
        sub = sub_l[i][attribute]
        rel = rel_l[i][attribute]
        obj = obj_l[i][attribute]
        ref = ref_l[i]
        edges.add((sub, rel, obj, ref))
        
    return edges


# add triple
def keepEdges(keep, new):
    '''This function adds triples in a tuple'''
    
    for edge in new:
        keep.add(edge)
        
    return keep


# Keep 1st layer of neighbour nodes
def keepNodes(keep,edges,seed):
    '''
       Function that keeps nodes in the edges
    '''
    
    for (sub, rel, obj, ref) in edges:
        #if ':.' in (sub or obj):
        #    continue
        #if 'Coriell' in (sub or obj):
        #    continue     
        #if 'MMRRC' in (sub or obj):
        #    continue   
        #if 'MONARCH' in (sub or obj):
        #    continue   
        if 'PMID' in sub or 'PMID' in obj:
            continue
        if rel == None:
            rel = 'None'
        if 'dc:source' in rel:
            continue
        if 'IAO:0000136' in rel: # is about
            continue
        if 'IAO:0000142' in rel: # mentions
            continue
        if sub not in seed:
            keep.add(sub)
        if obj not in seed:
            keep.add(obj)
            
    return keep


# gets the first layer of neighbours and relations
def getNeighbours(seedNodes):
    '''
       Function that gets the first layer of neighbours and relations
    '''
    
    keep_nodes = set()
    keep_edges = set()
    for node in seedNodes:
        try:
            r_out, r_in = hitMonarchApi(node)
            sub_l, rel_l, obj_l, ref_l = getEdgesObjects(r_out, r_in)
            edges = getEdges(sub_l, rel_l, obj_l, ref_l, 'id')
            keep_edges = keepEdges(keep_edges,edges)
            keep_nodes = keepNodes(keep_nodes,edges,seedNodes)    
        except json.decoder.JSONDecodeError:
            pass
        except:
            print('error: {}'.format(sys.exc_info()[0]))
            print(node)
                    
    return keep_nodes,keep_edges


# Keep orthologs/phenotypes in the layer of neighbour nodes
def keepNodeType(edges, seed, nodeType = 'ortho'):
    '''
       Function that keeps specific node types in the edges.
       Introduce arg according node type:
           nodeType = 'ortho' to keep orthologs 
           nodeType = 'pheno' to keep phenotypes or diseases
    '''
    
    propertyList = ['RO:HOM0000017', 'RO:HOM0000020']
    if nodeType == 'pheno':
        propertyList = ['RO:0002200', 'RO:0002607', 'RO:0002326', 'GENO:0000840']
        
    keep = set()
    for (sub, rel, obj, ref) in edges:
        if rel == None:
            continue
        if rel in propertyList:
            if sub not in seed:
                keep.add(sub)
            if obj not in seed:
                keep.add(obj)
            
    return keep


# Keep edges where nodes are in the network

# function to evaluate the intersection with network nodes
def filterEdges(nodes, edges):
    '''
       Filters all edges down to those which have both nodes in the nodes set
    '''
    nodes = set(nodes)
    keep = set()
    for (start, pred, stop, ref) in edges:
        if {start, stop} <= nodes:
            keep.add((start, pred, stop, ref))
            
    return keep
        

# add monarch annotation
def addAttributes(sub_l, rel_l, obj_l, edges):
    '''
       This functions adds attributes to each resource in the triple
    '''
    
    metaedges = set()
    for (sub_id, rel_id, obj_id, refs) in edges:
        for i in range(len(sub_l)):
            if sub_l[i]['id'] == sub_id and rel_l[i]['id'] == rel_id and obj_l[i]['id'] == obj_id:
                metaedges.add((sub_l[i]['id'],
                      sub_l[i]['label'],
                      rel_l[i]['id'],
                      rel_l[i]['label'],
                      obj_l[i]['id'],
                      obj_l[i]['label'],
                      refs)
                )
                break
    return metaedges


def printFile(path,fileName,data):
    '''Function to save output to file.'''
    
    with open('{}/{}.tsv'.format(path,fileName), 'w') as f:
        f.write('subject_id\tsubject_label\trelation_id\trelation_label\tobject_id\tobject_label\treference_id_list\n')
        for edge in data:
            edge = ['None' if t is None else t for t in edge]
            f.write('{}\n'.format('\t'.join(edge)))
    
    return print("File '{}/{}.tsv' saved.".format(path,fileName))

## Seed nodes

In [6]:
# get network nodes
print('\nReading network...')
import pandas as pd
path = "/home/nuria/workspace/ngly1-graph/regulation" + "/graph"
nodes_df = pd.read_csv('{}/graph_pre_monarch_connectivity_nodes_v2019-01-17.csv'.format(path))
network = set(list(nodes_df.id))
print(len(network))


Reading network...
9364


## Get connections

In [None]:
keep = set()
for node in network:
    try:
        r_out, r_in = hitMonarchApi(node, 1000)
        sub_l, rel_l, obj_l, ref_l = getEdgesObjects(r_out, r_in)
        edges = getEdges(sub_l, rel_l, obj_l, ref_l, 'id')
        filteredEdges = filterEdges(network, edges)
        metaFilteredEdges = addAttributes(sub_l, rel_l, obj_l, filteredEdges)
        keep = keepEdges(keep, metaFilteredEdges)
    except json.decoder.JSONDecodeError:
            pass
    except:
        print('error: {}'.format(sys.exc_info()[0]))
        print(node)

printFile(path,'monarch_connections_regulation_graph', keep)

error: <class 'KeyboardInterrupt'>
HGNC:9768
