In [None]:
# @author: Núria Queralt Rosinach
# @date: 04-30-2018
# @version: v3

## Prepare Monarch edges to build the graph
* build statements file
* build concepts file

In [None]:
import os, sys
import pandas as pd
import datetime
sys.path.insert(0, '/home/nuria/soft/utils3/lib')
import abravo_lib as utils

In [None]:
# Date stamp (today's date); it is embedded in the names of the generated files.
today = datetime.date.today()

# Data folder: <current working directory>/graph, created when it is missing.
path = '{}/graph'.format(os.getcwd())
if not os.path.isdir(path):
    os.makedirs(path)

# Put ./graph/ at the front of the module search path.
sys.path.insert(0, './graph/')

## Input: Monarch statements

In [None]:
# Load the tab-separated Monarch connections of the regulation graph
# and preview the first two rows.
edges_df = pd.read_csv('./graph/monarch_connections_regulation_graph.tsv', sep='\t')
edges_df.head(2)

## Generate statement file

##### NEO4J file features
- Include 'NA'
- One row = one association, with all of its references collapsed into a single string (a `|`-separated list)
- References as URIs:
    - PMIDs are merged into one multi-term PubMed URL, separated by a pipe (`|`) from the other URLs
    - PubMed: base_uri/{pmid,pmid,pmid,no_pmid}; base_uri: https://www.ncbi.nlm.nih.gov/pubmed/
    - Others: expand the prefix to the correct URL for non-PMID references, e.g. react:, ... (first as a list of IDs)

In [None]:
# Build the statements (edges) file: one row per association, with the
# relation id/label/IRI columns and every reference expanded to a URI or 'NA'.

# authoritative url for each publication/reference prefix
uriPrefixes_dct = {
    'pmid': 'https://www.ncbi.nlm.nih.gov/pubmed/', #'http://identifiers.org/pubmed/', 
    'react': 'http://reactome.org/content/detail/', #'http://identifiers.org/reactome/', 
    'zfin': 'http://zfin.org/',
    'go_ref': 'http://purl.obolibrary.org/obo/go/references/', #'http://identifiers.org/go.ref/GO_REF:', 
    'mgi': 'http://www.informatics.jax.org/accession/MGI:', #'http://identifiers.org/mgi/MGI:'
    'flybase': 'http://flybase.org/reports/',
    'wormbase': 'http://www.wormbase.org/resources/paper/',
    'isbn-13': 'ISBN-13:',
    'hpo': 'http://compbio.charite.de/hpoweb/showterm?id=HP:',
    'isbn-10': 'ISBN-10:'
}
# source/database
dbPrefixes_dct = {
    'na': 'NA',
    'mgi': 'http://www.informatics.jax.org/',
    'fb': 'http://flybase.org/',
    'rgd': 'http://rgd.mcw.edu/', 
    'zfin': 'http://zfin.org/',
    'sgd': 'https://www.yeastgenome.org/'
}
ref_text = 'NA'
ref_date = 'NA'


def _expand_references(refs_field):
    """Turn a '|'-separated reference field into a '|'-separated string of URIs.

    References without ':' are looked up in dbPrefixes_dct (databases or 'NA').
    References that start with 'PMID' are gathered into one multi-term PubMed
    URL, which is appended after the other URIs. Any other 'prefix:id'
    reference is expanded through uriPrefixes_dct.

    Returns 'NA' when the field holds no reference at all.
    Raises KeyError for a database name or prefix missing from the dictionaries.
    """
    ref_uri_l = list()
    pmid_l = list()
    for ref in refs_field.strip().split('|'):
        # an empty token carries no reference; looking it up would raise KeyError('')
        if not ref:
            continue
        # NA or database
        if ':' not in ref:
            ref_uri_l.append(dbPrefixes_dct[ref.lower()])
            continue
        # publications: split on the first ':' only, so identifiers that
        # contain further colons do not break the unpacking
        pref, uriId = ref.split(':', 1)
        # separate pmid from non pmid
        if ref.startswith('PMID'):
            pmid_l.append(uriId)
        else:
            ref_uri_l.append(uriPrefixes_dct[pref.lower()] + uriId)
    # create multi-term pubmed url
    if pmid_l:
        ref_uri_l.append(uriPrefixes_dct['pmid'] + ','.join(pmid_l))
    return '|'.join(ref_uri_l) if ref_uri_l else 'NA'


# both files are opened in one 'with' so each handle is closed, even on error
with open('./graph/monarch_edges_v{}.tsv'.format(today), 'w') as f, \
        open('./graph/monarch_connections_regulation_graph.tsv') as edges_file:
    f.write(
        'subject_id\tproperty_id\tobject_id\treference_uri\treference_supporting_text\treference_date\tproperty_label\tproperty_description\tproperty_uri\n'
           )
    # skip the header row of the input
    next(edges_file, None)
    for row in edges_file:
        # ignore blank rows (e.g. a trailing empty line)
        if not row.strip():
            continue
        edge = row.strip('\n').split('\t')
        # the references are read from the last column
        ref_uri_list = _expand_references(edge[-1])
        # write the associations + list of references as uri or NA;
        # split() never yields None, so a missing value shows up as ''
        sub_id = edge[0] or 'NA'
        rel_id = edge[2] or 'NA'
        obj_id = edge[4] or 'NA'
        rel_label = edge[3] or 'NA'
        # no property description is taken from the input
        rel_def = 'NA'
        if ':' in rel_id:
            rel_iri = 'http://purl.obolibrary.org/obo/' + rel_id.replace(':', '_')
        else:
            rel_iri = rel_id
        f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(sub_id, rel_id, obj_id, ref_uri_list, ref_text, ref_date, rel_label, rel_def, rel_iri))

## Generate concept file

In [None]:
# semantic groups dictionary
# collide concepts: register every subject and object ID once
concept_dct = dict()
with open('./get-monarch-connections/monarch_connections.tsv') as connections_file:
    # skip the header row
    next(connections_file, None)
    for row in connections_file:
        # ignore blank rows (e.g. a trailing empty line)
        if not row.strip():
            continue
        fields = row.strip('\n').split('\t')
        sid = fields[0]
        oid = fields[4]
        concept_dct[sid] = {}
        concept_dct[oid] = {}
print(len(concept_dct.keys()))

# list of concept prefixes (the text before the first ':' of each ID)
conceptPrefix_dct = dict()
for concept in concept_dct:
    conceptPrefix_dct[concept.split(':')[0]] = 1
print(conceptPrefix_dct.keys())

# build conceptPrefix2semantic dict
# Each semantic group is paired with the substrings that select it; the groups
# are tried in this order and the first one with a matching substring wins.
semanticGroupMarkers_l = [
    ('VARI', ('variant',)),
    ('DISO', ('phenotype', 'mondo', 'omim', 'doid', 'mesh', 'hp', 'mp', 'fbcv', 'fbbt', 'zp', 'apo', 'trait')),
    ('GENE', ('gene', 'hgnc', 'ensembl', 'mgi', 'flybase', 'wormbase', 'xenbase', 'zfin', 'rgd', 'sgd')),
    ('PHYS', ('react', 'kegg-path', 'go')),
    ('ANAT', ('uberon', 'cl')),
    ('GENO', ('coriell', 'monarch', 'mmrrc')),
]
conceptPrefix2semantic_dct = dict()
for prefix in conceptPrefix_dct:
    prefix = prefix.lower()
    # fallback group for prefixes that match nothing below
    semantic_group = 'CONC'
    if prefix == '':
        # Only an empty prefix maps to 'GENO' here: a substring test against ''
        # matches every prefix and would make the 'CONC' fallback unreachable.
        # NOTE(review): assumes the empty marker was meant for IDs without a
        # prefix -- confirm against the data.
        semantic_group = 'GENO'
    else:
        for group, markers in semanticGroupMarkers_l:
            if any(marker in prefix for marker in markers):
                semantic_group = group
                break
    conceptPrefix2semantic_dct[prefix] = semantic_group

In [None]:
# concept attribute dictionaries: id integration of sub and obj IDs in a common data structure
concept_dct = dict()
with open('./get-monarch-connections/monarch_connections.tsv') as connections_file:
    # skip the header row
    next(connections_file, None)
    for row in connections_file:
        # ignore blank rows (e.g. a trailing empty line)
        if not row.strip():
            continue
        fields = row.strip('\n').split('\t')
        # id: integration of sub and obj IDs in a unique data structure
        sid, slab = fields[0], fields[1]
        oid, olab = fields[4], fields[5]
        # build the concept data structure; an ID seen again is overwritten,
        # so the label from its last occurrence is the one kept
        for concept_id, concept_label in ((sid, slab), (oid, olab)):
            concept_dct[concept_id] = {
                'preflabel': concept_label,
                # group of the lower-cased ID prefix; 'CONC' when the prefix is unknown
                'semantic_groups': conceptPrefix2semantic_dct.get(concept_id.split(':')[0].lower(), 'CONC'),
                'synonyms': 'NA',
                'definition': 'NA'
            }

# check: everything needed for a spot check is already in concept_dct,
# e.g. list(concept_dct.items())[:5] -- no second pass over the file is required

In [None]:
# Write the concepts (nodes) file: one tab-separated row per concept in concept_dct.
nodes_dir = './add-connections-to-net'
# open(..., 'w') does not create missing folders, so make sure the target exists
if not os.path.isdir(nodes_dir):
    os.makedirs(nodes_dir)

with open('{}/monarch_nodes_v{}.tsv'.format(nodes_dir, today), 'w') as f:
    f.write(
        'id\tsemantic_groups\tpreflabel\tsynonyms\tdescription\n'
           )
    for concept, attributes in concept_dct.items():
        # a missing attribute is written as the string 'None', because .get() returns None
        semantic = attributes.get('semantic_groups')
        preflabel = attributes.get('preflabel')
        synonyms = attributes.get('synonyms')
        # the 'definition' attribute fills the 'description' column
        definition = attributes.get('definition')
        f.write('{}\t{}\t{}\t{}\t{}\n'.format(concept,semantic,preflabel,synonyms,definition))