# Querying WikiData for hetnet edges

In [1]:
import json
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm import tqdm_notebook
import wdhetnetbuilder as wdh

In [2]:
# Point the query builder at the node and edge definition files kept under ../0_data/manual
net_info_dir = Path('../0_data/manual').resolve()
node_info_path = net_info_dir / 'node_info.json'
edge_info_path = net_info_dir / 'edge_info.json'
h = wdh.WDHetnetQueryBuilder(node_info_path, edge_info_path)

## Defining the structure of the metagraph

In [3]:
# Metaedges to query, one dict of keyword arguments for `h.build_query_from_abbrev` each.
# Results are later stored per abbreviation, so every abbreviation must appear only once:
# a repeated entry re-runs the same query and overwrites its own result.
hetnet_edges = [
    {'abbrev': 'CdiC'},
    {'abbrev': 'CtD'},
    #{'abbrev': 'PPaiC'},
    {'abbrev': 'CHhcC'},
    {'abbrev': 'PWhpC'},
    {'abbrev': 'CpP'},
    {'abbrev': 'PiwC'},
    {'abbrev': 'VntC'},
    {'abbrev': 'VptC'},
    {'abbrev': 'DaP', 'target': 'Gene'},
    {'abbrev': 'DaG'},
    {'abbrev': 'DsyS'},
    {'abbrev': 'DmsMS'},
    {'abbrev': 'CHsyS'},
    {'abbrev': 'CHsyD'},
    {'abbrev': 'VndD'},
    {'abbrev': 'VpdD'},
    {'abbrev': 'VvP', 'target': 'Gene'},
    {'abbrev': 'VvG'},
    {'abbrev': 'PWhpP', 'target': 'Gene'},
    {'abbrev': 'PWhpG'},
    {'abbrev': 'PccCC'},
    {'abbrev': 'PbpBP'},
    {'abbrev': 'PmfMF'},
    {'abbrev': 'PhpPD'},
    {'abbrev': 'PhpSS'},
    {'abbrev': 'PFhpP'},
    {'abbrev': 'PhpBS'},
    {'abbrev': 'PhpAS'},
    {'abbrev': 'PhpSM'},
    #{'abbrev': 'PPtaD'},
    {'abbrev': 'CrCR'},
    {'abbrev': 'DlA'},
    {'abbrev': 'CHafA'},
    {'abbrev': 'CtCH'},
    {'abbrev': 'BPhpC'},
    {'abbrev': 'PccA'},
    {'abbrev': 'PWhpBP'},
    {'abbrev': 'PFhpBS'},
    {'abbrev': 'PDhpSS'},
    {'abbrev': 'PFhpSS'},
    # NOTE(review): 'PWhpBP' used to be listed a second time here; the repeat was dropped.
    # If a different metaedge was intended in this slot, add it back under its own abbreviation.
    {'abbrev': 'PFhpPD'},
    {'abbrev': 'PFhpAS'},
    {'abbrev': 'PregBP'}
]

In [4]:
# Build one query per metaedge with the node definitions as loaded from node_info.json
queries = []
for edge_kwargs in hetnet_edges:
    queries.append(h.build_query_from_abbrev(**edge_kwargs))

An error was found in the February 2018 Data Dump... The majority of Biological Process nodes are missing their `instance_of Biological Process` statement (`wdt:P31 wd:Q2996394`), leading to a severely decreased number of edges with these node types.

Because biological processes are also the target of the `Biological Process` property (`wdt:P682`), we can rely on that property, together with a check for a GO Term Identifier (`wdt:P686`), to recover these edges.

In [5]:
ini_queries_2_2018 = [h.build_query_from_abbrev(**edge) for edge in hetnet_edges]

# Biological Process nodes lost their `wdt:P31 wd:Q2996394` statements in the February 2018 dump.
# The biological process property (wdt:P682) between the protein and the process is still used,
# and the typing line is swapped for a check that the node has a GO id (wdt:P686).
bp_typing_swaps = [
    ("""    ?biological_process wdt:P31 wd:Q2996394 .""",
     """    ?biological_process wdt:P686 ?go_id ."""),
    ("""    ?biological_process1 wdt:P31 wd:Q2996394 .""",
     """    ?biological_process1 wdt:P686 ?go_id1 ."""),
    ("""    ?biological_process2 wdt:P31 wd:Q2996394 .""",
     """    ?biological_process2 wdt:P686 ?go_id2 ."""),
]

queries_2_2018 = []
for query_text in ini_queries_2_2018:
    # Apply the swaps in the listed order; a query without the typing line is left as is
    for old_typing, go_id_check in bp_typing_swaps:
        query_text = query_text.replace(old_typing, go_id_check)
    queries_2_2018.append(query_text)

A similar problem was found back in early 2017: Genes and proteins were `subclass of` Gene or Protein... not `instance of`...  Disease was a mess, with some `subclass of` some `instance of` and some both... fixing these for our 2017 queries

In [6]:
# Fix gene and protein
h.node_info['Gene']['subclass'] = True
h.node_info['Protein']['subclass'] = True

# Update the class with the new info 
# TODO: Add an update node method that re-runs this auto-magically...
h.subclass = h._extract_node_key('subclass')
h.extend = h._extract_node_key('extend')

ini_queries_2017 = [h.build_query_from_abbrev(**edge) for edge in hetnet_edges]

# Disease are sometimes 'instance_of', sometimes 'subclass_of', so we will ectend both...
queries_2017 = []
for q in ini_queries_2017:
    queries_2017.append(q.replace("""    # Initial typing for Disease
    ?disease wdt:P31 wd:Q12136 .""", """    # Initial typing for Disease
    ?disease wdt:P31|wdt:P279* wd:Q12136 ."""))

In [7]:
# Show one generated query as an example (Compound-treats-Disease)
example_query = h.build_query_from_abbrev('CtD')
print(example_query)

SELECT DISTINCT ?compound ?compoundLabel ?disease ?diseaseLabel 
WHERE {

    # Initial typing for Compound
    ?compound wdt:P31 wd:Q11173 .

    # Initial typing for Disease
    ?disease wdt:P31 wd:Q12136 .

    { ?compound wdt:P2175/^wdt:P279? ?disease }
    UNION { ?disease wdt:P279?/wdt:P2176 ?compound }

    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGAGE],en" }
}


In [8]:
# SPARQL endpoint -> date of the data it serves.
# The public endpoint is live, so it is labelled with today's date; the others are fixed dumps.
live_date = datetime.today().strftime('%Y-%m-%d')

endpoints = {'https://query.wikidata.org/sparql': live_date}
endpoints.update({
    'http://avalanche.scripps.edu:9988/bigdata/sparql': '2018-11-12',
    'http://avalanche.scripps.edu:9999/bigdata/sparql': '2018-02-05',
    'http://kylo.scripps.edu:9988/bigdata/sparql': '2017-01-16',
})

In [9]:
results = dict()

# Sorting by URL puts the local http:// instances before the live https:// endpoint,
# so errors on the local instances surface before the live query starts.
for ep, dump_date in tqdm_notebook(sorted(endpoints.items()), desc='All Endpoints'):

    # Each dump needs the query set written for the data-model quirks of its period
    if dump_date.startswith('2017'):
        to_query = queries_2017
    elif dump_date.startswith('2018-02'):
        to_query = queries_2_2018
    else:
        to_query = queries

    edge_progress = tqdm_notebook(zip(hetnet_edges, to_query),
                                  desc=dump_date+' Data',
                                  total=len(hetnet_edges))

    # Results for one dump, stored per metaedge abbreviation
    results[dump_date] = {
        meta_edge['abbrev']: wdh.execute_sparql_query(query, endpoint=ep)
        for meta_edge, query in edge_progress
    }

HBox(children=(IntProgress(value=0, description='All Endpoints', max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, description='2018-11-12 Data', max=43), HTML(value='')))

HBox(children=(IntProgress(value=0, description='2018-02-05 Data', max=43), HTML(value='')))

HBox(children=(IntProgress(value=0, description='2017-01-16 Data', max=43), HTML(value='')))

HBox(children=(IntProgress(value=0, description='2019-09-13 Data', max=43), HTML(value='')))




In [10]:
# One column per dump date, one row per metaedge, values are the number of rows returned
counts_per_dump = [
    pd.Series({name: len(res[name]) for name in res}, name=date)
    for date, res in results.items()
]
edge_count = pd.concat(counts_per_dump, axis=1)
edge_count

Unnamed: 0,2018-11-12,2018-02-05,2017-01-16,2019-09-13
CdiC,1807,1807,1798,1720
CtD,35204,16508,21679,37160
CHhcC,576,582,604,571
PWhpC,3316,2896,0,3637
CpP,3874,3857,3509,3841
PiwC,3634,3619,3659,3607
VntC,447,145,90,602
VptC,633,532,134,793
DaP,2130,2140,2964,9368
DaG,31,36,28,34


In [11]:
# Everything this notebook writes goes under ../2_pipeline/<notebook name>/out
this_name = '01_querying_wikidata_for_hetnet_edges'
out_dir = Path('../2_pipeline').resolve() / this_name / 'out'
out_dir.mkdir(parents=True, exist_ok=True)

edge_count_file = out_dir / 'edge_counts.csv'
edge_count.to_csv(edge_count_file)

### Some Error Fixing

1. If the start and end node types are the same, we could potentially have both node_id1 -> node_id2 and node_id2 -> node_id1... This is only useful if the edge is directed, but most of these edges are bi-directional (undirected), so only one of the directions is needed.

2. Since WikiData can have more than one 'instance_of' statement per node, some nodes may be members of multiple types... we will look at those queried and see where they are.

3. Qualified statements need further processing, so we will collect those

4. Multi-step edges that will be compressed to 1 edge need further processing, so we will collect those

In [12]:
def remove_query_numb(query_name):
    """Cut `query_name` off at the query number reported by `wdh.get_query_numb`.

    Everything from the first occurrence of that number onward is removed.
    When no number is reported (a falsy value), the name is returned unchanged.
    """
    numb = wdh.get_query_numb(query_name)
    if not numb:
        return query_name
    return query_name[:query_name.index(numb)]
    
def to_full_name(query_name):
    """Turn a query variable name into a title-cased name with spaces.

    The query number is stripped first, then underscores become spaces and
    the result is title-cased.
    """
    spaced = remove_query_numb(query_name).replace('_', ' ')
    return spaced.title()

In [13]:
def process_query_res(q_result):
    """Summarise the query results of one dump.

    Parameters
    ----------
    q_result : dict
        Maps a metaedge abbreviation to the DataFrame returned for its query.

    Returns
    -------
    node_ids : dict
        Node type (column name without its query number) -> set of ids seen in any result.
    id_to_name : dict
        Id -> label, taken from each node column and its matching `...Label` column.
    self_ref : set
        Abbreviations whose start and end node kinds are the same.
    qualified : set
        Abbreviations with more than two node columns, one of which is 'qualifier'.
    multi_step : set
        Abbreviations with more than two node columns and no 'qualifier' column.

    Results without any node column contribute nothing to any of the outputs.
    """
    node_ids = dict()
    id_to_name = dict()
    self_ref = set()
    qualified = set()
    multi_step = set()

    for edge_abbrev, res in q_result.items():

        start_kind, _, end_kind = wdh.gt.parse_edge_abbrev(edge_abbrev)
        node_cols = [col for col in res.columns if not col.endswith('Label')]

        # A result with no node columns is not classified at all
        if not node_cols:
            continue

        for col in node_cols:
            # The node type is the column name up to its query number, if it has one
            numb = wdh.get_query_numb(col)
            node_type = col[:col.index(numb)] if numb else col

            # Qualifiers are not nodes, so their ids and labels are not collected
            if node_type == 'qualifier':
                continue

            node_ids.setdefault(node_type, set()).update(set(res[col]))
            id_to_name.update(res.set_index(col)[col+'Label'].to_dict())

        # Identify self-referential edges
        if start_kind == end_kind:
            self_ref.add(edge_abbrev)

        # More than two node columns: either qualified or multi-step, never both
        if len(node_cols) > 2:
            if 'qualifier' in node_cols:
                qualified.add(edge_abbrev)
            else:
                multi_step.add(edge_abbrev)

    return node_ids, id_to_name, self_ref, qualified, multi_step


In [14]:
def fix_self_ref_edges(q_result, self_ref, id_to_name):
    """Collapse mirrored pairs in undirected edges whose start and end kinds match.

    For every abbreviation in `self_ref` that is not directed (no '>' or '<' in
    the abbreviation), each row's start and end ids are sorted so that
    'Q00001 -- Q00002' and 'Q00002 -- Q00001' become the same pair, and each
    pair is kept once.

    Pairs are emitted in the order they are first seen in the query result, so
    the output row order is the same on every run. (Iterating a plain set of
    string tuples gives an order that can change between Python sessions.)

    Parameters
    ----------
    q_result : dict
        Metaedge abbreviation -> result DataFrame. Columns 0 and 2 are read as
        the start and end ids; the first four column names are reused for the
        output.
    self_ref : iterable of str
        Abbreviations to process.
    id_to_name : dict
        Id -> label, used to fill the two label columns.

    Returns
    -------
    dict
        Abbreviation -> DataFrame with four columns (start id, start label,
        end id, end label) named after the first four columns of the input.
        Any further input columns are not carried over.
    """
    fixed = dict()

    for kind in tqdm_notebook(self_ref):

        # no need to worry about forward vs reverse in directed edges
        if '>' in kind or '<' in kind:
            continue

        # Only look at 1 kind of edge at a time
        this_edges = q_result[kind]
        col_names = this_edges.columns

        # Sort each (start, end) pair so the lowest id comes first, then drop
        # repeats while keeping first-seen order (dict keys preserve insertion order)
        id_pairs = zip(this_edges.iloc[:, 0], this_edges.iloc[:, 2])
        edge_ids = list(dict.fromkeys(tuple(sorted(pair)) for pair in id_pairs))

        fixed[kind] = pd.DataFrame({
            col_names[0]: [start_id for start_id, _ in edge_ids],
            col_names[1]: [id_to_name[start_id] for start_id, _ in edge_ids],
            col_names[2]: [end_id for _, end_id in edge_ids],
            col_names[3]: [id_to_name[end_id] for _, end_id in edge_ids],
        })

    return fixed

In [15]:
def find_func_numb(node_names, name, func):
    """Apply `func` to the query numbers of the entries in `node_names` that start with `name`.

    The numbers are passed to `func` exactly as `wdh.get_query_numb` returns them.
    """
    matching_names = (n for n in node_names if n.startswith(name))
    return func([wdh.get_query_numb(n) for n in matching_names])

def find_max_numb(node_names, name):
    """Return the largest query number among the names starting with `name`."""
    return find_func_numb(node_names, name, func=max)

def find_min_numb(node_names, name):
    """Return the smallest query number among the names starting with `name`."""
    return find_func_numb(node_names, name, func=min)

In [16]:
def find_correct_node_name(node_names, name, func):
    """Pick the numbered variant of `name` chosen by `func`, if any numbered variant exists.

    A numbered variant is an entry of `node_names` that starts with `name`, is
    not `name` itself, and has a query number. When there is one, the result is
    `name` followed by `func(node_names, name)`; otherwise `name` is returned as is.
    """
    has_numbered_variant = any(
        node.startswith(name) and node != name and wdh.get_query_numb(node)
        for node in node_names
    )
    if has_numbered_variant:
        return name + str(func(node_names, name))
    return name

In [17]:
def get_start_and_end_names(node_names, s_type, e_type):
    """Resolve the start and end column names for an edge.

    The start uses the lowest query number found for `s_type`, the end uses the
    highest found for `e_type`. Returns the pair `(start_name, end_name)`.
    """
    return (
        find_correct_node_name(node_names, s_type, find_min_numb),
        find_correct_node_name(node_names, e_type, find_max_numb),
    )

In [18]:
def process_multi_step_edges(q_result, qualified, multi_step):
    """Reorder the columns of qualified and multi-step results so start and end come first.

    For each abbreviation in either set, the result's columns are rearranged to
    start node, end node, then every other node column in its original order,
    each immediately followed by its `...Label` column. Returns a dict mapping
    the abbreviation to a reordered copy of its DataFrame.
    """
    fixed = dict()

    for kind in tqdm_notebook(multi_step.union(qualified)):

        this_edges = q_result[kind]
        node_cols = [col for col in this_edges.columns if not col.endswith('Label')]

        # Column names for the start and end types come from the edge abbreviation
        s_kind, _, e_kind = wdh.gt.parse_edge_abbrev(kind)
        s_name = wdh.to_query_name(h.node_abv_to_full[s_kind])[1:]
        e_name = wdh.to_query_name(h.node_abv_to_full[e_kind])[1:]

        # Without a qualifier column, numbered variants decide which columns are start and end
        if 'qualifier' not in node_cols:
            s_name, e_name = get_start_and_end_names(node_cols, s_name, e_name)

        leading = [s_name, e_name]
        node_order = leading + [n for n in node_cols if n not in leading]
        new_col_names = [col for n in node_order for col in (n, n+'Label')]

        fixed[kind] = this_edges[new_col_names].copy()

    return fixed

## Hetnet To Nodes

In [19]:
def build_hetnet_nodes(node_ids, id_to_name):
    """Build the node table with columns `id`, `label` and `name`.

    Parameters
    ----------
    node_ids : dict
        Node type (lowercase_underscore form) -> collection of ids.
    id_to_name : dict
        Id -> display name.

    Returns
    -------
    pandas.DataFrame
        One row per (id, node type). `label` is translated to the node name as
        written in node_info.json; types with no match there end up as NaN.
    """
    per_type = [
        pd.DataFrame({'id': list(ids), 'label': len(ids)*[node_type]})
          .assign(name=lambda frame: frame['id'].map(id_to_name))
        for node_type, ids in node_ids.items()
    ]
    nodes = pd.concat(per_type).reset_index(drop=True)

    # Fix labels (from lowercase_underscore to As Defined in node_info.json)
    label_map = {wdh.to_query_name(full_name)[1:]: full_name for full_name in h.node_info.keys()}
    nodes['label'] = nodes['label'].map(label_map)

    return nodes

## To Hetnet Edges

In [20]:
def process_PregBP(edges):
    """Assign an edge type to regulation edges from the label of the regulating process.

    The text before the first ' of ' in `biological_process1Label` selects the
    type ('positive regulation', 'negative regulation' or 'regulation'). Rows
    whose label matches none of these are dropped. The input frame is left
    untouched; the result has a fresh 0..n-1 index.
    """
    regulation_types = {'positive regulation': 'UP_REGULATES_GuBP',
                        'negative regulation': 'DOWN_REGULATES_GdBP',
                        'regulation': 'REGULATES_GregBP'}

    leading_phrase = edges['biological_process1Label'].str.split(' of ', expand=True)[0]
    typed = edges.assign(type=leading_phrase.map(regulation_types))

    # Keep only the rows that received a type
    return typed[typed['type'].notna()].reset_index(drop=True)

In [21]:
def process_CpP(edges):
    """Assign an edge type to compound-protein edges from their qualifier label.

    `qualifierLabel` is lower-cased and looked up in a fixed table of known
    qualifiers. Rows with a qualifier outside the table are kept, with a
    missing value in `type`. The input frame is left untouched.
    """
    qualifiers_by_type = (
        ('INHIBITS_CiG', ('receptor antagonist',
                          'enzyme inhibitor',
                          'channel blocker',
                          'protein-protein interaction inhibitor',
                          'reuptake inhibitor',
                          'neutralizing antibody')),
        ('ACTIVATES_CacG', ('agonist',
                            'channel activator activity')),
        ('BINDS_CbG', ('substrate',
                       'allosteric modulator',
                       'ligand in biochemistry')),
    )

    type_map = {}
    for edge_type, qualifier_names in qualifiers_by_type:
        type_map.update(dict.fromkeys(qualifier_names, edge_type))

    lowered = edges['qualifierLabel'].str.lower()
    return edges.assign(type=lowered.map(type_map))

In [22]:
def build_hetnet_edges(q_result, fixed_edges):
    """Combine all query results into a single edge table.

    Parameters
    ----------
    q_result : dict
        Metaedge abbreviation -> result DataFrame.
    fixed_edges : dict
        Metaedge abbreviation -> re-processed DataFrame that replaces the
        entry of the same key in `q_result`.

    Returns
    -------
    pandas.DataFrame
        Columns `start_id`, `end_id`, `type`, `qualifier`, followed by every
        `inner<N>` column produced by any edge, in numeric order. `qualifier`
        is present (all missing) even when no edge has a qualifier. Label
        columns are not included. When no query returned anything, an empty
        frame with the four fixed columns is returned.

    Notes
    -----
    The output columns are derived from the combined table. Previously they
    were taken from whichever edge happened to be processed last, so the
    `inner<N>` columns that survived depended on the order of `q_result`.
    """
    base_cols = ['start_id', 'end_id', 'type', 'qualifier']
    edges = []

    for abbrev, res in q_result.items():
        # Use the re-processed version of this edge when there is one
        res = fixed_edges.get(abbrev, res)

        keep_cols = [c for c in res.columns if not c.endswith('Label')]

        # Queries sometimes return zero results, so skip those...
        if not keep_cols:
            continue

        # The first two node columns are the start and end of the edge
        col_name_map = {keep_cols[0]: 'start_id', keep_cols[1]: 'end_id'}

        # Inner nodes in multi-step edges become inner1, inner2, etc...
        inner_cols = {col: 'inner'+str(idx+1)
                      for idx, col in enumerate(keep_cols[2:]) if col != 'qualifier'}
        col_name_map = {**inner_cols, **col_name_map}

        # rename returns a new frame, so the caller's results are not modified below
        res = res.rename(columns=col_name_map)

        if abbrev == "PregBP":
            res = process_PregBP(res)
        elif abbrev == "CpP":
            res = process_CpP(res)

        # Replace Proteins with Genes, to merge the protein and gene metanodes.
        # Both ends are checked, so a Protein-to-Protein edge is rewritten on both sides.
        parsed_edge = list(wdh.gt.parse_edge_abbrev(abbrev))
        for position in (0, 2):
            if parsed_edge[position] == 'P':
                parsed_edge[position] = 'G'
        merged_abbrev = ''.join(parsed_edge)

        # Edges typed by process_PregBP / process_CpP keep the type they were given
        if 'type' not in res.columns:
            res['type'] = h.edge_abv_to_full[parsed_edge[1]] + '_' + merged_abbrev

        edges.append(res)

    if not edges:
        return pd.DataFrame(columns=base_cols)

    # Combine the edges into a single dataframe
    edges = pd.concat(edges, sort=False).reset_index(drop=True)

    # Collect the inner columns from every edge, ordered inner1, inner2, ...
    prefix = 'inner'
    inner_order = sorted((c for c in edges.columns
                          if c.startswith(prefix) and c[len(prefix):].isdigit()),
                         key=lambda c: int(c[len(prefix):]))

    # reindex (rather than plain selection) adds 'qualifier' as an empty column when absent
    return edges.reindex(columns=base_cols + inner_order)

## Fixing nodes that are duplicated across two different Node Types

In [23]:
def find_combos(nodes):
    """List, for every id that appears on more than one row, the labels it carries.

    Returns a DataFrame with columns `id` and `label`, one row per such id,
    where `label` is the string form of the id's alphabetically sorted label
    list, e.g. "['Disease', 'Symptom']".
    """
    # Rows whose id occurs more than once are exactly the rows of the duplicated ids
    shared_rows = nodes[nodes.duplicated(subset=['id'], keep=False)]

    # Find out what types are being combined...
    labels_per_id = (shared_rows.sort_values(['id', 'label'])
                                .groupby('id')['label']
                                .apply(list))

    return labels_per_id.astype(str).to_frame().reset_index()

In [24]:
def uniquify_node_types(nodes, edges, type_fix_map=None, verbose=True):
    """Give every node a single type and drop the edges that no longer fit.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Node table with at least `id` and `label`. It is not modified; the
        function works on a copy.
    edges : pandas.DataFrame
        Edge table; its `type` column must end in `_<edge abbreviation>`.
    type_fix_map : dict, optional
        Maps the string form of a sorted label list (as produced by
        `find_combos`) to the label to keep. A built-in table is used when
        omitted.
    verbose : bool
        Print node and edge counts before and after fixing.

    Returns
    -------
    (nodes, edges) : tuple of pandas.DataFrame
        Nodes with one label per id and duplicates removed, and the edges whose
        start and end labels agree with the node kinds in their abbreviation.

    Notes
    -----
    A type combination with no entry in `type_fix_map` leaves the node's label
    missing, and the edges touching it are then removed as mismatched. With
    `verbose` on, the number of such nodes is printed when there are any.
    """
    # Set a default value for the map
    if type_fix_map is None:

        type_fix_map = {"['Structural Motif', 'Super-Secondary Structure']": 'Structural Motif',
                    "['Chemical Hazard', 'Disease']": 'Chemical Hazard',
                    "['Disease', 'Symptom']": 'Symptom',
                    "['Sequence Variant', 'Symptom']": 'Symptom',
                    "['Disease', 'Sequence Variant', 'Symptom']": 'Symptom',
                    "['Compound', 'Gene']": 'Compound',
                    "['Chemical Role', 'Compound']": 'Compound',
                    "['Biological Process', 'Disease']": 'Disease',
                    "['Anatomical Structure', 'Cellular Component']": 'Cellular Component',
                    "['Protein Domain', 'Structural Motif', 'Super-Secondary Structure']": 'Protein Domain',
                    "['Protein Domain', 'Protein Family']": 'Protein Family',
                    "['Gene', 'Protein Family']": 'Gene',
                    "['Disease', 'Sequence Variant']": 'Disease'
                   }

    # Work on a copy so the label column of the caller's frame is not overwritten
    nodes = nodes.copy()

    # Find out what's combined...
    combos = find_combos(nodes)
    # Map from the original combination to resolved type
    resolved = combos.set_index('id')['label'].map(type_fix_map)

    # Surface combinations the map does not cover instead of losing them silently
    n_unresolved = int(resolved.isna().sum())
    if verbose and n_unresolved:
        print('Number of multi-typed nodes without a rule in type_fix_map: {:,}'.format(n_unresolved))

    # Fill in types for already unique nodes and map
    final_types = {**nodes.set_index('id')['label'].to_dict(), **resolved.to_dict()}
    nodes['label'] = nodes['id'].map(final_types)

    if verbose:
        print('Number of nodes before fixing: {:,}'.format(len(nodes)))
    nodes = nodes.drop_duplicates().reset_index(drop=True)
    if verbose:
        print('Number of nodes after fixing: {:,}'.format(len(nodes)))

    # Now check that the node types in the edge abbreviation match the newly resolved node types
    combo = wdh.gt.combine_nodes_and_edges(nodes, edges)

    # The edge abbreviation is the last '_'-separated piece of the edge type
    combo['edge_abv'] = combo['type'].apply(lambda t: t.split('_')[-1])
    combo['actual_start'] = combo['edge_abv'].apply(lambda a: h.node_abv_to_full[wdh.gt.parse_edge_abbrev(a)[0]])
    combo['actual_end'] = combo['edge_abv'].apply(lambda a: h.node_abv_to_full[wdh.gt.parse_edge_abbrev(a)[2]])

    bad_edge = combo.query('start_label != actual_start or end_label != actual_end')

    if verbose:
        print('Number of edges with issues to be removed: {:,}'.format(len(bad_edge)))
        print('Number of edges before fixing: {:,}'.format(len(edges)))

    # NOTE(review): assumes combine_nodes_and_edges keeps the index of `edges` - confirm
    edges = edges.drop(bad_edge.index).reset_index(drop=True)

    if verbose:
        print('Number of edges after fixing: {:,}'.format(len(edges)))

    return nodes, edges

In [25]:
def build_hetnet(q_result):
    """Turn the query results of one dump into a node table and an edge table.

    Runs the full pipeline: summarise the results, fix self-referential and
    multi-step/qualified edges, build nodes and edges, merge Protein nodes into
    Gene, and finally resolve nodes that carry more than one type.
    Returns `(nodes, edges)`.
    """
    node_ids, id_to_name, self_ref, qualified, multi_step = process_query_res(q_result)

    # When an edge is in both groups, the self-referential fix takes precedence
    fixed_edges = {
        **process_multi_step_edges(q_result, qualified, multi_step),
        **fix_self_ref_edges(q_result, self_ref, id_to_name),
    }

    nodes = build_hetnet_nodes(node_ids, id_to_name)
    edges = build_hetnet_edges(q_result, fixed_edges)

    # merge the genes and proteins in the nodes file
    is_protein = nodes['label'] == 'Protein'
    nodes.loc[is_protein, 'label'] = 'Gene'

    return uniquify_node_types(nodes, edges)


In [26]:
# Build and save one hetnet (nodes.csv + edges.csv) per dump date
for date, q_result in results.items():
    dump_dir = out_dir.joinpath(date)
    dump_dir.mkdir(exist_ok=True, parents=True)

    print('DUMP DATE: {}'.format(date))

    nodes, edges = build_hetnet(q_result)

    for frame, file_name in ((nodes, 'nodes.csv'), (edges, 'edges.csv')):
        wdh.gt.add_colons(frame).to_csv(dump_dir.joinpath(file_name), index=False)

    print('\n\n')

DUMP DATE: 2018-11-12


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


Number of nodes before fixing: 88,404
Number of nodes after fixing: 88,076
Number of edges with issues to be removed: 38,029
Number of edges before fixing: 582,565
Number of edges after fixing: 544,536



DUMP DATE: 2018-02-05


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


Number of nodes before fixing: 66,444
Number of nodes after fixing: 66,139
Number of edges with issues to be removed: 37,289
Number of edges before fixing: 510,202
Number of edges after fixing: 472,913



DUMP DATE: 2017-01-16


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


Number of nodes before fixing: 42,389
Number of nodes after fixing: 42,355
Number of edges with issues to be removed: 370
Number of edges before fixing: 135,576
Number of edges after fixing: 135,206



DUMP DATE: 2019-09-13


HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))


Number of nodes before fixing: 96,423
Number of nodes after fixing: 96,305
Number of edges with issues to be removed: 4,322
Number of edges before fixing: 722,084
Number of edges after fixing: 717,762



