# Add edge semantic abbreviations and map to Biolink

Abbreviations are needed for some of the machine learning parts, to distinguish edge and node semantics in the hetnet. We will add them now. We will also map node semantic types to the Biolink model.

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

from data_tools.plotting import count_plot_h
from data_tools import combine_nodes_and_edges
from data_tools.df_processing import combine_group_cols_on_char, expand_col_on_char
from data_tools.graphs import map_edge_types_from_file, add_abbrevs, get_core_network

  from tqdm.autonotebook import tqdm


In [2]:
# Resolve both input locations to absolute paths: the `out` folder of the
# 12c_Preprocessing-Pruning step and the `manual` data folder.
prev_dir, data_dir = (
    Path(location).resolve()
    for location in (
        '../2_pipeline/12c_Preprocessing-Pruning/out',
        '../0_data/manual/',
    )
)

In [3]:
# Load the node and edge tables from the previous step's output folder.
# dtype=str keeps every column as text, so identifiers are never parsed as numbers.
nodes = pd.read_csv(prev_dir / 'nodes.csv', dtype=str)
edges = pd.read_csv(prev_dir / 'edges.csv', dtype=str)

## Add Abbreviations

In [4]:
# Mapping tables from the manual data folder:
#   biolink_map - one row per hetnet node type with its abbreviation and Biolink mapping
#   sem_info    - one row per edge type (forward/reverse name, abbreviation, directedness)
biolink_map = pd.read_csv(data_dir / 'hetnet_to_biolink.csv')
sem_info = pd.read_csv(data_dir / 'edge_semtypes.csv')

In [5]:
# Show the edge-semantics table. The cells below read its `fwd_edge`,
# `rev_edge`, `directed` and `abbrev` columns.
sem_info

Unnamed: 0,fwd_edge,abbrev,rev_edge,rel_dir,directed,parent_rel
0,activates,a,activated_by,1,True,affects
1,affects,af,affected_by,0,True,
2,associated_with,aw,associated_with,0,False,
3,capable_of,co,performed_by,0,False,
4,causes,c,caused_by,1,True,
5,diagnoses,dg,diagnosed_by,1,True,
6,disrupts,d,disrupted_in,-1,True,
7,enables,e,enabled_by,1,True,
8,follows_in_sequence,f,precedes_in_sequence,0,True,
9,has_input,hi,input_of,0,False,


In [6]:
# Every edge-level lookup is keyed on the forward edge name, so index once and reuse.
sem_by_fwd_edge = sem_info.set_index('fwd_edge')

rev_map = sem_by_fwd_edge['rev_edge'].to_dict()          # forward name -> reverse name
directed_map = sem_by_fwd_edge['directed'].to_dict()     # forward name -> directed flag
edge_abbrev_map = sem_by_fwd_edge['abbrev'].to_dict()    # forward name -> abbreviation

# hetnet node type -> abbreviation
node_abbrev_map = biolink_map.set_index('hetnet')['abbrev'].to_dict()

In [7]:
%%time
edges = add_abbrevs(nodes, edges, {**node_abbrev_map, **edge_abbrev_map}, directed_map)
edges.head(2)

CPU times: user 12.6 s, sys: 3.22 s, total: 15.8 s
Wall time: 15.8 s


In [8]:
# Keep the plain edge type in `type_no_abbv`, then append the abbreviation to
# `type` (e.g. 'inhibits' + '_' + 'CinG' -> 'inhibits_CinG').
#
# The backup is taken only when it does not exist yet, and `type` is always
# rebuilt from that backup. Re-running this cell therefore gives the same
# result instead of overwriting the backup with an already-suffixed value and
# appending the abbreviation a second time.
if 'type_no_abbv' not in edges.columns:
    edges['type_no_abbv'] = edges['type']
edges['type'] = edges['type_no_abbv'] + '_' + edges['abbrev']
edges.head(2)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,...,pmids,phase,date,name,name_x,name_y,merge_id,reactome_id,abbrev,type_no_abbv
0,UNII:BTY153760O,NCBIGene:3605,inhibits_CinG,computed,merge,,,WikiData,CC0 1.0,,...,,,,,,,,,CinG,inhibits
1,CHEBI:10056,NCBIGene:1129,activates_RaG,computed,merge,,,WikiData,CC0 1.0,,...,,,,,,,,,RaG,activates


#### Save network

In [9]:
# Number of distinct edge abbreviations in the hetnet-labelled network
# (nunique ignores missing values).
edges['abbrev'].nunique()

208

In [10]:
this_name = '12d_Preprocessing-Abbreviations_and_Biolink'

# Output folder for this step: ../2_pipeline/<this_name>/out, created if missing.
out_dir = (Path('../2_pipeline/') / this_name / 'out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# Write the hetnet-labelled network (now carrying abbreviations) without the index.
nodes.to_csv(out_dir / 'nodes.csv', index=False)
edges.to_csv(out_dir / 'edges.csv', index=False)

# Map to biolink

In [11]:
# Work on copies so the hetnet-labelled frames are not modified by the Biolink mapping.
nodes_biolink, edges_biolink = nodes.copy(), edges.copy()

In [12]:
# hetnet node label -> Biolink label
label_to_biolink = biolink_map.set_index('hetnet')['bl_hierarchy_mapping'].to_dict()

# NOTE: this rebinds `node_abbrev_map`. From here on it is keyed by Biolink
# label (-> `hie_abbrev`), no longer by hetnet label.
node_abbrev_map = biolink_map.set_index('bl_hierarchy_mapping')['hie_abbrev'].to_dict()

# Map from the untouched `nodes` frame rather than from `nodes_biolink` itself:
# `nodes_biolink` is a copy of `nodes` (same index), so the result is identical
# on a first run, while a re-run no longer feeds already-mapped labels back
# through the lookup (which would turn them into NaN).
biolink_labels = nodes['label'].map(label_to_biolink)

# Series.map yields NaN for labels missing from the lookup; stop here instead of
# silently writing nodes without a label.
unmapped_labels = nodes.loc[biolink_labels.isna() & nodes['label'].notna(), 'label'].unique()
assert len(unmapped_labels) == 0, (
    'No Biolink mapping for node labels: {}'.format(sorted(unmapped_labels))
)

nodes_biolink['label'] = biolink_labels

In [13]:
# Reset the edge semantics to the plain edge types
edges_biolink['type'] = edges_biolink['type_no_abbv']

# Recompute the abbreviations against the Biolink node labels. The edge entry
# wins on a key collision because it is unpacked last.
biolink_abbrev_lookup = {**node_abbrev_map, **edge_abbrev_map}
edges_biolink = add_abbrevs(nodes_biolink, edges_biolink, biolink_abbrev_lookup, directed_map)
edges_biolink.head(2)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,...,pmids,phase,date,name,name_x,name_y,merge_id,reactome_id,abbrev,type_no_abbv
0,UNII:BTY153760O,NCBIGene:3605,inhibits,computed,merge,,,WikiData,CC0 1.0,,...,,,,,,,,,CinG,inhibits
1,CHEBI:10056,NCBIGene:1129,activates,computed,merge,,,WikiData,CC0 1.0,,...,,,,,,,,,CaG,activates


In [14]:
# Number of distinct edge abbreviations after mapping node labels to Biolink
# (nunique ignores missing values).
edges_biolink['abbrev'].nunique()

68

In [15]:
# Build the suffixed type from `type_no_abbv`, the plain edge type that `type`
# was reset to before the abbreviations were recomputed.
# Deriving it from that column instead of from `type` itself makes the cell safe
# to re-run: the abbreviation is never appended twice and `type_no_abbv` is
# never overwritten with an already-suffixed value.
edges_biolink['type'] = edges_biolink['type_no_abbv'] + '_' + edges_biolink['abbrev']

In [16]:
# Spot-check: `type` should now end in the Biolink-based abbreviation.
edges_biolink.head(2)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,...,pmids,phase,date,name,name_x,name_y,merge_id,reactome_id,abbrev,type_no_abbv
0,UNII:BTY153760O,NCBIGene:3605,inhibits_CinG,computed,merge,,,WikiData,CC0 1.0,,...,,,,,,,,,CinG,inhibits
1,CHEBI:10056,NCBIGene:1129,activates_CaG,computed,merge,,,WikiData,CC0 1.0,,...,,,,,,,,,CaG,activates


### Ensure Biolink Mapping hasn't resulted in Reverse Types

For example, if we had `Gene` `part_of` `Cellular Component` and `Cellular Component` `has_part` `Protein`, mapping to Biolink would result in `MacromolecularMachine` `part_of` `AnatomicalEntity` and `AnatomicalEntity` `has_part` `MacromolecularMachine`, which are essentially the same edge, just reversed. These need to be normalized if identified.

In [17]:
# NOTE(review): presumably joins node information onto each edge; the cells
# below rely on the result having `start_label` and `end_label` columns next to
# the edge columns — confirm against data_tools.combine_nodes_and_edges.
combo = combine_nodes_and_edges(nodes_biolink, edges_biolink)
# Reverse-direction name of each edge type; types absent from rev_map become NaN.
combo['rev_type'] = combo['type_no_abbv'].map(rev_map)

In [18]:
# The check below only concerns edge *types*, so one representative row per
# (start label, edge type, end label) combination is enough.
metaedge_cols = ['start_label', 'type_no_abbv', 'end_label']
meta_edges = combo.drop_duplicates(subset=metaedge_cols).copy()

In [19]:
# Forward triple as stored: (start label, edge type, end label)
meta_edges['fwd_tup'] = list(zip(
    meta_edges['start_label'],
    meta_edges['type_no_abbv'],
    meta_edges['end_label'],
))
# The same edge read in the opposite direction: (end label, reverse type, start label)
meta_edges['rev_tup'] = list(zip(
    meta_edges['end_label'],
    meta_edges['rev_type'],
    meta_edges['start_label'],
))

In [20]:
# A triple present in both sets means some metaedge is stored in both directions.
forward_triples = set(meta_edges['fwd_tup'])
reversed_triples = set(meta_edges['rev_tup'])
overlaps = list(forward_triples.intersection(reversed_triples))
overlaps

[]

Empty list is exactly what we were looking for... Perfect!

## Save out

In [21]:
# Node count with thousands separators
format(len(nodes_biolink), ',')

'250,035'

In [22]:
# Edge count with thousands separators
format(len(edges_biolink), ',')

'9,652,116'

In [23]:
# NOTE(review): presumably splits multi-valued `source` entries on '|' into one
# row per source so that each source is counted separately below — confirm
# against data_tools.df_processing.expand_col_on_char.
ex = expand_col_on_char(edges_biolink, 'source', '|')

In [24]:
# Edge counts per data source.
# NOTE(review): the recorded output lists both 'ensembl' and 'ensembel'; the
# latter looks like a misspelling introduced upstream that splits one source
# into two — confirm and normalise where the `source` column is produced.
ex['source'].value_counts()

CTD                         6312738
Gene Ontology               1492662
WikiData                    1185514
Reactome                     617541
Human Phenotype Ontology     201656
miRTarBase                    94610
Protein Ontology              76150
Inxight Drugs                 70514
GAUSS                         31662
ensembl                       26744
InterPro                      25869
RheaDB                        21731
DrugBank                      14615
DrugCentral                   13084
Disease Ontology               4565
ensembel                       4320
ComplexPortal                  1999
Cell Ontology                   855
UBERON                          492
Name: source, dtype: int64

In [25]:
# Write the Biolink-labelled network next to the hetnet-labelled one, without the index.
nodes_biolink.to_csv(out_dir / 'nodes_biolink.csv', index=False)
edges_biolink.to_csv(out_dir / 'edges_biolink.csv', index=False)