# Semantic Hierarchy Mapping

Some edge types are more specific than others. For example, `inhibits` is more specific than the general `affects`. However, both can be considered `affects` edges, so to ensure that our more specific edges carry the appropriate weight, we duplicate each of them as an edge of its more general (parent) semantic type.

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

from data_tools.plotting import count_plot_h
from data_tools import combine_nodes_and_edges
from data_tools.df_processing import combine_group_cols_on_char, expand_col_on_char
from data_tools.graphs import map_edge_types_from_file, add_abbrevs, get_core_network

  from tqdm.autonotebook import tqdm


In [2]:
# Input locations: the output folder of the previous pipeline step, and the
# folder holding the manually curated reference tables.
prev_dir, data_dir = (
    Path(rel_path).resolve()
    for rel_path in (
        '../2_pipeline/12a_Preprocessing-Semmantic_Compression/out',
        '../0_data/manual/',
    )
)

In [3]:
# Load the network produced by the previous step; every column is read as a
# string so identifiers are not coerced to numeric types.
nodes = pd.read_csv(prev_dir / 'nodes.csv', dtype=str)
edges = pd.read_csv(prev_dir / 'edges.csv', dtype=str)

## Map Edges to Parent Semantic Types

In [4]:
# Reference table of edge semantic types; its `parent_rel` column names the
# more general relation (if any) that a given edge type belongs to.
sem_info = pd.read_csv(data_dir / 'edge_semtypes.csv')

In [5]:
# Preview the first rows only, rather than dumping the whole reference table
# into the notebook output.
sem_info.head(10)

Unnamed: 0,fwd_edge,abbrev,rev_edge,rel_dir,directed,parent_rel
0,activates,a,activated_by,1,True,affects
1,affects,af,affected_by,0,True,
2,associated_with,aw,associated_with,0,False,
3,capable_of,co,performed_by,0,False,
4,causes,c,caused_by,1,True,
5,diagnoses,dg,diagnosed_by,1,True,
6,disrupts,d,disrupted_in,-1,True,
7,enables,e,enabled_by,1,True,
8,follows_in_sequence,f,precedes_in_sequence,0,True,
9,has_input,hi,input_of,0,False,


In [6]:
# Restrict to the semantic types that declare a more general parent relation.
has_parent = sem_info['parent_rel'].notna()
children = sem_info.loc[has_parent, 'fwd_edge']

# Edge types that must be duplicated under their parent type.
to_map = children.unique()
# Lookup table: specific edge type -> its more general parent edge type.
rel_map = dict(zip(children, sem_info.loc[has_parent, 'parent_rel']))

In [7]:
# Independent copy of the edges whose type has a parent relation; the copy is
# relabelled later without touching `edges`.
needs_parent = edges['type'].isin(to_map)
edges_to_map = edges[needs_parent].copy()

In [8]:
# Swap each specific edge type for its parent relation.
# Falling back to the current value keeps this cell safe to re-run: after the
# first run the column already holds parent relations, and a parent without a
# parent of its own is not a key of `rel_map`, so a bare `.map(rel_map)` would
# silently turn those types into NaN.
edges_to_map['type'] = edges_to_map['type'].map(rel_map).fillna(edges_to_map['type'])

In [9]:
# Stack the relabelled (parent-type) copies underneath the original edges and
# renumber the rows from zero.
edge_frames = [edges, edges_to_map]
edges_out = pd.concat(edge_frames, ignore_index=True, sort=False)

### Remove duplicate edges

In [10]:
# Number of edges before duplicates are merged.
edges_out.shape[0]

9928357

In [11]:
# An edge is identified by its endpoints plus its semantic type.
# NOTE(review): `combine_group_cols_on_char` is a project helper; presumably it
# collapses rows sharing these keys and joins the remaining columns on a
# separator character - confirm against `data_tools.df_processing`.
edge_key_cols = ['start_id', 'end_id', 'type']
edges_out = combine_group_cols_on_char(edges_out, edge_key_cols, sort=True)

HBox(children=(FloatProgress(value=0.0, description='total_progress', max=5.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='dsrc_type', max=127155.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='source', max=127155.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='license', max=127155.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='pmids', max=127155.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='merge_id', max=127155.0, style=ProgressStyle(description_…





In [12]:
# Number of edges remaining after duplicates are merged.
edges_out.shape[0]

9786425

## Save results

In [13]:
# Each pipeline step writes into `../2_pipeline/<step name>/out`.
this_name = '12b_Preprocessing-Semmantic_Hierarchy_Mapping'
out_dir = (Path('../2_pipeline/') / this_name / 'out').resolve()

# Create the folder (and any missing parents); an existing folder is fine.
out_dir.mkdir(parents=True, exist_ok=True)

# Nodes are written exactly as loaded; edges now include the parent-type
# duplicates. The row index is not written to either file.
nodes.to_csv(out_dir / 'nodes.csv', index=False)
edges_out.to_csv(out_dir / 'edges.csv', index=False)