# Semantic Compression

Based on the metaedges and counts (created in the previous notebook), we made a semantic compression map. This allows us to map similarly typed edges together to create a more limited semantic vocabulary.

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

from data_tools.plotting import count_plot_h
from data_tools import combine_nodes_and_edges
from data_tools.df_processing import combine_group_cols_on_char, expand_col_on_char
from data_tools.graphs import map_edge_types_from_file, add_abbrevs, get_core_network

  from tqdm.autonotebook import tqdm


In [2]:
# Relative locations of this notebook's inputs: the output folder of the
# preceding pipeline step and the manually curated data folder.
pipeline_out = Path('../2_pipeline/11_Reactions_and_Regulation/out')
manual_data = Path('../0_data/manual/')

# Resolve both to absolute paths once, up front.
prev_dir = pipeline_out.resolve()
data_dir = manual_data.resolve()

In [3]:
# Read every column as a string so identifiers are never coerced to numbers.
nodes = pd.read_csv(prev_dir / 'nodes.csv', dtype=str)
edges = pd.read_csv(prev_dir / 'edges.csv', dtype=str)

In [4]:
# Manually curated table that maps each original metaedge to its compressed type.
# dtypes are left to pandas' inference here (no dtype=str).
sem_comp_map = pd.read_csv(data_dir / 'semantic_compression_map.csv')

In [5]:
# Check the dtypes pandas inferred for the map (it was read without dtype=str).
sem_comp_map.dtypes

start_label            object
type                   object
end_label              object
counts                  int64
new_type               object
reverse_node_labels    object
dtype: object

In [6]:
# Preview the map: one row per original (start_label, type, end_label) metaedge
# together with the `new_type` it is compressed to.
sem_comp_map

Unnamed: 0,start_label,type,end_label,counts,new_type,reverse_node_labels
0,Anatomy,capable_of,Biological Process,123,site_of,
1,Anatomy,capable_of_part_of,Biological Process,121,site_of,
2,Anatomy,contains_process,Biological Process,27,site_of,
3,Anatomy,output_of,Biological Process,18,produces,True
4,Anatomy,site_of,Biological Process,13,site_of,
...,...,...,...,...,...,...
600,Protein Family,enables,Molecular Function,9518,part_of,
601,Protein Family,negatively_regulates,Molecular Function,106,negatively_regulates,
602,Protein Family,positively_regulates,Molecular Function,78,positively_regulates,
603,Protein Family,regulates,Molecular Function,58,regulates,


In [7]:
# Distinct metaedges that remain after compression. Rows with a missing value
# in any of the three columns are excluded before de-duplicating.
compressed_metaedges = (
    sem_comp_map[['start_label', 'new_type', 'end_label']]
    .dropna()
    .drop_duplicates()
)
len(compressed_metaedges)

221

So we are reducing our total number of metaedges from 605 to 221, about a 63% reduction.

The mapping function is in-place, so let's do a quick before and after comparison to make sure things are mapped.

In [8]:
# Baseline before mapping: total rows vs. unique (start, end, type) triples.
edge_key = ['start_id', 'end_id', 'type']
len(edges), edges.drop_duplicates(subset=edge_key).shape[0]

(8979599, 8979599)

In [9]:
# NOTE: this call mutates `edges` in place (nothing is assigned back); the
# before/after length checks around this cell are what confirm the mapping ran.
# Re-running this cell therefore operates on already-mapped edges -- reload
# `edges` first if a clean re-run is needed.
map_edge_types_from_file(edges, sem_comp_map, nodes=nodes)

100%|██████████| 605/605 [03:30<00:00,  2.87it/s]


In [10]:
# Same check after mapping: a gap between the two numbers means several
# original types collapsed onto the same (start, end, type) triple.
edge_key = ['start_id', 'end_id', 'type']
len(edges), edges.drop_duplicates(subset=edge_key).shape[0]

(8947338, 8906037)

In [11]:
# Preview the first ten edges after the in-place type mapping.
edges.head(10)

Unnamed: 0,start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,support_type,pmids,phase,date,name,name_x,name_y,merge_id,reactome_id
0,UNII:BTY153760O,NCBIGene:3605,inhibits,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
1,CHEBI:10055,NCBIGene:153,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
2,CHEBI:10056,NCBIGene:1129,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
3,CHEBI:10056,NCBIGene:1131,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
4,CHEBI:10056,NCBIGene:1133,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
5,CHEBI:10056,NCBIGene:3350,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
6,CHEBI:10056,NCBIGene:3351,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
7,CHEBI:10056,NCBIGene:3352,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
8,CHEBI:10056,NCBIGene:3354,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,
9,CHEBI:10056,NCBIGene:3355,activates,computed,merge,,,WikiData,CC0 1.0,,,,,,,,,,


In [12]:
# Columns that together identify one edge.
edge_key = ['start_id', 'end_id', 'type']

# Collapse rows that share the same key into one edge.
# NOTE(review): presumably the remaining columns are joined on a separator
# character by this helper -- confirm against data_tools.df_processing.
edges = combine_group_cols_on_char(edges, edge_key, sort=True)

HBox(children=(FloatProgress(value=0.0, description='total_progress', max=6.0, style=ProgressStyle(description…

HBox(children=(FloatProgress(value=0.0, description='dsrc_type', max=40143.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='comp_type', max=40143.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='source', max=40143.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='license', max=40143.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='pmids', max=40143.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='date', max=40143.0, style=ProgressStyle(description_width…





As some edge types were dropped, there may be fewer nodes

In [13]:
# Every node id that still appears as an endpoint of some edge.
# stack() flattens both columns into one Series (dropping missing values).
endpoints = edges[['start_id', 'end_id']]
edge_ids = endpoints.stack().unique()

In [14]:
# All nodes vs. nodes that are still an endpoint of at least one edge.
in_network = nodes['id'].isin(edge_ids)
len(nodes), int(in_network.sum())

(790997, 384724)

### Final Checks

Don't need to hold onto legacy nodes anymore... only keep those in the network

In [15]:
# Keep only the nodes that are referenced by at least one remaining edge.
nodes = nodes.loc[nodes['id'].isin(edge_ids)]

#### Ensure no duplicated nodes

In [16]:
# Show every row whose id occurs more than once (all copies, not just the extras).
nodes.loc[nodes.duplicated(subset=['id'], keep=False)]

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
790981,DOID:0050269,Trichomonas vaginalis trichomoniasis,,MESH:D014247|ICD10CM:A59.00|NCI:C35083,,urogenital trichomonas,,NCIthesaurus|DO_IEDB_slim
790982,DOID:0050269,Trichomonas vaginalis trichomoniasis,,MESH:D014247|ICD10CM:A59.00|NCI:C35083,,urogenital trichomonas,,NCIthesaurus|DO_IEDB_slim


In [17]:
# Keep the first occurrence of each id and discard later repeats.
repeated_id = nodes['id'].duplicated(keep='first')
nodes = nodes.loc[~repeated_id]

#### Make sure all nodes have a label

In [18]:
# List the nodes that have no label assigned.
nodes.loc[nodes['label'].isna()]

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
790981,DOID:0050269,Trichomonas vaginalis trichomoniasis,,MESH:D014247|ICD10CM:A59.00|NCI:C35083,,urogenital trichomonas,,NCIthesaurus|DO_IEDB_slim
790983,DOID:0070344,ocular tuberculosis,,,,,,
790984,DOID:0080602,benign teratoma,,NCI:C67107,,,,
790985,DOID:0080615,nephroma,,,,benign nephroma,,
790986,DOID:0080616,kidney cortex disease,,,,,,
790987,DOID:0080617,lymph node benign neoplasm,,SNOMEDCT_US_2020_03_01:92197001|NCI:C3636,,,,
790988,DOID:0080618,lymph node carcinoma,,,,,,
790989,DOID:0080619,auditory system benign neoplasm,,NCI:C8417,,,,
790990,DOID:0080638,B-cell acute lymphoblastic leukemia,,NCI:C8644,,B-cell acute lymphocytic leukemia|B acute lymp...,,DO_cancer_slim
790991,DOID:0080640,gallbladder benign neoplasm,,NCI:C4440,,,,


All are diseases

In [19]:
# Index labels of the nodes that still have no label.
disease_idx = nodes[nodes['label'].isnull()].index

# Guard the "all are diseases" assumption instead of relying on a visual check:
# every unlabelled node must carry a Disease Ontology id, otherwise a re-run on
# changed upstream data would silently mislabel other node types as 'Disease'.
unlabelled_ids = nodes.loc[disease_idx, 'id']
is_doid = unlabelled_ids.str.startswith('DOID:', na=False)
assert is_doid.all(), (
    f"Unlabelled nodes that are not DOID ids: {unlabelled_ids[~is_doid].tolist()[:10]}"
)

nodes.loc[disease_idx, 'label'] = 'Disease'

# Show a few of the rows that were just filled in.
nodes.loc[disease_idx].head()

Unnamed: 0,id,name,label,xrefs,source,synonyms,alt_ids,subsets
790981,DOID:0050269,Trichomonas vaginalis trichomoniasis,Disease,MESH:D014247|ICD10CM:A59.00|NCI:C35083,,urogenital trichomonas,,NCIthesaurus|DO_IEDB_slim
790983,DOID:0070344,ocular tuberculosis,Disease,,,,,
790984,DOID:0080602,benign teratoma,Disease,NCI:C67107,,,,
790985,DOID:0080615,nephroma,Disease,,,benign nephroma,,
790986,DOID:0080616,kidney cortex disease,Disease,,,,,


### Save results

In [20]:
# Name of this pipeline step; it is also the output folder name, so the
# spelling is kept exactly as-is (presumably later notebooks read from it).
this_name = '12a_Preprocessing-Semmantic_Compression'

out_dir = (Path('../2_pipeline/') / this_name / 'out').resolve()
out_dir.mkdir(parents=True, exist_ok=True)

# Write the filtered nodes and the compressed edges without the pandas index.
for frame, filename in ((nodes, 'nodes.csv'), (edges, 'edges.csv')):
    frame.to_csv(out_dir / filename, index=False)