In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Directory this notebook reads its input network from
prev_nw_dir = Path('../2_pipeline/12d_Preprocessing-Abbreviations_and_Biolink/out').resolve()

# Load the node and edge tables, reading every column as a string
nodes, edges = (
    pd.read_csv(prev_nw_dir / file_name, dtype=str)
    for file_name in ('nodes_biolink.csv', 'edges_biolink.csv')
)

## Sample and remove compounds for post testing

In [3]:
# Set the random seed to the date I first wrote this notebook
random_seed = 20200130

# Edge type that marks a Compound - TREATS - Disease relationship
treats_edge_type = 'treats_CtD'
# Fraction of the treating compounds to hold out for testing
holdout_frac = .2

# Filter the TREATS edges once; the compound list, the holdout edges and the
# summary counts below are all derived from this single frame
treat_edges = edges.query('type == @treats_edge_type')
treat_compounds = treat_edges['start_id'].unique()

# Fail with a clear message instead of a ZeroDivisionError further down
if len(treat_compounds) == 0:
    raise ValueError('No "{}" edges found; cannot build a holdout set'.format(treats_edge_type))

# Hold out 20% of the compounds with Treats edges for testing
holdout_compounds = nodes.query('id in @treat_compounds').sample(frac=holdout_frac, random_state=random_seed)['id'].tolist()
# Filtering treat_edges keeps the row labels of `edges`, so holdout_edges.index
# still identifies these rows in the full edge table
holdout_edges = treat_edges.query('start_id in @holdout_compounds')

# Compound-level summary
n_treat_comp = len(treat_compounds)
n_holdout = len(holdout_compounds)
frac_ho = n_holdout / n_treat_comp

# Edge-level summary
n_treat_edge = len(treat_edges)
n_ho_edge = len(holdout_edges)
frac_ho_edge = n_ho_edge / n_treat_edge

print('Removing {:,} of {:,} ({:1.1%}) Compounds known to treat 1 or more Diseases'.format(n_holdout, n_treat_comp, frac_ho))
print('{:,} of {:,} ({:1.1%}) Compound - TREATS - Disease edges removed from network'.format(n_ho_edge, n_treat_edge, frac_ho_edge))

Removing 2,261 of 11,303 (20.0%) Compounds known to treat 1 or more Diseases
13,605 of 69,639 (19.5%) Compound - TREATS - Disease edges removed from network


In [4]:
# Every edge except the held-out ones stays in the network
remain_edges = edges.drop(index=holdout_edges.index)

## Save the network and holdout edges

In [5]:
# Outputs are written to ../2_pipeline/<nb_name>/out
nb_name = '13a_Model_Prep_Holdout_Set'
out_dir = (Path('../2_pipeline/') / nb_name / 'out').resolve()

# Create the directory and any missing parents; no error if it already exists
out_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# Write each table to its own CSV in out_dir, leaving out the index column
outputs = {
    'nodes.csv': nodes,
    'edges.csv': remain_edges,
    'holdout_set.csv': holdout_edges,
}
for file_name, frame in outputs.items():
    frame.to_csv(out_dir / file_name, index=False)