In [2]:
import pandas as pd
import numpy as np
from collections import Counter

def fill_missing_cols(df):
    
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')
        
    all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',
            'node_definition','node_namespace','value','lowerbound','upperbound','unit'])
   
    missing_cols = list(all_cols - set(df.columns))
    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)
    nan_cols_df.index = df.index
    return pd.concat([df,nan_cols_df],axis=1)

# Kids First data processing notebook
## There are KF 2 datasets: 
1. A `Phenotype/DeIdentified_Patient_ID/Cohort` dataset (processed here)
2. A dataset containing `binned counts of variants` from the CHD cohort, binned according to the Chromosome Location Ontology (CHLO). This workflow was **completed on the Kids First Variant Workbench** and can be found in 2 zeppelin notebooks (DataDistillery/get_variants and DataDistillery/create_nodes_edges. I save the processed nodes and edge files on Cavatica and then download them to my local computer. At the bottom of this notebook I load these files in and join them with the `Phenotype/Patient_ID/Cohort`'s datasets nodes and edges files

In [2]:
# Load in phenotype-patient data
df = pd.read_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/KidsFirst/data-1686765441137.csv')
#df.isna().sum()
df.dropna(inplace=True)
df.sample(5)

Unnamed: 0,hpo_id_phenotype,source_text_phenotype,patient_id,study_id
32156,HP:0007875,Congenital Blindness,PT_0TEX2MYH,SD_PREASA7S
28907,HP:0004383,Hypoplastic Left Heart Syndrome,PT_SP2K1TJG,SD_PREASA7S
20607,HP:0001711,Abnormal Left Ventricle,PT_0BB4MN8K,SD_PREASA7S
36982,HP:0011620,Abnormal Abdominal Situs,PT_0VFSR0T7,SD_PREASA7S
243,HP:0000023,Inguinal Hernia,PT_EG28YQ73,SD_PREASA7S


In [3]:
Counter([i for i in df['hpo_id_phenotype'] if ':' not in i])


Counter({'0000957': 1,
         '0000964': 1,
         '0002019': 1,
         '0007359': 1,
         '0012452': 1,
         'HP_0100659': 1,
         'No Match': 307,
         'Not Applicable': 2,
         'Not Reported': 3893,
         '**see notes': 1,
         '\xa0NA': 1})

In [3]:
exclude = df[~df['hpo_id_phenotype'].str.contains(':')]['hpo_id_phenotype'].drop_duplicates().values

# Remove 
df = df[~df['hpo_id_phenotype'].isin(exclude)]

In [4]:
# Read in KF STUDY NAME -- ID MAPPINGS.
# We will merge in the names of the studies to use as node_labels
# Merge on study_id
kf_map = pd.read_csv('/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/KidsFirst/KF_study_id_name_mapping.txt'
            ,sep='\t')
kf_map = kf_map.rename(columns={'kf_id':'study_id'})

In [5]:
df = pd.merge(df,kf_map,how='left',on= 'study_id')

In [7]:
# Didnt find these studies in the mapping file
Counter(df[df['short_name'].isna()]['study_id'].values)
[i for i in df[df['short_name'].isna()]['study_id'].unique() if i in kf_map['study_id'].unique()]

[]

In [8]:
kf_map.isna().sum()

study_id      0
short_name    0
short_code    0
dtype: int64

In [6]:
# Drop ~2k NAs
df = df[~df['short_name'].isna()]

In [7]:
df['participant_id'] = ['KFPT '+i.replace('_','-') for i in df['patient_id']]
#df['hpo_id_phenotype'] = ['HPO '+i for i in df['hpo_id_phenotype']]

df['study_id'] = ['KFCOHORT '+i.replace('_','-') for i in df['study_id']]

df = df.rename(columns={'participant_id':'KFPT_CodeID',
                        'hpo_id_phenotype':'HPO_CodeID',
                        'study_id':'KFCOHORT_CodeID'})

df = df.drop('source_text_phenotype',axis=1)
df.sample(10)

Unnamed: 0,HPO_CodeID,patient_id,KFCOHORT_CodeID,short_name,short_code,KFPT_CodeID
3775,HP:0000119,PT_MA8RF43A,KFCOHORT SD-DZTB5HRR,Kids First: Syndromic Cranial Dysinnervation,KF-SCD,KFPT PT-MA8RF43A
30988,HP:0006704,PT_2JS9WP8N,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-2JS9WP8N
18090,HP:0001646,PT_VJF5GAVP,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-VJF5GAVP
3921,HP:0000119,PT_TAE6T5DP,KFCOHORT SD-DZTB5HRR,Kids First: Syndromic Cranial Dysinnervation,KF-SCD,KFPT PT-TAE6T5DP
32154,HP:0007875,PT_0XYCMAHJ,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-0XYCMAHJ
23757,HP:0002021,PT_7092QVTM,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-7092QVTM
15865,HP:0001629,PT_GQ25VWAN,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-GQ25VWAN
7923,HP:0000256,PT_BXKJ3MX3,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-BXKJ3MX3
26709,HP:0002744,PT_X419D6ZH,KFCOHORT SD-DK0KRWK8,Kids First: Orofacial Cleft: African and Asian...,KF-OFCAA,KFPT PT-X419D6ZH
43975,HP:0032092,PT_9P6T879P,KFCOHORT SD-PREASA7S,Kids First: Congenital Heart Defects,KF-CHD,KFPT PT-9P6T879P


# Create edges (2 types)
1. KF patient node --- has_phenotype --- HPO node
2. KF patient node --- belongs_to_cohort --- KF cohort node

In [8]:
df['predicate'] = 'has_phenotype'
df['cohort_predicate'] = 'belongs_to_cohort'

# (1)  create KF patient node to HPO node edges
edges_phenos = df[['KFPT_CodeID','predicate','HPO_CodeID']]
edges_phenos.columns = ['subject','predicate','object']

# (2) Create KF patient node to KF cohort node
edges_cohort = df[['KFPT_CodeID','cohort_predicate','KFCOHORT_CodeID']]
edges_cohort.columns = ['subject','predicate','object']

edges_all = pd.concat([edges_cohort,edges_phenos])
edges_all.drop_duplicates().reset_index(drop=True)
edges_all.sample(10)

Unnamed: 0,subject,predicate,object
1847,KFPT PT-QZBMXGSK,belongs_to_cohort,KFCOHORT SD-PREASA7S
10283,KFPT PT-Q067W7JY,belongs_to_cohort,KFCOHORT SD-PREASA7S
27339,KFPT PT-R9K6B7EQ,has_phenotype,HP:0002786
10595,KFPT PT-89RPNXBY,belongs_to_cohort,KFCOHORT SD-PREASA7S
27292,KFPT PT-N37Y1Z0J,belongs_to_cohort,KFCOHORT SD-PREASA7S
40542,KFPT PT-DF7ZSCV6,has_phenotype,HP:0030319
30010,KFPT PT-05YE9CHY,has_phenotype,HP:0006101
36326,KFPT PT-6WGDSBFC,has_phenotype,HP:0011563
35541,KFPT PT-X99T48YH,has_phenotype,HP:0009921
427,KFPT PT-ZFFW9G3Z,has_phenotype,HP:0000023


# Create nodes (2 sets)
1. KF Patient nodes (SAB = `KFPT`)
2. KF Cohort nodes (SAB = `KFCOHORT`), cohort nodes will include the 'short_name' column as node_label

In [9]:
# Create (1)  KF Patient nodes

nodes_patient = df['KFPT_CodeID'].drop_duplicates().to_frame()
nodes_patient.columns = ['node_id']

In [10]:
nodes_patient

Unnamed: 0,node_id
0,KFPT PT-9X741E8Z
1,KFPT PT-0AQN56EH
2,KFPT PT-1HNTASHD
3,KFPT PT-2KE662T4
4,KFPT PT-2Q0TYD81
...,...
45991,KFPT PT-MDPKN7XY
45992,KFPT PT-MEZVJHNW
45993,KFPT PT-MF0B9B2M
47352,KFPT PT-S6NCWK9W


In [11]:
nodes_patient = fill_missing_cols(nodes_patient)

In [12]:
# Create (2)  KF Cohort nodes
nodes_cohort = df[['KFCOHORT_CodeID','short_name']]\
                .rename(columns={'KFCOHORT_CodeID':'node_id','short_name':'node_label'})\
                .drop_duplicates().reset_index(drop=True)

nodes_cohort = fill_missing_cols(nodes_cohort)

In [13]:
nodes_all = pd.concat([nodes_patient,nodes_cohort])

In [14]:
edges_all = edges_all.drop_duplicates().reset_index(drop=True)
nodes_all = nodes_all.drop_duplicates().reset_index(drop=True)

In [15]:
#nodes_all['node_label'] = 
#Counter([i if type(i) is not str else i.replace(':','-') for i in nodes_all['node_label']])

In [19]:
Counter([i.split(' ')[0] for i in nodes_all['node_id']])

Counter({'KFPT': 5329, 'KFCOHORT': 15})

In [20]:
edges_all

Unnamed: 0,subject,predicate,object
0,KFPT PT-9X741E8Z,belongs_to_cohort,KFCOHORT SD-0TYVY1TW
1,KFPT PT-0AQN56EH,belongs_to_cohort,KFCOHORT SD-NMVV8A1Y
2,KFPT PT-1HNTASHD,belongs_to_cohort,KFCOHORT SD-NMVV8A1Y
3,KFPT PT-2KE662T4,belongs_to_cohort,KFCOHORT SD-NMVV8A1Y
4,KFPT PT-2Q0TYD81,belongs_to_cohort,KFCOHORT SD-NMVV8A1Y
...,...,...,...
49935,KFPT PT-Q00GK4N8,has_phenotype,HPO HP:0410279
49936,KFPT PT-PNTG28PE,has_phenotype,HPO HP:0410287
49937,KFPT PT-T1XMBM7G,has_phenotype,HPO HP:0410287
49938,KFPT PT-2WJQ4XXV,has_phenotype,HPO HP:0410287


### Save KF phenotype files

In [16]:
path= '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/KidsFirst/kf_phenotypes/'

edges_all.to_csv(path+'OWLNETS_edgelist.txt',index=False,sep='\t')
nodes_all.to_csv(path+'OWLNETS_node_metadata.txt',index=False,sep='\t')

In [17]:
np.unique(edges_all['subject'].values + edges_all['object'].values)

array(['KFPT PT-0059H2XTHP:0000118', 'KFPT PT-0059H2XTHP:0000119',
       'KFPT PT-0059H2XTHP:0000364', ..., 'KFPT PT-ZZSR2WNGHP:0012824',
       'KFPT PT-ZZSR2WNGHP:0012831',
       'KFPT PT-ZZSR2WNGKFCOHORT SD-B8X3C1MX'], dtype=object)

# Read in Variant bins KF nodes and edges files that were processed on VWB

In [18]:
path= '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/KidsFirst/'

varbin_nodes = pd.read_csv(path+'variant_bin_counts/OWLNETS_node_metadata.txt',sep='\t')
#varbin_edges = pd.read_csv(path+'variant_bin_counts/OWLNETS_edgelist.txt',sep='\t')

varbin_edges = pd.read_pickle(path+'variant_bin_counts/OWLNETS_edgelist.pickle')
varbin_edges

Unnamed: 0,subject,predicate,object
0,KFGENEBIN DUX4-variant-count,gene_has_variants,HGNC HGNC:50800
1,KFGENEBIN TTC34-variant-count,gene_has_variants,HGNC HGNC:34297
2,KFGENEBIN ANKRD36C-variant-count,gene_has_variants,HGNC HGNC:32946
3,KFGENEBIN FRG2C-variant-count,gene_has_variants,HGNC HGNC:33626
4,KFGENEBIN ZNF717-variant-count,gene_has_variants,HGNC HGNC:29448
...,...,...,...
26745,KFGENEBIN BMP4-variant-count,belongs_to_cohort,KFCOHORT SD-PREASA7S
26746,KFGENEBIN ADM5-variant-count,belongs_to_cohort,KFCOHORT SD-PREASA7S
26747,KFGENEBIN TMEM115-variant-count,belongs_to_cohort,KFCOHORT SD-PREASA7S
26748,KFGENEBIN MFSD4A-variant-count,belongs_to_cohort,KFCOHORT SD-PREASA7S


In [19]:
varbin_nodes[varbin_nodes['node_id'].str.contains('MGAT5')]

Unnamed: 0,node_id,value,lowerbound,unit,node_dbxrefs,node_namespace,node_definition,node_label,upperbound,node_synonyms
325,KFGENEBIN MGAT5-variant-count,76.0,,,,,,,,
6027,KFGENEBIN MGAT5B-variant-count,12.0,,,,,,,,


# Concat the 2 sets of nodes/edges into a master set

In [20]:
nodes_master = pd.concat([nodes_all,varbin_nodes]).reset_index(drop=True)
edges_master = pd.concat([edges_all,varbin_edges]).reset_index(drop=True)

nodes_master = nodes_master.drop_duplicates()
edges_master = edges_master.drop_duplicates()

In [21]:
nodes_master['node_id'] = [i.replace('HGNC HGNC:','HGNC ') for i in nodes_master['node_id']]

In [22]:
for col in edges_master.columns:
        edges_master[col] = [i.replace('HGNC HGNC:','HGNC ') for i in edges_master[col]]

In [24]:
path

'/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/KidsFirst/'

In [23]:
edges_master.to_csv(path+'combined_files/OWLNETS_edgelist.txt',index=False,sep='\t')
nodes_master.to_csv(path+'combined_files/OWLNETS_node_metadata.txt',index=False,sep='\t')

In [29]:
path= '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/KidsFirst/'

In [30]:
nodes = pd.read_csv(path+'combined_files/OWLNETS_node_metadata.txt',sep='\t')
edges = pd.read_csv(path+'combined_files/OWLNETS_edgelist.txt',sep='\t')


In [8]:
[i for i in nodes['node_id']]

Unnamed: 0,node_id,upperbound,node_label,lowerbound,node_definition,node_dbxrefs,node_namespace,value,node_synonyms,unit
0,KFPT PT-9X741E8Z,,,,,,,,,
1,KFPT PT-0AQN56EH,,,,,,,,,
2,KFPT PT-1HNTASHD,,,,,,,,,
3,KFPT PT-2KE662T4,,,,,,,,,
4,KFPT PT-2Q0TYD81,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
76700,HGNC HGNC:1071,,,,,,,,,
76701,HGNC HGNC:27293,,,,,,,,,
76702,HGNC HGNC:30055,,,,,,,,,
76703,HGNC HGNC:25433,,,,,,,,,


In [31]:
nodes[nodes['node_id'].str.startswith('HGNC')]

Unnamed: 0,node_id,value,node_dbxrefs,lowerbound,upperbound,node_synonyms,node_label,node_namespace,node_definition,unit


In [32]:
edges[edges['object'].str.startswith('HGNC')]

Unnamed: 0,subject,predicate,object
49940,KFGENEBIN DUX4-variant-count,gene_has_variants,HGNC 50800
49941,KFGENEBIN TTC34-variant-count,gene_has_variants,HGNC 34297
49942,KFGENEBIN ANKRD36C-variant-count,gene_has_variants,HGNC 32946
49943,KFGENEBIN FRG2C-variant-count,gene_has_variants,HGNC 33626
49944,KFGENEBIN ZNF717-variant-count,gene_has_variants,HGNC 29448
...,...,...,...
63310,KFGENEBIN BMP4-variant-count,gene_has_variants,HGNC 1071
63311,KFGENEBIN ADM5-variant-count,gene_has_variants,HGNC 27293
63312,KFGENEBIN TMEM115-variant-count,gene_has_variants,HGNC 30055
63313,KFGENEBIN MFSD4A-variant-count,gene_has_variants,HGNC 25433


In [53]:
notin = []
for n in set(np.concatenate([edges['subject'].values, edges['object'].values])):
    if n not in set(nodes['node_id']):
        notin.append(n)

In [57]:
np.unique([i.split(' ')[0] for i in notin])

array(['HGNC', 'HPO'], dtype='<U4')

In [80]:
# Read in nodes and add hgnc and hpo 
nodes = pd.read_csv(path+'combined_files/OWLNETS_node_metadata.txt',sep='\t')
edges = pd.read_csv(path+'combined_files/OWLNETS_edgelist.txt',sep='\t')

hgnc_hpo_nodes = pd.DataFrame(edges[edges['object'].str.startswith(('HGNC','HPO'))]['object'],columns=['node_id'])
hgnc_hpo_nodes = pd.DataFrame(edges[edges['object'].str.startswith(('HGNC','HPO'))]['object'])
hgnc_hpo_nodes.columns = ['node_id']
hgnc_hpo_nodes = fill_missing_cols(hgnc_hpo_nodes)

In [81]:
nodes = pd.concat([nodes,hgnc_hpo_nodes])
nodes.to_csv(path+'combined_files/OWLNETS_node_metadata.txt',index=False,sep='\t')

In [82]:
nodes

Unnamed: 0,node_id,upperbound,node_label,lowerbound,node_definition,node_dbxrefs,node_namespace,value,node_synonyms,unit
0,KFPT PT-9X741E8Z,,,,,,,,,
1,KFPT PT-0AQN56EH,,,,,,,,,
2,KFPT PT-1HNTASHD,,,,,,,,,
3,KFPT PT-2KE662T4,,,,,,,,,
4,KFPT PT-2Q0TYD81,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
63310,HGNC HGNC:1071,,,,,,,,,
63311,HGNC HGNC:27293,,,,,,,,,
63312,HGNC HGNC:30055,,,,,,,,,
63313,HGNC HGNC:25433,,,,,,,,,


In [39]:
Counter([i.split(' ')[0] for i in edges['subject']])

Counter({'KFPT': 49940, 'KFGENEBIN': 26750})

In [41]:
Counter([i.split(' ')[0] for i in edges['object']])

Counter({'KFCOHORT': 18704, 'HPO': 44611, 'HGNC': 13375})

In [30]:
c='/Users/stearb/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-90e3a101-4416-4df0-9863-b8e100970fac/import'
codes= pd.read_csv(c+'/CODEs.csv')

  codes= pd.read_csv(c+'/CODEs.csv')


In [35]:
codes[codes['CodeID:ID'].str.startswith('KF')]

Unnamed: 0,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
19907401,KFPT PT-9X741E8Z,KFPT,PT-9X741E8Z,,,,
19907402,KFPT PT-0AQN56EH,KFPT,PT-0AQN56EH,,,,
19907403,KFPT PT-1HNTASHD,KFPT,PT-1HNTASHD,,,,
19907404,KFPT PT-2KE662T4,KFPT,PT-2KE662T4,,,,
19907405,KFPT PT-2Q0TYD81,KFPT,PT-2Q0TYD81,,,,
...,...,...,...,...,...,...,...
19982777,KFVARBIN chr8.94711001-94712000,KFVARBIN,chr8.94711001-94712000,2.0,,,
19982778,KFVARBIN chr7.57481001-57482000,KFVARBIN,chr7.57481001-57482000,2.0,,,
19982779,KFVARBIN chr7.73510001-73511000,KFVARBIN,chr7.73510001-73511000,2.0,,,
19982780,KFVARBIN chr3.125071001-125072000,KFVARBIN,chr3.125071001-125072000,2.0,,,
