# This file was converted from the create_nodes_edges Zeppelin (.zpln) notebook with a command-line tool. This was done only so that the workflow could be put on GitHub (GitHub cannot render Zeppelin notebooks).

# Run this notebook after running get_variants and producing the chromosome files in merged_snps/

In [1]:
import pandas as pd
import numpy as np

def fill_missing_cols(df):
    """Return a copy of *df* padded with the node-metadata columns it lacks.

    Every column in the expected list below that is not already in ``df``
    is appended and filled with NaN. Existing columns keep their position
    and values; the added columns always follow in the fixed order of the
    list, so the layout of the result is reproducible from run to run.

    Args:
        df: pandas DataFrame that contains at least a ``node_id`` column.

    Returns:
        A new DataFrame with the same index as ``df``: its original columns
        followed by the missing metadata columns.

    Raises:
        ValueError: if ``df`` has no ``node_id`` column.
    """
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')

    # An ordered list (not a set) keeps the appended columns in a stable
    # order; a set difference would depend on string hashing.
    expected_cols = [
        'node_label', 'node_synonyms', 'node_dbxrefs',
        'node_definition', 'node_namespace', 'value',
        'lowerbound', 'upperbound', 'unit',
    ]
    missing_cols = [col for col in expected_cols if col not in df.columns]

    # Build the NaN block on df's own index so the column-wise concat lines
    # up row for row instead of aligning on a fresh RangeIndex.
    nan_cols_df = pd.DataFrame(
        np.full([len(df), len(missing_cols)], np.nan),
        columns=missing_cols,
        index=df.index,
    )
    return pd.concat([df, nan_cols_df], axis=1)
    

In [6]:
#merged_all = spark.read.parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/merged_snps/*.parquet')
#print(merged_all.count())
#merged_all.show()

In [7]:
# Read the CHD_merged_genes_inner parquet table from S3 with Spark, then
# print its row count and preview the first rows.
CHD_merged_genes_inner = spark.read.parquet(
    's3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/CHD_merged_genes_inner.parquet'
)
print(CHD_merged_genes_inner.count())
CHD_merged_genes_inner.show()

In [8]:
# Remove fully duplicated rows, then report how many rows remain.
CHD_merged_genes_inner = CHD_merged_genes_inner.dropDuplicates()
CHD_merged_genes_inner.count()

In [9]:
# Load the tab-separated HGNC master table and keep only the two columns
# needed to map a gene symbol to its HGNC id.
hgnc_master = pd.read_csv(
    '~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/hgnc_master.txt',
    sep='\t',
)
hgnc_master_merge_cols = hgnc_master.loc[:, ['hgnc_id', 'symbol']]

In [10]:
#CHD_merged_genes_inner.show()

# Count rows per gene symbol in Spark, sort by descending count, and pull
# the (small) aggregated result into pandas.
gb = (
    CHD_merged_genes_inner
    .groupBy('symbol')
    .count()
    .orderBy('count', ascending=False)
    .toPandas()
)
gb

In [11]:
#merged_all = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/CHD_merged_genes.parquet")
#gb = merged_all.groupBy('node_id').count().orderBy('count', ascending=False).toPandas()


In [12]:
#merged_df[merged_df['ensembl_gene_id']!=None]
#merged_df['ensembl_gene_id'] =
#merged_df[[True if type(i)==str else False for i in merged_df['ensembl_gene_id'] ]]


In [13]:
#merged_all_genes = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/CHD_merged_genes.parquet")


In [14]:
# format code nodes (for KFVARBINS nodes, which are connected to CHLO nodes)

#gb['node_id'] = ['CHLO ' + i.replace(':','.') for i in gb['node_id']]
#gb['KFVARBIN_CodeID'] = ['KFVARBIN '+i.replace(':','.').split(' ')[-1] for i in gb['node_id']]
#gb.head(5)


In [15]:
# Attach HGNC ids to the per-symbol counts. No `on=` is given, so pandas
# joins on the columns the two frames share; the default inner join drops
# symbols that have no match in the HGNC table.
gb = gb.merge(hgnc_master_merge_cols)

In [16]:
# Display gb to inspect the result of the merge with the HGNC mapping.
gb

In [17]:
# Format code nodes for the KFGENEBIN nodes (which are connected to HGNC nodes).
# node_id is "<symbol>-variant-count", with any ':' in the symbol replaced by '-'.
# (Earlier CHLO-based variant: ['CHLO ' + i.replace(':','.') for i in gb['node_id']])
gb['node_id'] = [f"{symbol.replace(':', '-')}-variant-count" for symbol in gb['symbol']]

# The code id is the node_id prefixed with "KFGENEBIN" and a space.
gb['KFGENEBIN_CodeID'] = [f'KFGENEBIN {node_id}' for node_id in gb['node_id']]
gb.head(5)

In [18]:
# Every bin will show up at least once?
#gb = gb[gb['count'] > 1]
#print(len(gb))

In [19]:
# Build the gene-bin node table: the code id becomes node_id, the variant
# count becomes value, and the remaining metadata columns are added as NaN.
nodes = fill_missing_cols(
    gb[['KFGENEBIN_CodeID', 'count']].rename(
        columns={'KFGENEBIN_CodeID': 'node_id', 'count': 'value'}
    )
)

In [20]:
#name_id_df = spark.table('studies').select(['kf_id','short_name','short_code'])
#name_id_df.toPandas().to_csv('~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/KF_study_id_name_mapping.txt',sep='\t',index=False)

In [21]:
# Prefix every HGNC id with "HGNC" and a space to form the object code id.
gb['HGNC_CodeID'] = ['HGNC ' + hgnc_id for hgnc_id in gb['hgnc_id']]

In [22]:
# Define the gene-bin -> HGNC edges as (subject, predicate, object) triples.
gb['predicate'] = 'gene_has_variants'
edges = gb[['KFGENEBIN_CodeID', 'predicate', 'HGNC_CodeID']].rename(
    columns={'KFGENEBIN_CodeID': 'subject', 'HGNC_CodeID': 'object'}
)


In [23]:
# Define the gene-bin -> cohort edges. Every bin is linked to the same
# cohort node, whose code id and label are stored as constant columns.
gb['KFCOHORT_CodeID'] = 'KFCOHORT SD-PREASA7S'
gb['KFCOHORT_node_label'] = 'Kids First: Congenital Heart Defects'
gb['cohort_predicate'] = 'belongs_to_cohort'

edges_cohort = gb[['KFGENEBIN_CodeID', 'cohort_predicate', 'KFCOHORT_CodeID']].rename(
    columns={
        'KFGENEBIN_CodeID': 'subject',
        'cohort_predicate': 'predicate',
        'KFCOHORT_CodeID': 'object',
    }
)

# Stack both edge sets into a single edge list.
edges_all = pd.concat([edges, edges_cohort])


In [24]:
from collections import Counter

# Sanity check: tally edge subjects by the text before their first space.
Counter(subject.split(' ')[0] for subject in edges_all['subject'])

In [25]:
# Sanity check: tally edge objects by the text before their first space.
Counter(obj.split(' ')[0] for obj in edges_all['object'])

In [26]:
# Drop repeated edges, renumber the rows from 0, and save the edge list as
# a pickle.
edges_all = edges_all.drop_duplicates()
edges_all = edges_all.reset_index(drop=True)
edges_all.to_pickle(
    '~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/OWLNETS_edgelist.pickle'
)

In [28]:
# Write the edge list as a tab-separated text file without the row index.
edges_all.to_csv(
    '~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/OWLNETS_edgelist.txt',
    sep='\t',
    index=False,
)

In [29]:
# Build the cohort node table: one row per distinct cohort code id/label,
# with the remaining metadata columns added as NaN.
nodes_cohort = fill_missing_cols(
    gb[['KFCOHORT_CodeID', 'KFCOHORT_node_label']]
    .rename(columns={'KFCOHORT_CodeID': 'node_id',
                     'KFCOHORT_node_label': 'node_label'})
    .drop_duplicates()
)
nodes_cohort

In [30]:
# Make sure the variant counts in 'value' are stored as integers.
nodes = nodes.astype({'value': int})

In [31]:
# Combine the gene-bin nodes and the cohort node(s) into one table, drop
# repeated rows, and renumber the rows from 0. (One de-duplication pass is
# enough; repeating it on the already de-duplicated frame changes nothing.)
nodes_all = pd.concat([nodes, nodes_cohort]).drop_duplicates().reset_index(drop=True)

# The cohort rows hold NaN in 'value', so the concat upcasts the integer
# counts to float and they would be written out as e.g. "12.0". Casting to
# pandas' nullable Int64 keeps the counts integral and leaves the missing
# entries as <NA> (still written as empty fields by to_csv).
nodes_all['value'] = nodes_all['value'].astype('Int64')

In [32]:
# Sanity check: tally nodes by the text before the first space of node_id.
Counter(node_id.split(' ')[0] for node_id in nodes_all['node_id'])

In [33]:
# Write the node metadata as a tab-separated text file without the row index.
nodes_all.to_csv(
    '~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/OWLNETS_node_metadata.txt',
    sep='\t',
    index=False,
)

In [34]:
# Display the final node table (it is already a pandas DataFrame, so no
# conversion is needed).
nodes_all #.toPandas()

In [35]:
# Display the final node table once more for inspection.
nodes_all