# Run this notebook after running get_variants and producing the chromosome files in merged_snps/



In [1]:
%pyspark
import pandas as pd
import numpy as np

def fill_missing_cols(df):
    """Return a copy of *df* extended with any absent node-metadata columns.

    Every metadata column that *df* does not already have is appended as an
    all-NaN column. The appended columns always follow the order declared
    below, so the resulting column layout is the same on every run.

    Args:
        df: pandas DataFrame that contains at least a ``node_id`` column.

    Returns:
        A new DataFrame with the columns of *df* followed by the missing
        metadata columns, sharing the index of *df*.

    Raises:
        ValueError: if *df* has no ``node_id`` column.
    """
    if 'node_id' not in df.columns:
        raise ValueError('Must have at least a "node_id" column.')

    # A tuple (not a set) so that the order of the added columns is stable.
    all_cols = (
        'node_label', 'node_synonyms', 'node_dbxrefs',
        'node_definition', 'node_namespace',
        'value', 'lowerbound', 'upperbound', 'unit',
    )
    missing_cols = [col for col in all_cols if col not in df.columns]

    # Build the filler frame directly on df's index so concat aligns row-for-row.
    nan_cols_df = pd.DataFrame(
        np.full((len(df), len(missing_cols)), np.nan),
        index=df.index,
        columns=missing_cols,
    )
    return pd.concat([df, nan_cols_df], axis=1)
    

In [2]:
%sh
# Write ~/.sevenbridges/credentials, keeping any previous file as credentials.bak.
# The token is read from the SB_AUTH_TOKEN environment variable so that no
# secret is stored in the notebook itself.
# NOTE(review): a token was previously hard-coded in this cell; treat it as
# exposed and revoke it.
: "${SB_AUTH_TOKEN:?Set SB_AUTH_TOKEN to your CAVATICA authentication token}"
mkdir -p ~/.sevenbridges
# Stop here if the directory is unusable, rather than writing into the wrong place.
cd ~/.sevenbridges/ || exit 1
if [[ -e 'credentials' ]]; then mv credentials credentials.bak; fi
# Create the credentials file readable by the current user only.
umask 077
cat << EOF > credentials
[default]
api_endpoint = https://cavatica-api.sbgenomics.com/v2
auth_token   = ${SB_AUTH_TOKEN}
EOF


In [3]:
%sh
# Download the sbfs binary into the home directory unless an executable copy
# is already there.
cd ~
if ! [[ -x 'sbfs' ]]; then
    # -f makes curl fail on HTTP errors instead of saving the error page as
    # "sbfs"; the file is only marked executable after a successful download,
    # so a failed attempt is retried on the next run.
    if curl -f https://igor.sbgenomics.com/downloads/sbfs/linux-amd64/sbfs -O; then
        chmod 755 sbfs
    else
        echo 'Failed to download sbfs' >&2
        rm -f sbfs
        exit 1
    fi
fi

In [4]:
%sh
# Mount ~/cavatica with sbfs (unless it already has content) and list the projects.
cd ~
# Create the mount point when 'cavatica' does not pass the -x test.
if ! [[ -x 'cavatica' ]]; then
    mkdir ~/cavatica
fi
# Only mount when the mount point is currently empty.
if [[ -z "$(ls -A ~/cavatica)" ]]; then
    ~/sbfs mount ~/cavatica
fi
echo 'Wait until mounting is done ...'
# Poll once per second for as long as the mount_status file exists.
until ! [[ -e ~/cavatica/mount_status ]]; do
    sleep 1
done
ls -l ~/cavatica/projects/

In [5]:
%sh
# List the objects under the merged_snps/ prefix in S3.
aws s3 ls s3://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/merged_snps/

In [6]:
%pyspark
#merged_all = spark.read.parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/merged_snps/*.parquet')
#print(merged_all.count())
#merged_all.show()

In [7]:
%pyspark
# Read the CHD_merged_genes_inner parquet dataset from S3 into a Spark DataFrame,
# then print its row count and show a preview of its rows.
CHD_merged_genes_inner = spark.read.parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/CHD_merged_genes_inner.parquet')
print(CHD_merged_genes_inner.count())
CHD_merged_genes_inner.show()

In [8]:
%pyspark
# Remove exact duplicate rows, then evaluate the new row count as the cell's result.
CHD_merged_genes_inner = CHD_merged_genes_inner.drop_duplicates()
CHD_merged_genes_inner.count()

In [9]:
%pyspark
# Load the tab-separated HGNC master table from the mounted CAVATICA project
# and keep only the two columns used to map a gene symbol to its HGNC id.
hgnc_master = pd.read_csv(
    '~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/hgnc_master.txt',
    sep='\t',
)
hgnc_master_merge_cols = hgnc_master.loc[:, ['hgnc_id', 'symbol']]

In [10]:
%pyspark
# Count rows per gene symbol in Spark, sort from most to fewest, and bring the
# result into pandas for the remaining steps.
gb = (
    CHD_merged_genes_inner
    .groupBy('symbol')
    .count()
    .orderBy('count', ascending=False)
    .toPandas()
)
gb

In [11]:
%pyspark
#merged_all = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/CHD_merged_genes.parquet")
#gb = merged_all.groupBy('node_id').count().orderBy('count', ascending=False).toPandas()


In [12]:
%pyspark
#merged_df[merged_df['ensembl_gene_id']!=None]
#merged_df['ensembl_gene_id'] =
#merged_df[[True if type(i)==str else False for i in merged_df['ensembl_gene_id'] ]]


In [13]:
%pyspark
#merged_all_genes = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/DataDistillery/CHD_merged_genes.parquet")


In [14]:
%pyspark
# format code nodes (for KFVARBINS nodes, which are connected to CHLO nodes)

#gb['node_id'] = ['CHLO ' + i.replace(':','.') for i in gb['node_id']]
#gb['KFVARBIN_CodeID'] = ['KFVARBIN '+i.replace(':','.').split(' ')[-1] for i in gb['node_id']]
#gb.head(5)


In [15]:
%pyspark
# Attach HGNC ids to the per-symbol counts. With no keys given, pandas joins
# on the columns the two frames share and keeps only matching rows.
gb = pd.merge(left=gb, right=hgnc_master_merge_cols)

In [16]:
%pyspark
# Display the merged per-symbol table.
gb

In [17]:
%pyspark
# Format the code nodes for the KFGENEBIN nodes (which are connected to HGNC nodes).
# node_id is the gene symbol with ':' replaced by '-' plus a '-variant-count' suffix.
gb['node_id'] = [
    f"{symbol.replace(':', '-')}-variant-count" for symbol in gb['symbol']
]
# The code id is the node_id prefixed with its 'KFGENEBIN' source label.
gb['KFGENEBIN_CodeID'] = [
    ' '.join(('KFGENEBIN', node_id)) for node_id in gb['node_id']
]
gb.head(5)

In [18]:
%pyspark
# Every bin will show up at least once?
#gb = gb[gb['count'] > 1]
#print(len(gb))

In [19]:
%pyspark
# Node table for the gene bins: the KFGENEBIN code becomes node_id and the
# per-symbol count becomes value; the remaining metadata columns are added as NaN.
nodes = fill_missing_cols(
    gb[['KFGENEBIN_CodeID', 'count']].rename(
        columns={'KFGENEBIN_CodeID': 'node_id', 'count': 'value'}
    )
)

In [20]:
%pyspark
#name_id_df = spark.table('studies').select(['kf_id','short_name','short_code'])
#name_id_df.toPandas().to_csv('~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/KF_study_id_name_mapping.txt',sep='\t',index=False)

In [21]:
%pyspark
# Prefix every HGNC id with its 'HGNC' source label to form the code id.
gb['HGNC_CodeID'] = [' '.join(('HGNC', hgnc_id)) for hgnc_id in gb['hgnc_id']]

In [22]:
%pyspark
# Define the gene-bin -> HGNC edges as (subject, predicate, object) triples.
gb['predicate'] = 'gene_has_variants'
edges = gb[['KFGENEBIN_CodeID', 'predicate', 'HGNC_CodeID']].rename(
    columns={'KFGENEBIN_CodeID': 'subject', 'HGNC_CodeID': 'object'}
)


In [23]:
%pyspark


# Define the relationship of every gene bin to the cohort.
gb['KFCOHORT_CodeID'] = 'KFCOHORT SD-PREASA7S'
gb['KFCOHORT_node_label'] = 'Kids First: Congenital Heart Defects'
gb['cohort_predicate'] = 'belongs_to_cohort'

# Gene-bin -> cohort edges, using the same (subject, predicate, object) layout.
edges_cohort = gb[['KFGENEBIN_CodeID', 'cohort_predicate', 'KFCOHORT_CodeID']].rename(
    columns={
        'KFGENEBIN_CodeID': 'subject',
        'cohort_predicate': 'predicate',
        'KFCOHORT_CodeID': 'object',
    }
)

# Stack both edge sets into a single edge list.
edges_all = pd.concat([edges, edges_cohort])


In [24]:
%pyspark
from collections import Counter

# Tally edge subjects by the prefix that precedes the first space of the code id.
Counter(code_id.partition(' ')[0] for code_id in edges_all['subject'])

In [25]:
%pyspark
# Tally edge objects by the prefix that precedes the first space of the code id.
Counter(code_id.partition(' ')[0] for code_id in edges_all['object'])

In [26]:
%pyspark
# Drop repeated edges and renumber the rows from zero before saving.
edges_all = edges_all.drop_duplicates()
edges_all = edges_all.reset_index(drop=True)
# Save the edge list as a pickle in the mounted CAVATICA project.
edges_all.to_pickle(
    '~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/OWLNETS_edgelist.pickle'
)

In [27]:
%pyspark



In [28]:
%pyspark

# Write the edge list as a tab-separated file (without the index column) in the mounted CAVATICA project.
edges_all.to_csv('~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/OWLNETS_edgelist.txt',sep='\t',index=False)

In [29]:
%pyspark
# Node table for the cohort: take the cohort id/label pair from gb, rename the
# columns to the node layout, collapse the repeated rows, and add the remaining
# metadata columns as NaN.
nodes_cohort = gb[['KFCOHORT_CodeID', 'KFCOHORT_node_label']]
nodes_cohort = nodes_cohort.rename(
    columns={'KFCOHORT_CodeID': 'node_id', 'KFCOHORT_node_label': 'node_label'}
)
nodes_cohort = nodes_cohort.drop_duplicates()
nodes_cohort = fill_missing_cols(nodes_cohort)
nodes_cohort

In [30]:
%pyspark
# Store the gene-bin counts as integers.
# NOTE(review): nodes_cohort carries NaN in 'value', so concatenating nodes with
# it afterwards upcasts this column to float again.
nodes['value'] = nodes['value'].astype(int)

In [31]:
%pyspark
# Combine the gene-bin and cohort node tables, drop exact duplicate rows and
# renumber the rows from zero.
nodes_all = pd.concat([nodes, nodes_cohort]).drop_duplicates().reset_index(drop=True)
# The cohort rows have no value (NaN), so the concat upcasts 'value' to float.
# The nullable integer dtype keeps the counts as integers (written as "12", not
# "12.0") while leaving the missing entries empty.
nodes_all['value'] = nodes_all['value'].astype('Int64')

In [32]:
%pyspark
# Tally nodes by the prefix that precedes the first space of the node id.
Counter(node_id.partition(' ')[0] for node_id in nodes_all['node_id'])

In [33]:
%pyspark

# Write the node metadata as a tab-separated file (without the index column) in the mounted CAVATICA project.
nodes_all.to_csv('~/cavatica/projects/taylordm/taylor-urbs-r03-kf-cardiac/DataDistillery/OWLNETS_node_metadata.txt',sep='\t',index=False)

In [34]:
%pyspark
nodes_all  # display the combined node table (already a pandas DataFrame)

In [35]:
%pyspark
# Display the combined node table.
nodes_all