In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', None)
from collections import Counter
from glob import glob

In [None]:
# connect to CHLO
# GENCODE to CHLO

# Use case 1: Uploaded 3/16 to globus

In [None]:
'''
I’ve just uploaded a complete set of nodes and edges TSV files for our first 
proposed use case. Would someone be able to look at these files and let us know if 
everything looks fine?
 
For our use case we determined which genes overlap with the binding site coordinates of
specific RNA binding proteins. We are currently using ENSEMBL gene ids to represent these 
genes because we found the overlaps using GENCODE v43 gene annotations. We were previously 
planning to use HGNC ids but not all genes had an HGNC id available. When available, the HGNC id
belonging to a specific gene is listed in node_dbxrefs. Would you confirm this is fine?
'''

In [2]:
ERCC_path = '/Users/stearb/Desktop/DESKTOP_TRANSFER/DataDistilleryFiles/ERCC/'

In [2]:
nodes1 = pd.read_csv(ERCC_path+'useCase1/ERCC_UseCase1_nodes.tsv',sep='\t',header=0)

NameError: name 'ERCC_path' is not defined

In [118]:
nodes1[nodes1['node_id'].str.contains('/')]

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs
1149048,http://purl.obolibrary.org/obo/UBERON_0001359,,,
1149049,http://purl.obolibrary.org/obo/UBERON_0001969,,,
1149050,http://purl.obolibrary.org/obo/UBERON_0001836,,,
1149051,http://purl.obolibrary.org/obo/UBERON_0001977,,,
1149052,http://purl.obolibrary.org/obo/UBERON_0001088,,,


In [None]:
sab_splits = [i[0] for i in nodes1['node_id'].str.split(' ')]
Counter(sab_splits)

nodes1['sab'] = sab_splits
nodes1.head()

In [6]:
nodes1['sab'] = sab_splits
nodes1.head()

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs,sab
0,UNIPROTKB Q8N302,,,,UNIPROTKB
1,UNIPROTKB P35269,,,,UNIPROTKB
2,UNIPROTKB P14866,,,,UNIPROTKB
3,UNIPROTKB Q9NUL3,,,,UNIPROTKB
4,UNIPROTKB Q9H0D6,,,,UNIPROTKB


In [7]:
nodes1[nodes1['sab']=='ENCODE_RBS_150_NO_OVERLAP']

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs,sab
150,ENCODE_RBS_150_NO_OVERLAP chr1:12293606-12293614_plus_b38_FXR1,chr1:12293606-12293614_plus_b38_FXR1,,,ENCODE_RBS_150_NO_OVERLAP
151,ENCODE_RBS_150_NO_OVERLAP chr1:12318124-12318156_plus_b38_FXR1,chr1:12318124-12318156_plus_b38_FXR1,,,ENCODE_RBS_150_NO_OVERLAP
152,ENCODE_RBS_150_NO_OVERLAP chr1:12327804-12327855_plus_b38_FXR1,chr1:12327804-12327855_plus_b38_FXR1,,,ENCODE_RBS_150_NO_OVERLAP
153,ENCODE_RBS_150_NO_OVERLAP chr1:15929840-15929851_plus_b38_FXR1,chr1:15929840-15929851_plus_b38_FXR1,,,ENCODE_RBS_150_NO_OVERLAP
154,ENCODE_RBS_150_NO_OVERLAP chr1:15929909-15929912_plus_b38_FXR1,chr1:15929909-15929912_plus_b38_FXR1,,,ENCODE_RBS_150_NO_OVERLAP
...,...,...,...,...,...
458124,ENCODE_RBS_150_NO_OVERLAP chrX:70344076-70344164_plus_b38_CPSF6,chrX:70344076-70344164_plus_b38_CPSF6,,,ENCODE_RBS_150_NO_OVERLAP
458125,ENCODE_RBS_150_NO_OVERLAP chrX:77652194-77652196_minus_b38_CPSF6,chrX:77652194-77652196_minus_b38_CPSF6,,,ENCODE_RBS_150_NO_OVERLAP
458126,ENCODE_RBS_150_NO_OVERLAP chrX:123620548-123620627_minus_b38_CPSF6,chrX:123620548-123620627_minus_b38_CPSF6,,,ENCODE_RBS_150_NO_OVERLAP
458127,ENCODE_RBS_150_NO_OVERLAP chrX:124091782-124091795_plus_b38_CPSF6,chrX:124091782-124091795_plus_b38_CPSF6,,,ENCODE_RBS_150_NO_OVERLAP


## Edges from use case 1

In [12]:
edges1 = pd.read_csv(ERCC_path+'useCase1/ERCC_UseCase1_edges.tsv',sep='\t',header=0)

In [67]:
edges1 = edges1.rename(columns={'subject_id':'subject',
                               'relationship':'predicate',
                               'object_id':'object'})

# 5/2 email: These are “ERCC_UseCase2a_nodes.tsv.gz”, “ERCC_UseCase2a_edges.tsv.gz”, and “ERCC_UseCase2b.tar.gz”. The use case 2a files contain information for eQTLs and the use case 2b zip contains information for cCREs. Within the tar.gz archive there are 37 files. The file named “ERCC_UseCase2b_gene_tissue_nodes.tsv” should be ingested first. The others do not need to be ingested in a specific order.
 
### I also noticed that there were some nodes and edges missing from our prior submission. These are contained within the files “ERCC_UseCase1_missing_nodes.tsv.gz” and “ERCC_UseCase1_missing_edges.tsv.gz”. These should complete the exRNA portion of our submission.
 

# First Combine the missing nodes and missing edges from use case 1 with the main edges and nodes files from use case 1

In [16]:
usecase1_missing_nodes = pd.read_csv(ERCC_path+
                                     'useCase1_missing_nodes_edges/ERCC_UseCase1_missing_nodes.tsv',sep='\t')

In [13]:
def fill_missing_cols(df):
    
    if 'node_id' not in df.columns:
        raise ValueError
        
    all_cols = set([ 'node_label', 'node_synonyms', 'node_dbxrefs',
            'node_definition','node_namespace','value','lowerbound','upperbound','unit'])
   
    missing_cols = list(all_cols - set(df.columns))
    nan_cols_df = pd.DataFrame(np.full([len(df), len(missing_cols)], np.nan),columns=missing_cols)
    nan_cols_df.index = df.index
    return pd.concat([df,nan_cols_df],axis=1)

# remove -,.,: ?
# is ENSEMBL GENE_ID ok or should it be ENCODE GENE_ID?

In [123]:
#nan_cols_nodes1 = pd.DataFrame(np.full([len(nodes1), len(missing_cols)], np.nan),columns=missing_cols)
#nan_cols_missing_nodes1 = pd.DataFrame(np.full([len(usecase1_missing_nodes), 
#                    len(usecase1_missing_nodes_missing_cols)], np.nan),columns=usecase1_missing_nodes_missing_cols)


nodes1_full = fill_missing_cols(nodes1)
usecase1_missing_nodes_full = fill_missing_cols(usecase1_missing_nodes)

### Append (usecase1) `nodes` and `missing_nodes together`

In [124]:
# Reorder them both so the correct columns get concatenated on top of eachother
nodes_usecase1_final = pd.concat([
                                usecase1_missing_nodes_full[np.sort(usecase1_missing_nodes_full.columns)],
                                nodes1_full[np.sort(nodes1_full.columns)]
                                ],axis=0)

In [125]:
nodes_usecase1_final.to_csv(ERCC_path+'USECASE_1_CLEANED/OWLNETS_node_metadata.txt',index=False,sep='\t')

### Load in and append usecase1 missing edges to usecase1 edges file

In [126]:
usecase1_missing_edges = pd.read_csv(ERCC_path+
                                     'useCase1_missing_nodes_edges/ERCC_UseCase1_missing_edges.tsv',sep='\t')

usecase1_missing_edges = usecase1_missing_edges.rename(columns={'subject_id':'subject',
                                                               'relationship':'predicate',
                                                               'object_id':'object'})
usecase1_missing_edges

Unnamed: 0,subject,predicate,object
0,ENCODE_RBS_150_NO_OVERLAP chr1:234609218-234609225_minus_b38_ABCF1,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_K562 chr1:234609152-234609225_minus_b38_ABCF1
1,ENCODE_RBS_150_NO_OVERLAP chr17:28722581-28722582_plus_b38_ABCF1,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_K562 chr17:28722581-28722593_plus_b38_ABCF1
2,ENCODE_RBS_150_NO_OVERLAP chr17:82087399-82087437_minus_b38_ABCF1,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_K562 chr17:82087399-82087437_minus_b38_ABCF1
3,ENCODE_RBS_150_NO_OVERLAP chr2:202276429-202276430_plus_b38_ABCF1,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_K562 chr2:202276429-202276494_plus_b38_ABCF1
4,ENCODE_RBS_150_NO_OVERLAP chr5:648030-648037_plus_b38_ABCF1,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_K562 chr5:647955-648037_plus_b38_ABCF1
...,...,...,...
9275,ENCODE_RBS_150_NO_OVERLAP chr1:173866875-173866876_minus_b38_WDR3,http://purl.obolibrary.org/obo/RO_0002131,ENSEMBL ENSG00000234741.10
9276,ENCODE_RBS_150_NO_OVERLAP chr17:19062036-19062043_plus_b38_WDR3,http://purl.obolibrary.org/obo/RO_0002131,ENSEMBL ENSG00000265185.6
9277,ENCODE_RBS_150_NO_OVERLAP chr2:132257703-132257734_minus_b38_WDR3,http://purl.obolibrary.org/obo/RO_0002131,ENSEMBL ENSG00000163046.16
9278,ENCODE_RBS_150_NO_OVERLAP chr20:26208702-26208706_minus_b38_WDR3,http://purl.obolibrary.org/obo/RO_0002131,ENSEMBL ENSG00000227195.11


In [127]:
# make sure col order is the same before append
edges_usecase1_final = pd.concat([edges1[['subject','predicate','object']],
                                    usecase1_missing_edges[['subject','predicate','object']]
                                ],axis=0)

edges_usecase1_final.to_csv(ERCC_path+'USECASE_1_CLEANED/OWLNETS_edgelist.txt',index=False,sep='\t')

In [37]:
# check why these edges didnt make it into the graph

# ENCODE_RBS_150_NO_OVERLAP	correlated_in	UBERON	22092	0	0.00
# ENCODE_RBS_150_NO_OVERLAP	not_correlated_in	UBERON	102	0	0.00


#nodes_usecase1 = pd.read_csv(ERCC_path+'USECASE_1_CLEANED/OWLNETS_node_metadata.txt',sep='\t')
edges_usecase1 = pd.read_csv(ERCC_path+'USECASE_1_CLEANED/OWLNETS_edgelist.txt',sep='\t')

In [54]:
edges_usecase1['object'] = [i.split('/')[-1] for i in edges_usecase1['object']]

In [57]:
edges_usecase1['object'] = [i.replace('_','/') for i if 'UBERON' in i else i in edges_usecase1['object']]

SyntaxError: cannot assign to conditional expression (2652948192.py, line 1)

In [40]:

#sab_splits = 
edges_usecase1['subject_sab'] = [i[0] for i in edges_usecase1['subject'].str.split(' ')]
edges_usecase1['object_sab'] = [i[0] for i in edges_usecase1['object'].str.split(' ')]


In [42]:
edges_usecase1[['subject_sab','predicate','object_sab']].drop_duplicates()

Unnamed: 0,subject_sab,predicate,object_sab
0,UNIPROTKB,http://purl.obolibrary.org/obo/RO_0002436,ENCODE_RBS_HepG2_K562
20016,UNIPROTKB,http://purl.obolibrary.org/obo/RO_0002436,ENCODE_RBS_K562
20362,UNIPROTKB,http://purl.obolibrary.org/obo/RO_0002436,ENCODE_RBS_HepG2
690919,ENCODE_RBS_150_NO_OVERLAP,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_HepG2_K562
695915,ENCODE_RBS_150_NO_OVERLAP,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_HepG2
697748,ENCODE_RBS_150_NO_OVERLAP,http://purl.obolibrary.org/obo/RO_0002525,ENCODE_RBS_K562
1149071,UNIPROTKB,predicted_in,http://purl.obolibrary.org/obo/UBERON_0001359
1149073,UNIPROTKB,not_predicted_in,http://purl.obolibrary.org/obo/UBERON_0001359
1149129,UNIPROTKB,predicted_in,http://purl.obolibrary.org/obo/UBERON_0001969
1149130,UNIPROTKB,not_predicted_in,http://purl.obolibrary.org/obo/UBERON_0001969


In [9]:
edges_usecase1[edges_usecase1['subject'].str.startswith('ENCODE_RBS_150_NO_OVERLAP') &\
              edges_usecase1['object'].str.startswith('ENCODE_RBS_HepG2_K562') ]['predicate'].unique()

array(['http://purl.obolibrary.org/obo/RO_0002525'], dtype=object)

# Look at use case 2a

In [27]:
# nodes
usecase2a_nodes = pd.read_csv(ERCC_path+
                                     'useCase2a/ERCC_UseCase2a_nodes.tsv',sep='\t')

In [28]:
nodes_usecase2a_final = fill_missing_cols(usecase2a_nodes)

In [29]:
sab_splits = [i[0] for i in nodes1['node_id'].str.split(' ')]
Counter(sab_splits)

nodes1['sab'] = sab_splits
nodes1.head()

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs,sab
0,UNIPROTKB Q8N302,,,,UNIPROTKB
1,UNIPROTKB P35269,,,,UNIPROTKB
2,UNIPROTKB P14866,,,,UNIPROTKB
3,UNIPROTKB Q9NUL3,,,,UNIPROTKB
4,UNIPROTKB Q9H0D6,,,,UNIPROTKB


In [31]:
#Counter(nodes1['sab'])

'UNIPROTKB': 150,
'ENCODE_RBS_150_NO_OVERLAP': 457979,
'ENCODE_RBS_HepG2_K562': 446999,
'ENCODE_RBS_K562': 101464,
'ENCODE_RBS_HepG2': 142456

SyntaxError: illegal target for annotation (1633364398.py, line 3)

In [130]:
nodes_usecase2a_final.to_csv(ERCC_path+'USECASE_2a_CLEANED/OWLNETS_node_metadata.txt',index=False,sep='\t')

# check eqtl overlap and check tissue overlap and check HGNC HGNC:1234 format

In [131]:
# edges
usecase2a_edges = pd.read_csv(ERCC_path+
                                     'useCase2a/ERCC_UseCase2a_edges.tsv',sep='\t')

usecase2a_edges_final = usecase2a_edges.rename(columns={'subject_id':'subject',
                                                               'relationship':'predicate',
                                                               'object_id':'object'})

usecase2a_edges_final

Unnamed: 0,subject,predicate,object
0,CLINGEN_ALLELE_REGISTRY CA7053557,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr13_111217690_T_C_b38_Esophagus_Muscularis
1,http://purl.obolibrary.org/obo/UBERON_0004648,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr13_111217690_T_C_b38_Esophagus_Muscularis
2,GTEX_EQTL eQTL_chr13_111217690_T_C_b38_Esophagus_Muscularis,http://purl.obolibrary.org/obo/RO_0002213,HGNC HGNC:15607
3,CLINGEN_ALLELE_REGISTRY CA14743376,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr19_16253903_G_A_b38_Artery_Aorta
4,http://purl.obolibrary.org/obo/UBERON_0001496,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr19_16253903_G_A_b38_Artery_Aorta
...,...,...,...
3024914,http://purl.obolibrary.org/obo/UBERON_0001496,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr15_89839388_C_T_b38_Artery_Aorta
3024915,GTEX_EQTL eQTL_chr15_89839388_C_T_b38_Artery_Aorta,http://purl.obolibrary.org/obo/RO_0002212,HGNC HGNC:571
3024916,CLINGEN_ALLELE_REGISTRY CA13206740,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr10_122560474_C_T_b38_Colon_Transverse
3024917,http://purl.obolibrary.org/obo/UBERON_0001157,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr10_122560474_C_T_b38_Colon_Transverse


In [132]:
usecase2a_edges_final.to_csv(ERCC_path+'USECASE_2a_CLEANED/OWLNETS_edgelist.txt',index=False,sep='\t')

In [36]:
usecase2a_edges_final = pd.read_csv(ERCC_path+'USECASE_2a_CLEANED/OWLNETS_edgelist.txt',sep='\t')
usecase2a_edges_final

Unnamed: 0,subject,predicate,object
0,CLINGEN_ALLELE_REGISTRY CA7053557,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr13_111217690_T_C_b38_Esophagus_Muscularis
1,http://purl.obolibrary.org/obo/UBERON_0004648,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr13_111217690_T_C_b38_Esophagus_Muscularis
2,GTEX_EQTL eQTL_chr13_111217690_T_C_b38_Esophagus_Muscularis,http://purl.obolibrary.org/obo/RO_0002213,HGNC HGNC:15607
3,CLINGEN_ALLELE_REGISTRY CA14743376,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr19_16253903_G_A_b38_Artery_Aorta
4,http://purl.obolibrary.org/obo/UBERON_0001496,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr19_16253903_G_A_b38_Artery_Aorta
...,...,...,...
3024914,http://purl.obolibrary.org/obo/UBERON_0001496,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr15_89839388_C_T_b38_Artery_Aorta
3024915,GTEX_EQTL eQTL_chr15_89839388_C_T_b38_Artery_Aorta,http://purl.obolibrary.org/obo/RO_0002212,HGNC HGNC:571
3024916,CLINGEN_ALLELE_REGISTRY CA13206740,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr10_122560474_C_T_b38_Colon_Transverse
3024917,http://purl.obolibrary.org/obo/UBERON_0001157,http://purl.obolibrary.org/obo/BFO_0000050,GTEX_EQTL eQTL_chr10_122560474_C_T_b38_Colon_Transverse


# Look at usecase 2b

In [11]:
# nodes

# load in and concatenate all sub-files for nodes and edges
# https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
usecase2b_node_paths = glob(ERCC_path+'useCase2b/ERCC_UseCase2b/*nodes*')
usecase2b_edge_paths = glob(ERCC_path+'useCase2b/ERCC_UseCase2b/*edges*')

def f(i):
    return pd.read_csv(i, sep='\t')

usecase2b_nodes = pd.concat(map(f, usecase2b_node_paths))  # ~3mil

In [121]:
sab_splits = [i[0] for i in usecase2b_nodes['node_id'].str.split(' ')]
usecase2b_nodes['sab'] = sab_splits
usecase2b_nodes.head()
Counter(sab_splits)

Counter({'SCREEN': 3258606,
         'HGNC': 7998,
         'http://www.ebi.ac.uk/efo/EFO_0001086': 1,
         'http://www.ebi.ac.uk/efo/EFO_0002106': 1,
         'http://purl.obolibrary.org/obo/CLO_0034837': 1,
         'http://www.ebi.ac.uk/efo/EFO_0006270': 1,
         'http://purl.obolibrary.org/obo/CLO_0021752': 1,
         'http://purl.obolibrary.org/obo/CLO_0021754': 1,
         'http://purl.obolibrary.org/obo/CLO_0021511': 1,
         'http://www.ebi.ac.uk/efo/EFO_0005725': 1,
         'http://www.ebi.ac.uk/efo/EFO_0002779': 1,
         'http://purl.obolibrary.org/obo/CL_0000236': 1,
         'http://purl.obolibrary.org/obo/CL_0001054': 1,
         'http://purl.obolibrary.org/obo/CL_0000624': 1,
         'http://purl.obolibrary.org/obo/CL_0000897': 1,
         'http://purl.obolibrary.org/obo/CL_0000895': 1,
         'http://purl.obolibrary.org/obo/CL_0000625': 1,
         'http://www.ebi.ac.uk/efo/EFO_0001099': 1,
         'http://www.ebi.ac.uk/efo/EFO_0007074': 1,
         'h

In [14]:
usecase2b_nodes = fill_missing_cols(usecase2b_nodes)

In [16]:
# edges
usecase2b_edges = pd.concat(map(f,usecase2b_edge_paths))   # ~9mil

usecase2b_edges = usecase2b_edges.rename(columns={'subject_id':'subject',
                               'relationship':'predicate',
                               'object_id':'object'})

### Replace `_` with `.` in SCREEN nodes and edges files

In [49]:
#usecase2b_nodes['node_id'] = [i.replace('_','.') if 'SCREEN' == i[:6]  else i for i in usecase2b_nodes['node_id']]

usecase2b_edges['subject'] = [i.replace('_','.') if 'SCREEN' == i[:6]  else i for i in usecase2b_edges['subject']]
usecase2b_edges['object'] = [i.replace('_','.') if 'SCREEN' == i[:6]  else i for i in usecase2b_edges['object']]

In [50]:
usecase2b_nodes.to_csv(ERCC_path+'USECASE_2B_CLEANED/OWLNETS_node_metadata.txt',index=False,sep='\t')

In [51]:
usecase2b_edges.to_csv(ERCC_path+'USECASE_2B_CLEANED/OWLNETS_edgelist.txt',index=False,sep='\t')

In [44]:
[i for i in usecase2b_nodes['node_id'] if '/' in i]

['http://www.ebi.ac.uk/efo/EFO_0001086',
 'http://www.ebi.ac.uk/efo/EFO_0002106',
 'http://purl.obolibrary.org/obo/CLO_0034837',
 'http://www.ebi.ac.uk/efo/EFO_0006270',
 'http://purl.obolibrary.org/obo/CLO_0021752',
 'http://purl.obolibrary.org/obo/CLO_0021754',
 'http://purl.obolibrary.org/obo/CLO_0021511',
 'http://www.ebi.ac.uk/efo/EFO_0005725',
 'http://www.ebi.ac.uk/efo/EFO_0002779',
 'http://purl.obolibrary.org/obo/CL_0000236',
 'http://purl.obolibrary.org/obo/CL_0001054',
 'http://purl.obolibrary.org/obo/CL_0000624',
 'http://purl.obolibrary.org/obo/CL_0000897',
 'http://purl.obolibrary.org/obo/CL_0000895',
 'http://purl.obolibrary.org/obo/CL_0000625',
 'http://www.ebi.ac.uk/efo/EFO_0001099',
 'http://www.ebi.ac.uk/efo/EFO_0007074',
 'http://www.ebi.ac.uk/efo/EFO_0002783',
 'http://www.ebi.ac.uk/efo/EFO_0005337',
 'http://www.ebi.ac.uk/efo/EFO_0005338',
 'http://www.ebi.ac.uk/efo/EFO_0002784',
 'http://www.ebi.ac.uk/efo/EFO_0005723',
 'http://www.ebi.ac.uk/efo/EFO_0007950',
 'h

In [41]:
usecase2b_nodes = pd.read_csv(ERCC_path+'USECASE_2B_CLEANED/OWLNETS_node_metadata.txt',sep='\t')
usecase2b_nodes

  usecase2b_nodes = pd.read_csv(ERCC_path+'USECASE_2B_CLEANED/OWLNETS_node_metadata.txt',sep='\t')


Unnamed: 0,node_id,node_label,node_definition,node_synonyms,node_namespace,node_dbxrefs,upperbound,lowerbound,unit,value
0,SCREEN EH38E2865685,EH38E2865685,cCRE,,,,,,,
1,SCREEN EH38E2865685_EFO0009318,EH38E2865685_HFFc6,cCRE within specific biosample type,,,,,,,
2,SCREEN EH38E2865686,EH38E2865686,cCRE,,,,,,,
3,SCREEN EH38E2865686_EFO0001196,EH38E2865686_IMR-90,cCRE within specific biosample type,,,,,,,
4,SCREEN EH38E2865688,EH38E2865688,cCRE,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
3266728,SCREEN EH38E2769434_UBERON0000059,EH38E2769434_large_intestine,cCRE within specific biosample type,,,,,,,
3266729,SCREEN EH38E2769434_EFO0007096,EH38E2769434_iPS_DF_19.11,cCRE within specific biosample type,,,,,,,
3266730,SCREEN EH38E2769434_EFO0005723,EH38E2769434_GM23248,cCRE within specific biosample type,,,,,,,
3266731,SCREEN EH38E2769434_EFO0002074,EH38E2769434_PC-3,cCRE within specific biosample type,,,,,,,


In [35]:
usecase2b_nodes[usecase2b_nodes['node_id'].str.contains('UBERON0000059')]

Unnamed: 0,node_id,node_label,node_definition,node_synonyms,node_namespace,node_dbxrefs,upperbound,lowerbound,unit,value
53,SCREEN EH38E2866116_UBERON0000059,EH38E2866116_large_intestine,cCRE within specific biosample type,,,,,,,
69,SCREEN EH38E2866117_UBERON0000059,EH38E2866117_large_intestine,cCRE within specific biosample type,,,,,,,
90,SCREEN EH38E2866143_UBERON0000059,EH38E2866143_large_intestine,cCRE within specific biosample type,,,,,,,
135,SCREEN EH38E2866174_UBERON0000059,EH38E2866174_large_intestine,cCRE within specific biosample type,,,,,,,
175,SCREEN EH38E2866175_UBERON0000059,EH38E2866175_large_intestine,cCRE within specific biosample type,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
3266420,SCREEN EH38E2768352_UBERON0000059,EH38E2768352_large_intestine,cCRE within specific biosample type,,,,,,,
3266499,SCREEN EH38E2768526_UBERON0000059,EH38E2768526_large_intestine,cCRE within specific biosample type,,,,,,,
3266508,SCREEN EH38E2768529_UBERON0000059,EH38E2768529_large_intestine,cCRE within specific biosample type,,,,,,,
3266623,SCREEN EH38E2769187_UBERON0000059,EH38E2769187_large_intestine,cCRE within specific biosample type,,,,,,,


In [48]:
usecase2b_edges = pd.read_csv(ERCC_path+'USECASE_2B_CLEANED/OWLNETS_edgelist.txt',sep='\t')
usecase2b_edges.head(30)

Unnamed: 0,subject,predicate,object
0,SCREEN EH38E3846211,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846211_EFO0002106
1,http://www.ebi.ac.uk/efo/EFO_0002106,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846211_EFO0002106
2,SCREEN EH38E3846211_EFO0002106,http://purl.obolibrary.org/obo/RO_0002211,HGNC HGNC:4221
3,SCREEN EH38E3846211,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846211_CL1001608
4,http://purl.obolibrary.org/obo/CL_1001608,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846211_CL1001608
5,SCREEN EH38E3846211_CL1001608,http://purl.obolibrary.org/obo/RO_0002211,HGNC HGNC:4221
6,SCREEN EH38E3846212,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846212_EFO0002106
7,http://www.ebi.ac.uk/efo/EFO_0002106,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846212_EFO0002106
8,SCREEN EH38E3846212_EFO0002106,http://purl.obolibrary.org/obo/RO_0002211,HGNC HGNC:4221
9,SCREEN EH38E3846212,http://purl.obolibrary.org/obo/BFO_0000050,SCREEN EH38E3846212_CL1001608


In [38]:
codes = pd.read_csv('/Users/stearb/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-7fcafad9-8b40-4086-b2b1-143ea67fc66c/import/CODEs.csv')
codes

  codes = pd.read_csv('/Users/stearb/Library/Application Support/Neo4j Desktop/Application/relate-data/dbmss/dbms-7fcafad9-8b40-4086-b2b1-143ea67fc66c/import/CODEs.csv')


Unnamed: 0,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
0,MSH D015056,MSH,D015056,,,,
1,SNOMEDCT_US 285407008,SNOMEDCT_US,285407008,,,,
2,LNC LP17185-7,LNC,LP17185-7,,,,
3,CHV 0000000504,CHV,0000000504,,,,
4,GO GO:0004113,GO,GO:0004113,,,,
...,...,...,...,...,...,...,...
19304313,SCREEN EH38E2769434.UBERON0000059,SCREEN,EH38E2769434.UBERON0000059,,,,
19304314,SCREEN EH38E2769434.EFO0007096,SCREEN,EH38E2769434.EFO0007096,,,,
19304315,SCREEN EH38E2769434.EFO0005723,SCREEN,EH38E2769434.EFO0005723,,,,
19304316,SCREEN EH38E2769434.EFO0002074,SCREEN,EH38E2769434.EFO0002074,,,,


In [40]:
codes[codes['SAB']=='SCREEN']

Unnamed: 0,CodeID:ID,SAB,CODE,value:float,lowerbound:float,upperbound:float,unit
11088072,SCREEN EH38E2865685,SCREEN,EH38E2865685,,,,
11088073,SCREEN EH38E2865685_EFO0009318,SCREEN,EH38E2865685_EFO0009318,,,,
11088074,SCREEN EH38E2865686,SCREEN,EH38E2865686,,,,
11088075,SCREEN EH38E2865686_EFO0001196,SCREEN,EH38E2865686_EFO0001196,,,,
11088076,SCREEN EH38E2865688,SCREEN,EH38E2865688,,,,
...,...,...,...,...,...,...,...
19304313,SCREEN EH38E2769434.UBERON0000059,SCREEN,EH38E2769434.UBERON0000059,,,,
19304314,SCREEN EH38E2769434.EFO0007096,SCREEN,EH38E2769434.EFO0007096,,,,
19304315,SCREEN EH38E2769434.EFO0005723,SCREEN,EH38E2769434.EFO0005723,,,,
19304316,SCREEN EH38E2769434.EFO0002074,SCREEN,EH38E2769434.EFO0002074,,,,
