# Aggregate data files by node or relationship type
Load in the files from each step and combine them based on node or relationship type. This way we don't need to write a Cypher import query for the CUIs of every step; we can just load all CUIs with one query.  
For example, combine all the CUIs.csv files into one CUI_MASTER file.  
Then, when everything looks good, we can add these master files directly to the UMLS-KG csv files

### Our data that we are incorporating into UMLS connects to the HGNC, HPO, UBERON ontologies/vocabs only.

In [1]:
import pandas as pd
import numpy as np
import os
#from matplotlib_venn import venn2, venn3
#import matplotlib.pyplot as plt
from collections import Counter 
from umls_utils import get_paths
import time

In [59]:
#!python3 -m pip install "dask[complete]"  

In [20]:
#import dask
# Display the UMLS directory configured for this run.
# NOTE(review): umls_dir is only assigned by the get_paths(...) cell further down, so this
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
umls_dir

'/Users/stearb/Desktop/hubmap-kg/FREEZE/'

In [19]:
# Export this notebook to a plain .py script, then strip unwanted lines from the generated file.
# NOTE: `sed -i ''` is the BSD/macOS spelling of in-place editing.
!jupyter nbconvert --to script aggregate_files-Copy1.ipynb
# Delete every line matching `.head(`.
!sed -i '' '/.head(/d' aggregate_files-Copy1.py
# Delete every line that starts with `#`.
!sed -i '' '/^#/d' aggregate_files-Copy1.py
# Delete every line containing `get_ipython()`.
!sed -i '' '/get_ipython()/d' aggregate_files-Copy1.py
#!sed -i '' '/print/d' aggregate_files-Copy1.py

[NbConvertApp] Converting notebook aggregate_files-Copy1.ipynb to script
[NbConvertApp] Writing 32099 bytes to aggregate_files-Copy1.py


In [2]:
# Resolve every directory used by this notebook from the shared build configuration.
# NOTE(review): the build_scripts location is an absolute, machine-specific path; point it at
# your own checkout before running.
data_dir,helper_data_dir,output_dir,LOCAL_CPU,umls_dir, umls_out_dir = get_paths('/Users/stearb/Dropbox/CHOP/R03/code/neo4j_build_CFDIKG/build_scripts/')

# We will import our data from the folder where we saved the CSVs from each step.
import_dir = output_dir

# Folder for the merged UMLS + project CSVs. The trailing '/' is kept because
# file paths are built from it by plain string concatenation.
new_UMLS_CSVs_path = output_dir+'new_UMLS_CSVs'+'/'

# makedirs(exist_ok=True) also creates missing parent folders and does not fail if the
# directory appears between the check and the creation.
if not os.path.isdir(new_UMLS_CSVs_path):
    print('Creating new_UMLS_CSVs directory...')
    os.makedirs(new_UMLS_CSVs_path, exist_ok=True)


# Aggregate the file types across each of the different steps
Put all CUIs.csv into one CUIs.csv file  
Put all CUI-CODEs.csv into one CUI-CODEs.csv file  
Etc.



## Combine and Load CUI files

In [6]:
# Build the master list of Concept (CUI) node IDs: load the CUI table written by every
# build step, stack them and drop repeated IDs.
t0 = time.time()

# Load mouse ortholog CUIs
CUI_ortho = pd.read_pickle(output_dir+'orthologs/CUI_mouse_ortho.pickle')

# Load mouse phenotype CUIs
CUI_genopheno = pd.read_pickle(output_dir+'genopheno/CUIs_genotype.pickle')

# Load mammalian phenotype ontology CUIs
CUI_mp = pd.read_pickle(output_dir+'MPO/CUIs_mp_ont.pickle')

# Load phenomapping CUIs
CUI_phenomap = pd.read_csv(output_dir+'hpo_mp_mapping/CUIs_phenomapping.csv')

#  GTEx CUIs (eQTLs and median gene expression)
CUI_gtex = pd.read_pickle(output_dir+'GTEx/CUIs_GTEx.pickle')

# dbSNP CUIs
#CUI_dbsnp = pd.read_csv(LOCAL_PATH+'dbsnp/CUIs_dbsnp.csv')

# KF phenotype
# The folder is spelled 'kf_phenotypes' by every other cell that reads KF files; using the
# same spelling here keeps the notebook working on case-sensitive file systems.
CUI_kf = pd.read_csv(output_dir+'kf_phenotypes/CUIs_kf.csv')

# scHeart
CUI_scHeart = pd.read_csv(output_dir+'scHeart/CUIs_scHeart.csv')

# Hubmap
CUI_hubmap = pd.read_pickle(output_dir+'HUBMAPsc/hubmap_CUIs.pickle')


# np.concatenate stacks the tables by position (column names are ignored), so every input
# must hold exactly one column of CUI values.  (CUI_dbsnp is currently left out.)
CUIs_all = pd.DataFrame(np.concatenate([CUI_ortho.values,CUI_genopheno,CUI_mp,
                                        CUI_phenomap,CUI_gtex,CUI_kf,CUI_scHeart,CUI_hubmap]),
                        columns=['CUI:ID']).drop_duplicates()


#CUIs_all = pd.Series([i[0] for i in CUIs_all.values],name='CUI:ID')
# Check that all CUIs are in the same format (same length)
#assert np.all(CUIs_all['CUI:ID'].str.len() == 16) CUIs will be different lengths using the base64 method

# Check for no collisions: after de-duplication every remaining row must carry a distinct ID
# (nunique() ignores NaN, so this also fails if a missing ID slipped in).
assert CUIs_all.nunique()['CUI:ID'] == len(CUIs_all)

end = time.time() - t0
print(f'All CUI files took {np.floor(end/60)} min. {np.round(end%60)} seconds.')

All CUI files took 0.0 min. 38.0 seconds.


## Load and combine all CUI-CUI files, need ':TYPE',  'SAB' columns to match UMLS CUI-CUI import files
Remember that the relationship :TYPE is just the Code-Code relationship type, expressed in the Concept (CUI) space.  
So our relationships will be:  

#### CUI-CUI relationship :TYPEs by file
orthologs/CUI-CUI_ortho.csv  -----> :ortholog

genopheno/CUI-CUI_genotype.csv ----> :has_phenotype

MPO/CUI-CUIs_mp_ont.csv ------> :SCO     #### FIX

hpo_mp_mapping/CUI-CUI_phenomapping.csv   ------>  :has_human_phenotype,:has_mouse_phenotype,

GTEx/CUI-CUI_GTEx.csv 
-----> GTEX EXP :has_median_expression_in_gene,  :has_median_expression_in_tissue  
-----> GTEX EQTL:has_eqtl :in_gene, :in_tissue, :in_variant

#### CUI-CUI SAB by file

In [None]:
# Scratch note (kept as a comment because the bare fragment is not valid Python):
# the CUI-CUI sources that only exist as CSV files are
#   'hpo_mp_mapping/CUI_CUI_phenomapping.csv', 'kf_phenotypes/CUI_CUIs_kf.csv',
#   'scHeart/CUI_CUIs_scHeart.csv'

In [3]:
# Locations (relative to output_dir) of the CUI-CUI relationship file written by each build
# step. Most steps saved a pickle; the last three entries are CSV files.
cui_cui_paths = ['orthologs/CUI_CUI_ortho.pickle','genopheno/CUI_CUI_genotype.pickle',
                 'MPO/CUI_CUIs_mp_ont.pickle',
                 'GTEx/CUI_CUI_GTEx.pickle','HGNC_HPO/CUI_CUIs_hgnc_hpo.pickle',
                 'HUBMAPsc/hubmap_CUI_CUIs.pickle',
                'hpo_mp_mapping/CUI_CUI_phenomapping.csv','kf_phenotypes/CUI_CUIs_kf.csv',
                'scHeart/CUI_CUIs_scHeart.csv']

# Same entries with output_dir prepended.
# NOTE: despite its name this list still contains the three .csv entries.
cui_cui_paths_pickle = [output_dir+i for i in cui_cui_paths]

# CSV counterpart of every entry. Only the file extension is swapped, so a directory whose
# name happens to contain 'pickle' is left untouched (str.replace would rewrite it too).
cui_cui_paths_csv = [os.path.splitext(i)[0]+'.csv' for i in cui_cui_paths_pickle]

In [5]:
# Build the master CUI-CUI relationship table: load the table written by every build step,
# stack them by position and drop repeated rows. Each phase is timed separately.
t0 = time.time()

# Load mouse ortholog CUIs
CUI_CUI_ortho = pd.read_pickle(output_dir+'orthologs/CUI_CUI_ortho.pickle')

# Load mouse phenotype CUIs
CUI_CUI_genopheno = pd.read_pickle(output_dir+'genopheno/CUI_CUI_genotype.pickle')

# Load mammalian phenotype ontology CUIs
CUI_CUI_mp = pd.read_pickle(output_dir+'MPO/CUI_CUIs_mp_ont.pickle') # All :TYPEs are 'SCO' still...

# Load phenomapping CUIs
CUI_CUI_phenomap = pd.read_csv(output_dir+'hpo_mp_mapping/CUI_CUI_phenomapping.csv')

#  GTEx CUIs (eQTLs and med. expression)
CUI_CUI_gtex = pd.read_pickle(output_dir+'GTEx/CUI_CUI_GTEx.pickle')

# dbSNP CUI-CUIs, this is the actual dbSNP data, we have dbSNP rs IDs from GTEx already
#CUI_CUI_dbsnp = pd.read_csv(LOCAL_PATH+'dbsnp/CUI_CUIs_dbsnp.csv')

# KF phenotypes
CUI_CUI_kf = pd.read_csv(output_dir+'kf_phenotypes/CUI_CUIs_kf.csv')

# scHeart
CUI_CUI_scHeart = pd.read_csv(output_dir+'scHeart/CUI_CUIs_scHeart.csv')

# HGNC-HPO
CUI_CUI_hgnc_hpo = pd.read_pickle(output_dir+'HGNC_HPO/CUI_CUIs_hgnc_hpo.pickle')

# Hubmap
CUI_CUI_hubmap = pd.read_pickle(output_dir+'HUBMAPsc/hubmap_CUI_CUIs.pickle')

end = time.time() - t0
print(f'Reading CUI-CUI files took {np.floor(end/60)} min. {np.round(end%60)} seconds.')

# Restart the clock: the two timings below cover the stacking and de-duplication only.
t0 = time.time()

# np.concatenate stacks by position and ignores column names, so every table must hold the
# same four columns in the order (:START_ID, :END_ID, :TYPE, SAB).
np_concat = np.concatenate([CUI_CUI_ortho.values,
                            CUI_CUI_genopheno,
                            CUI_CUI_mp,
                            CUI_CUI_phenomap,
                            CUI_CUI_gtex,
                            CUI_CUI_kf,
                            CUI_CUI_scHeart,
                            CUI_CUI_hgnc_hpo, # CUI_CUI_dbsnp
                            CUI_CUI_hubmap
                            ])
end = time.time() - t0
print(f'Concat CUI-CUI files took {np.floor(end/60)} min. {np.round(end%60)} seconds.')


CUI_CUIs_all = pd.DataFrame(np_concat,columns=[':START_ID',':END_ID',':TYPE','SAB']).drop_duplicates()

# Measured from the second t0, i.e. stacking + DataFrame construction + drop_duplicates.
end = time.time() - t0
print(f'All CUI-CUI files took {np.floor(end/60)} min. {np.round(end%60)} seconds.')

Reading CUI-CUI files took 0.0 min. 39.0 seconds.
Concat CUI-CUI files took 0.0 min. 31.0 seconds.
All CUI-CUI files took 2.0 min. 33.0 seconds.


In [12]:
%%time
# Load every CUI-CUI file with the reader that matches its extension and stack the frames.
# The path list mixes .pickle and .csv entries, and read_pickle cannot parse a CSV file.
# NOTE(review): pd.concat aligns on column names, so this assumes all sources share the same header.
Output = pd.concat([pd.read_csv(x) if x.endswith('.csv') else pd.read_pickle(x)
                    for x in cui_cui_paths_pickle])
type(Output)

CPU times: user 15.5 s, sys: 11.6 s, total: 27 s
Wall time: 32.3 s


In [21]:
!head /Users/stearb/Desktop/R03_local/data/use_config/OUTPUT_FILES/test_concat.csv

B  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  jB  j B  j!B  j"B  j#B  j$B  j%B  j&B  j'B  j(B  j)B  j*B  j+B  j,B  j-B  j.B  j/B  j0B  j1B  j2B  j3B  j4B  j5B  j6B  j7B  j8B  j9B  j:B  j;B  j<B  j=B  j>B  j?B  j@B  jAB  jBB  jCB  jDB  jEB  jFB  jGB  jHB  jIB  jJB  jKB  jLB  jMB  jNB  jOB  jPB  jQB  jRB  jSB  jTB  jUB  jVB  jWB  jXB  jYB  jZB  j[B  j\B  j]B  j^B  j_B  j`B  jaB  jbB  jcB  jdB  jeB  jfB  jgB  jhB  jiB  jjB  jkB  jlB  jmB  jnB  joB  jpB  jqB  jrB  jsB  jtB  juB  jvB  jwB  jxB  jyB  jzB  j{B  j|B  j}B  j~B  jB  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�B  j�

In [27]:
#f = pd.read_pickle('/Users/stearb/Desktop/R03_local/data/use_config/OUTPUT_FILES/HUBMAPsc/hubmap_CUI_CUIs.pickle')
#f.to_csv('/Users/stearb/Desktop/R03_local/data/use_config/OUTPUT_FILES/HUBMAPsc/hubmap_CUI_CUIs.csv',index=False)
#pd.read_csv('/Users/stearb/Desktop/R03_local/data/use_config/OUTPUT_FILES/HUBMAPsc/hubmap_CUI_CUIs.csv')

In [None]:
%%time
# pickle version
# A pickle is an opaque binary stream with no header line, so pickles cannot be merged by
# appending their bytes. Each source is loaded into a DataFrame (with the reader matching its
# extension, using the output_dir-prefixed paths) and the combined frame is pickled once.
# NOTE: read_pickle can execute code embedded in the file - only load files written by this pipeline.
# NOTE(review): pd.concat aligns on column names; assumes all sources share the same header.
pd.concat(
    [pd.read_csv(path) if path.endswith('.csv') else pd.read_pickle(path)
     for path in cui_cui_paths_pickle],
    ignore_index=True,
).to_pickle(output_dir+'test_concat.pickle')

In [4]:
%%time
# csv version
# Merge the CSV files at byte level: the first file is copied whole (header included), every
# later file has its header line dropped before its rows are appended.
import shutil

with open(output_dir+'test_concat.csv',"wb") as fout:
    for position, FILE in enumerate(cui_cui_paths_csv):
        with open(FILE, "rb") as f:
            if position > 0:
                f.readline() # skip the header; returns b'' instead of raising on an empty file
            # Chunked copy: never holds a whole (multi-GB) input file in memory.
            shutil.copyfileobj(f, fout)

CPU times: user 4.23 s, sys: 34 s, total: 38.2 s
Wall time: 1min 7s


In [20]:
# Read the byte-level concatenation back to check that it still parses as one table.
p = pd.read_csv(output_dir+'test_concat.csv')
# Display the parsed frame for a visual sanity check.
p

In [21]:
p

Unnamed: 0,:START_ID,:END_ID,:TYPE,SAB
0,C0694879,SENPUCBIQ09QOkV4dDI=,has_mouse_ortholog,HGNC__HGNC_HCOP
1,C1332096,SENPUCBIQ09QOkFueGE4,has_mouse_ortholog,HGNC__HGNC_HCOP
2,C1332123,SENPUCBIQ09QOkF2cHIxYg==,has_mouse_ortholog,HGNC__HGNC_HCOP
3,C1332682,SENPUCBIQ09QOkNjbDE5,has_mouse_ortholog,HGNC__HGNC_HCOP
4,C1332682,SENPUCBIQ09QOkdtMjU2NA==,has_mouse_ortholog,HGNC__HGNC_HCOP
...,...,...,...,...
16739089,YXV0aG9yX2RlZmluZWRfY2x1c3RlcjpGaWJyb2JsYXN0LW...,c2NIZWFydCBQTUlEOiAzMTgzNTAzNyBGaWJyb2JsYXN0LW...,has_single_cell_expression,scHeart__cellType
16739090,YXV0aG9yX2RlZmluZWRfY2x1c3RlcjpGaWJyb2JsYXN0LW...,c2NIZWFydCBQTUlEOiAzMTgzNTAzNyBGaWJyb2JsYXN0LW...,has_single_cell_expression,scHeart__cellType
16739091,YXV0aG9yX2RlZmluZWRfY2x1c3RlcjpGaWJyb2JsYXN0LW...,c2NIZWFydCBQTUlEOiAzMTgzNTAzNyBGaWJyb2JsYXN0LW...,has_single_cell_expression,scHeart__cellType
16739092,YXV0aG9yX2RlZmluZWRfY2x1c3RlcjpGaWJyb2JsYXN0LW...,c2NIZWFydCBQTUlEOiAzMTgzNTAzNyBGaWJyb2JsYXN0LW...,has_single_cell_expression,scHeart__cellType


In [28]:
%%time
# Timing experiment, step 1 of 2: stack all CUI-CUI tables by position into one NumPy array.
# Only the first entry is passed as `.values`; the other DataFrames are converted by NumPy.
np_concat = np.concatenate([CUI_CUI_ortho.values,
                                    CUI_CUI_genopheno,
                                    CUI_CUI_mp,
                                    CUI_CUI_phenomap,
                                    CUI_CUI_gtex, 
                                    CUI_CUI_kf, 
                                    CUI_CUI_scHeart,
                                    CUI_CUI_hgnc_hpo, # CUI_CUI_dbsnp
                                    CUI_CUI_hubmap
                                    ])

CPU times: user 2.91 s, sys: 1.11 s, total: 4.02 s
Wall time: 4.1 s


In [29]:
%%time
# Timing experiment, step 2 of 2: wrap the stacked array in a DataFrame and drop repeated rows.
df_np_concat = pd.DataFrame(np_concat,columns=[':START_ID',':END_ID',':TYPE','SAB']).drop_duplicates()

CPU times: user 4.7 s, sys: 1.44 s, total: 6.14 s
Wall time: 6.62 s


In [30]:
%%time

# Timing experiment: the same stack + DataFrame + drop_duplicates as the two previous cells,
# written as one expression, to compare wall time against the two-step version.
CUI_CUIs_all = pd.DataFrame(np.concatenate([CUI_CUI_ortho.values,
                                            CUI_CUI_genopheno,
                                            CUI_CUI_mp,
                                            CUI_CUI_phenomap,
                                            CUI_CUI_gtex, 
                                            CUI_CUI_kf, 
                                            CUI_CUI_scHeart,
                                            CUI_CUI_hgnc_hpo, # CUI_CUI_dbsnp
                                            CUI_CUI_hubmap
                                            ]),
                                                columns=[':START_ID',':END_ID',':TYPE','SAB']).drop_duplicates()

CPU times: user 1min 23s, sys: 13 s, total: 1min 36s
Wall time: 1min 39s


### Why HGNC HCOP and HGNC HCOP//MP ??

In [27]:
# Count how many CUI-CUI relationship rows carry each SAB value.
Counter(CUI_CUIs_all['SAB']) # dont change inverse 

Counter({'HGNC__HGNC_HCOP': 132312,
         'IMPC': 510680,
         'MP': 33294,
         'HPO__MP': 2428,
         'GTEX_EXP__HGNC': 3947664,
         'GTEX_EXP__UBERON': 3947664,
         'GTEX_EQTL__HGNC': 1762790,
         'GTEX_EQTL__UBERON': 2321876,
         'GTEX_EQTL__DBSNP_151': 2321876,
         'KF_HPO': 27388,
         'scHeart__HGNC': 5632,
         'scHeart__cellType': 5632,
         'HGNC__HPO': 1717940,
         'HUBMAP__HGNC': 21863490,
         'HUBMAP__CLUSTER': 21863490,
         'HUBMAP_DATASET__TISSUE': 934,
         'HUBMAP_DONOR__HUBMAP_DATSET': 126,
         'HUBMAP_CLUSTER__HUBMAP_DATASET': 1792,
         'HUBMAP_DATASET__SEX': 126,
         'HUBMAP_DATASET__RACE': 130,
         'HUBMAP_DATASET__BMI': 126,
         'HUBMAP_DATASET__BLOOD_TYPE': 82,
         'HUBMAP_DATASET__CAUSE_OF_DEATH': 108,
         'HUBMAP_DATASET__MECHANISM_OF_INJURY': 100,
         'HUBMAP_DATASET__DEATH_EVENT': 100,
         'HUBMAP_DATASET__MEDICAL_HISTORY': 128,
         'HUBMAP_

________

## Load and combine all CUI-CODEs files (it's actually CUI-CodeID)
Cols = 'CUI', 'CODE'

In [5]:
# Build the master CUI -> CodeID relationship table from the table written by every build step.
# TODO: change the names of these to CUI_CODEs (the kf and scHeart variables are spelled CUI_CODES_*).
t0 = time.time()

#Load mouse ortholog CODEs
CUI_CODEs_ortho = pd.read_pickle(output_dir+'orthologs/CUI_CODE_ortho.pickle')

# Load mouse phenotype CUI-CODEs
CUI_CODEs_genopheno = pd.read_pickle(output_dir+'genopheno/CUI_CODE_genotype.pickle')

# Load mammalian phenotype ontology CUI-CODEs
CUI_CODEs_mp = pd.read_pickle(output_dir+'MPO/CUI_CODEs_mp_ont.pickle')  

# Load phenomapping CUI-CODEs 
CUI_CODEs_phenomap = pd.read_csv(output_dir+'hpo_mp_mapping/CUI_CODEs_phenomapping.csv')

#  GTEx CUI CODEs (eQTLs and median gene expression)  
CUI_CODEs_gtex = pd.read_pickle(output_dir+'GTEx/CUI_CODEs_GTEx.pickle')

# dbSNP CUI-CODEs
#CUI_CODEs_dbsnp = pd.read_csv(LOCAL_PATH+'dbsnp/CUI_CODEs_dbsnp.csv')

# hgncAnnos CUI-CODEs, (HGNC CUIs-GL_CODEs)
CUI_CODEs_hgncAnnos =  pd.read_csv(output_dir+'hgnc_annos/CUI_CODEs_hgncAnno.csv')

# kf phenotypes CUI-CODES
CUI_CODES_kf = pd.read_csv(output_dir+'kf_phenotypes/CUI_CODEs_kf.csv')

# scHeart CUI-CODES
CUI_CODES_scHeart = pd.read_csv(output_dir+'scHeart/CUI_CODEs_scHeart.csv')

# Hubmap 
CUI_CODEs_hubmap = pd.read_pickle(output_dir+'HUBMAPsc/hubmap_CUIs_CODEs.pickle')


# The only CUI-CODEs file that doesn't have the same number of unique values in each column (i.e. a 1-to-1 mapping)
# is the CODEs_phenomap file, because it contains HPO CUI-Code mappings from UMLS which are not always 1-to-1, meaning
# some HPO Concept nodes map to more than one HPO Code node.

# Stack the tables by position (column names are ignored by np.concatenate), label the two
# columns as :START_ID (CUI) and :END_ID (CodeID), and drop repeated rows.
CUI_CODEs_all = pd.DataFrame(np.concatenate([CUI_CODEs_ortho.values, 
                                         CUI_CODEs_mp.values, 
                                         CUI_CODEs_gtex.values,
                                         CUI_CODEs_genopheno.values,
                                         CUI_CODEs_phenomap.values, 
                                         CUI_CODEs_hgncAnnos.values,
                                         CUI_CODES_kf.values,
                                         CUI_CODES_scHeart.values,  #   CUI_CODEs_dbsnp.values
                                         CUI_CODEs_hubmap.values
                                       ]), columns=[':START_ID',':END_ID']).drop_duplicates()

print('CUI-Codes took',(time.time()-t0)/60,'minutes')

CUI-Codes took 0.8091420491536458 minutes


## Combine all CODEs into one file.
cols: CodeID, SAB, CODE

In [6]:
# Build the master Code node table (CodeID:ID, SAB, CODE) from the table written by every build step.
t0 = time.time()


#Load mouse ortholog CODEs
CODEs_ortho = pd.read_pickle(output_dir+'orthologs/CODE_mouse_ortho.pickle')

# Load mouse phenotype CODEs
CODEs_genopheno = pd.read_pickle(output_dir+'genopheno/CODEs_genotype.pickle')

# Load mammalian phenotype ontology CODEs
CODEs_mp = pd.read_pickle(output_dir+'MPO/CODEs_mp_ont.pickle')    

# Load phenomapping CODEs 
CODEs_phenomap = pd.read_csv(output_dir+'hpo_mp_mapping/CODEs_phenomapping.csv')

# Load GTEx (eqtl and med. gene CODEs)
CODEs_gtex = pd.read_pickle(output_dir+'GTEx/CODEs_GTEx.pickle')   

# dbSNP CODEs
#CODEs_dbsnp = pd.read_csv(LOCAL_PATH+'dbsnp/CODEs_dbsnp.csv')   

# hgnc annotation CODEs
CODEs_hgncAnnos = pd.read_csv(output_dir+'hgnc_annos/CODEs_hgncAnno.csv')  


# kf phenotype CODEs
CODEs_kf = pd.read_csv(output_dir+'kf_phenotypes/CODEs_kf.csv') 

# scHeart CODEs
CODEs_scHeart = pd.read_csv(output_dir+'scHeart/CODEs_scHeart.csv') 

# Hubmap 
CODEs_hubmap = pd.read_pickle(output_dir+'HUBMAPsc/hubmap_CODEs.pickle')


# Stack the tables by position (np.concatenate ignores column names), label the three columns
# and drop repeated rows.
CODEs_all = pd.DataFrame(np.concatenate([CODEs_ortho.values, 
                                         CODEs_genopheno.values,
                                         CODEs_mp.values, 
                                         CODEs_phenomap.values,
                                        CODEs_gtex.values, # CODEs_dbsnp.values
                                         CODEs_hgncAnnos.values,
                                         CODEs_kf.values,
                                         CODEs_scHeart.values,
                                         CODEs_hubmap.values
                                        ]),
                                     columns=['CodeID:ID','SAB','CODE']).drop_duplicates()

print('Codes took',(time.time()-t0)/60,'minutes')

Codes took 0.7405983487764994 minutes


In [14]:
# MUST FIX FORMATTING OF THESE SABs ( should not have any spaces)
#Counter(CODEs_all['SAB']).most_common()

In [16]:
#Counter(CODEs_hubmap['SAB']).most_common()

 #Why only 23 unique donors out of 63 datasets?
# sum(hubmap)	sum(hubmap_cluster)	sum(hubmap_dataset)	sum(hubmap_donor)
# 10931745	   896	                   63	          23


## Combine all Terms (SUIs) into one file
Columns: SUI:ID, name

In [7]:
# Build the master Term (SUI) node table with columns 'SUI:ID' and 'name'.
t0 = time.time()

# Ortholog terms
SUIs_ortho = pd.read_pickle(output_dir+'orthologs/SUIs_ortho.pickle')

# Mammalian phenotype ontology terms, renamed to the shared 'SUI:ID' / 'name' column names
SUIs_mp = pd.read_pickle(output_dir+'MPO/SUIs_mp_ont.pickle')
SUIs_mp = SUIs_mp.rename(columns={'SUI':'SUI:ID','Term':'name'})

# GTEx terms, with the name column forced to str
SUIs_gtex = pd.read_pickle(output_dir+'GTEx/SUIs_GTEx.pickle')
SUIs_gtex = SUIs_gtex.assign(name=SUIs_gtex['name'].astype(str))

# dbSNP SUIs
#SUIs_dbsnp = pd.read_csv(LOCAL_PATH+'dbsnp/SUIs_dbsnp.csv')

# hgnc_annotation terms (only the '+' and '-' strand Terms are added)
SUIs_hgncAnno = pd.read_csv(output_dir+'hgnc_annos/SUIs_hgncAnnos.csv')

# glygen glycosyltransferase/glycan annotation terms (on genes)
SUIs_glygenAnno = pd.read_csv(output_dir+'glygen_annos/SUIs_glygenAnnos.csv')

# scHeart terms
SUIs_scHeart = pd.read_csv(output_dir+'scHeart/SUIs_scHeart.csv')

# Stack the tables, then keep the first row for every term name and, after that,
# the first row for every SUI:ID.
SUIs_all = (
    pd.concat([SUIs_ortho,
               SUIs_gtex,
               SUIs_mp,
               SUIs_hgncAnno,
               SUIs_glygenAnno,
               SUIs_scHeart])
    .drop_duplicates('name')
    .drop_duplicates('SUI:ID')
)

# SUI:IDs are not checked for a common length: some come from UMLS (length 8 or 9) and the
# base64-derived ones vary in length. Only uniqueness is asserted.
assert not SUIs_all['SUI:ID'].duplicated().any()

print('SUIs took',(time.time()-t0)/60,'minutes')

SUIs took 0.07345730463663737 minutes


## Combine the CODE-SUIs into one file
cols = :START_ID, :END_ID, :TYPE, CUI
:START_ID = CodeID

In [8]:
# Build the master Code -> Term (CODE-SUI) relationship table from every build step.
t0 = time.time()


# Ortholog  CODE-SUIs
CODE_SUI_ortho = pd.read_pickle(output_dir+'orthologs/CODE_SUI_ortho.pickle')

# Load Mammalian Phenotype CODE-SUIs
CODE_SUIs_mp = pd.read_pickle(output_dir+'MPO/CODE_SUIs_mp_ont.pickle')

# Load GTEx CODE-SUIs
CODE_SUIs_GTEX = pd.read_pickle(output_dir+'GTEx/CODE_SUIs_GTEx.pickle')

# Load dbsnp CODE-SUIs
#CODE_SUIs_dbsnp = pd.read_csv(LOCAL_PATH+'dbsnp/CODE_SUIs_dbsnp.csv')

#  Load hgnc_annotation CODE-SUIs
CODE_SUIs_hgncAnno = pd.read_csv(output_dir+'hgnc_annos/CODE_SUIs_hgncAnnos.csv') 


# glygen CODE-SUIs glycosyltransferase/glycan annotations (on genes)
CODE_SUIs_glygenAnno = pd.read_csv(output_dir+'glygen_annos/CODE_SUIs_glygenAnnos.csv') 


#  Load scHeart CODE-SUIs
CODE_SUIs_scHeart = pd.read_csv(output_dir+'scHeart/CODE_SUIs_scHeart.csv') 

# Hubmap 
CODE_SUIs_hubmap = pd.read_pickle(output_dir+'HUBMAPsc/hubmap_CODE_SUIs.pickle')

# pd.concat aligns on column names; no drop_duplicates is applied to this table.
CODE_SUIs_all = pd.concat([CODE_SUI_ortho,CODE_SUIs_GTEX,CODE_SUIs_mp,
                           CODE_SUIs_hgncAnno,
                           CODE_SUIs_glygenAnno,
                           CODE_SUIs_scHeart,CODE_SUIs_hubmap]) # 

# No missing values anywhere in the combined table.
assert CODE_SUIs_all.isna().sum().sum() == 0

#  Assert No overlap
# NOTE(review): `&` binds tighter than `==`, so each line checks that the THREE-way intersection
# (MP, GTEx and ortholog) is empty. An ID shared by only two of the sources still passes;
# confirm whether pairwise disjointness was intended.
assert set(CODE_SUIs_mp[':START_ID']) & set(CODE_SUIs_GTEX[':START_ID']) &  set(CODE_SUI_ortho[':START_ID']) == set()
assert set(CODE_SUIs_mp[':END_ID']) & set(CODE_SUIs_GTEX[':END_ID']) &  set(CODE_SUI_ortho[':END_ID']) == set()

print('CODE-SUIs took',(time.time()-t0)/60,'minutes')

CODE-SUIs took 0.38557483355204264 minutes


In [158]:
# Find number of GTEX EQTL Terms
#sum(CODE_SUIs_all[':START_ID'].str.startswith('GTEX_EQTL',na=False))
# Find number of GTEX EXP Terms
#sum(CODE_SUIs_all[':START_ID'].str.startswith('GTEX_EXP',na=False))

## Combine CUI-SUIs (from MP and Orthologs steps), need to add ortholog CUI-SUIs
Columns = :START_ID, :END_ID  
start = CUI, end = SUI

## Load HUBMAP CSVs once you've moved them from the cluster to local

Unnamed: 0,CUI:ID
0,SENPUCBIQ09QOkV4dDI=
1,SENPUCBIQ09QOkFueGE4
2,SENPUCBIQ09QOkF2cHIxYg==
3,SENPUCBIQ09QOkNjbDE5
4,SENPUCBIQ09QOkdtMjU2NA==
...,...
14952805,SFVCTUFQIENMVVNURVIgYmRmZWJiZmJiYWEzM2JmMWQyYT...
14952806,SFVCTUFQIENMVVNURVIgYmRmZWJiZmJiYWEzM2JmMWQyYT...
14952807,SFVCTUFQIENMVVNURVIgYmRmZWJiZmJiYWEzM2JmMWQyYT...
14952808,SFVCTUFQIENMVVNURVIgYmRmZWJiZmJiYWEzM2JmMWQyYT...


In [159]:
t0 = time.time()

# None of the six aggregated tables may contain a missing value.
# all() stops at the first table that does, just like a sequence of individual asserts.
assert all(
    table.isna().sum().sum() == 0
    for table in (CUIs_all, CUI_CUIs_all, CUI_CODEs_all,
                  CODEs_all, SUIs_all, CODE_SUIs_all)
)

print('Assertions took',(time.time()-t0)/60,'minutes')

Assertions took 1.4107109308242798 minutes


In [11]:
###### Check that all CUIs in CUI-CUI and CUI-CODE are in CUI_all  (currently disabled)
#a=CUI_CUIs_all[CUI_CUIs_all[':START_ID'].str.contains('KC')] # filter for  only  the  'KC' CUIs
#b= a[a[':END_ID'].str.contains('KC')]      # filter for  only  the  'KC' CUIs

#assert np.all(b[':START_ID'].isin(CUIs_all['CUI:ID']))  # all :START_ID CUIs are  in master  CUI list
#assert np.all(b[':END_ID'].isin(CUIs_all['CUI:ID']))  # all :END_ID CUIs are  in master  CUI list


######  Make sure there is 100% overlap b/t CodeID from CUI_CODEs_all and  CodeIDs from CODEs_all
#codeIDs_in = CUI_CODEs_all[CUI_CODEs_all[':END_ID'].isin(CODEs_all['CodeID:ID'])]
# Rows of the CUI-CODE table whose :END_ID (the CodeID) has no matching node in CODEs_all.
codeIDs_not_in = CUI_CODEs_all[~CUI_CODEs_all[':END_ID'].isin(CODEs_all['CodeID:ID'])]  # which CODEs are/arent  in CODEs_all?

# Every relationship must point at an existing Code node.
assert  len(codeIDs_not_in)  == 0

In [10]:
# Check that all CodeIDs in CUI_CODEs are in CODEs_all
#codes_check = CUI_CODEs_all[':END_ID']  #  CODEs are in  the  :END_ID column
#codes_check.isin(CODEs_all['CodeID:ID']).sum()

## Load the UMLS files and concatenate ours to them
UMLS CSV files are in the 'new_build_csv_data' folder  (new build refers to the UMLS version with UBERON & CL included).

In [17]:
1 ###################################################################################
from multiprocessing import Pool
#import dask

In [52]:
#UMLS_CODEs_SUIs[UMLS_CODEs_SUIs.isnull().any(axis=1)]
#UMLS_CODEs_SUIs[UMLS_CODEs_SUIs['CUI'] == 'dG9waWMgMzY1Ng==']#[':END_ID'][9912137]

In [79]:
os.cpu_count()
import dask

In [11]:
import dask.dataframe as dd

from concurrent.futures import ThreadPoolExecutor

def save_csv(t):
    """Write one DataFrame to disk as CSV and report completion.

    Parameters
    ----------
    t : sequence
        ``t[0]`` is the DataFrame to write and ``t[1]`` the destination path.
        Both are packed into one argument so the function can be passed to ``map``.

    Returns
    -------
    None
    """
    # chunksize is a to_csv option (rows written per batch); passing it to print() raises
    # TypeError after the file has been written.
    t[0].to_csv(t[1],chunksize=1000000)
    print('finished writing'+t[1])
    
def read_csv(filename):
    """Load the CSV at `filename` into a pandas DataFrame.

    NA detection is switched off (``na_filter=False``), so cells such as 'NA' or the empty
    string are kept as literal text instead of being converted to NaN.
    """
    frame = pd.read_csv(filename, na_filter=False)
    return frame

  import pandas.util.testing as tm


In [135]:
chunksize=6000000

In [136]:
%%time
# Write-speed benchmark: save four aggregated tables sequentially into umls_dir as throw-away
# 'delete_*' files, using the `chunksize` value set in the previous cell.
# No index=False is passed, so the row index is written as a first column.
CUIs_all.to_csv(umls_dir+'delete_cuiall.csv',chunksize=chunksize)
CUI_CUIs_all.to_csv(umls_dir+'delete_cuicuiall.csv',chunksize=chunksize)
CUI_CODEs_all.to_csv(umls_dir+'delete_cuicodeall.csv',chunksize=chunksize)
CODEs_all.to_csv(umls_dir+'delete_codesall.csv',chunksize=chunksize)    

# Recorded wall times for different chunksize settings:
# Wall time: 8min 11s,   
# Wall time: 7min 58s w/ chunksize=1mil
# Wall time: 7min 47s w/ chunksize=2mil
# Wall time: 7min 46s w/ chunksize=4mil
# Wall time: 7min 45s w/ chunksize=6mil

CPU times: user 6min 24s, sys: 52.2 s, total: 7min 16s
Wall time: 7min 45s


In [133]:
len(CUI_CODEs_all)

14958042

In [146]:
%%time
# Write-speed benchmark, threaded variant: save the same four tables through a thread pool.
# Wall time: 7min 48s w chunksize = 200k, 8 workers
# Wall time: 8min 3s w chunksize = 200k, 4 workers
# Each entry is [DataFrame, destination path], the packed argument expected by save_csv.
file_list_to_save = [ [CUIs_all,umls_dir+'delete_cuiall.csv'],
                       [CUI_CUIs_all,umls_dir+'delete_cuicuiall.csv'],
                     [CUI_CODEs_all,umls_dir+'delete_cuicodeall.csv'],
                     [CODEs_all,umls_dir+'delete_codesall.csv']]


# NOTE(review): Executor.map returns a lazy result iterator that is never consumed here, so an
# exception raised inside save_csv is not re-raised in this cell and goes unnoticed.
with ThreadPoolExecutor(max_workers=4) as threads:
    df_list = threads.map(save_csv, file_list_to_save)

CPU times: user 11min 52s, sys: 1min 8s, total: 13min
Wall time: 12min 51s


In [100]:
%%time

from concurrent.futures import ThreadPoolExecutor

# Read the UMLS CSVs concurrently, one task per file.
# Executor.map returns a one-shot lazy iterator; list() turns it into an indexable list of
# DataFrames (in file_list order) and re-raises here any exception raised in a worker.
# NOTE(review): file_list is assigned in a cell further down the notebook - run that cell first.
with ThreadPoolExecutor() as threads:
    df_list = list(threads.map(read_csv, file_list))

  result = self.fn(*self.args, **self.kwargs)


CPU times: user 53.9 s, sys: 11 s, total: 1min 4s
Wall time: 57.6 s


In [76]:
#df_list

file_list[:2]

['/Users/stearb/Desktop/hubmap-kg/FREEZE/CUIs.csv',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs.csv']

In [74]:
# Full paths of the seven UMLS node / relationship CSV files, all located directly in umls_dir.
file_list = [umls_dir+csv_name
             for csv_name in ('CUIs.csv','CUI-CUIs.csv','CODEs.csv',
                              'CUI-CODEs.csv','SUIs.csv','CODE-SUIs.csv','CUI-SUIs.csv')]

In [140]:
#%%time
#UMLS_CUIs = pd.read_csv(umls_dir+'CUIs.csv')
#UMLS_CUI_CUIs = pd.read_csv(umls_dir+'CUI-CUIs.csv')

In [139]:
#%%time
#UMLS_CUIs = dd.read_csv(umls_dir+'CUIs.csv')
#UMLS_CUI_CUIs = dd.read_csv(umls_dir+'CUI-CUIs.csv')

In [12]:
%%time
# Load the seven UMLS CSV files with dask and materialise each one with .compute().
t0 = time.time()

UMLS_CUIs = dd.read_csv(umls_dir+'CUIs.csv').compute()

UMLS_CUI_CUIs = dd.read_csv(umls_dir+'CUI-CUIs.csv').compute()

UMLS_CODEs = dd.read_csv(umls_dir+'CODEs.csv').compute()

UMLS_CUI_CODEs = dd.read_csv(umls_dir+'CUI-CODEs.csv').compute()

UMLS_SUIs  = dd.read_csv(umls_dir+'SUIs.csv').compute()

# Need to use na_filter = False to prevent the :TYPE 'NA' from being cast to NaN
UMLS_CODEs_SUIs  = dd.read_csv(umls_dir+ 'CODE-SUIs.csv',na_filter = False).compute()

# Drop rows that contain empty string, ('').
# With na_filter=False a blank :END_ID is read as '', and ''.astype(bool) is False.
UMLS_CODEs_SUIs = UMLS_CODEs_SUIs[UMLS_CODEs_SUIs[':END_ID'].astype(bool)]

# Loaded for completeness; the concatenation cell below leaves the CUI-SUI merge commented out.
UMLS_CUI_SUIs = dd.read_csv(umls_dir+'CUI-SUIs.csv').compute()

print('Loading UMLS files w/ Dask took',(time.time()-t0)/60,'minutes')

  return func(*args, **kwargs)


Loading UMLS files w/ Dask took 0.9900113940238953 minutes
CPU times: user 1min, sys: 12.9 s, total: 1min 13s
Wall time: 59.4 s


In [92]:
# its slower to save a dask df than it is to save a pandas df

CPU times: user 1min 37s, sys: 6.82 s, total: 1min 44s
Wall time: 1min 36s


['/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/00.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/01.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/02.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/03.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/04.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/05.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/06.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/07.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/08.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/09.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/10.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/11.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/12.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/13.part',
 '/Users/stearb/Desktop/hubmap-kg/FREEZE/CUI-CUIs_2.csv/14.part',
 '/Users/s

In [88]:
%%time

# index=False keeps the pandas row index out of the files, matching the final export cell;
# without it every rewrite adds an extra unnamed index column to the CSV.
# NOTE(review): these are the same paths the UMLS frames are read from, so this overwrites
# the source files.
UMLS_CUIs.to_csv(umls_dir+'CUIs.csv',index=False)

UMLS_CUI_CUIs.to_csv(umls_dir+'CUI-CUIs.csv',index=False)

CPU times: user 53.3 s, sys: 5.15 s, total: 58.4 s
Wall time: 1min


In [None]:
1 ##############################################################################

# These must be csvs, unless we store them someplace else and pickle them in and dsave as csvs in freeze import folder?

In [160]:
# Load the seven UMLS tables from pickles stored in umls_dir (alternative to the CSV/dask loader).
# NOTE: read_pickle can execute code embedded in the file - only load pickles written by this pipeline.
t0 = time.time()


UMLS_CUIs = pd.read_pickle(umls_dir+'CUIs.pickle')

UMLS_CUI_CUIs = pd.read_pickle(umls_dir+'CUI-CUIs.pickle')

UMLS_CODEs = pd.read_pickle(umls_dir+'CODEs.pickle')

UMLS_CUI_CODEs = pd.read_pickle(umls_dir+'CUI-CODEs.pickle')

UMLS_SUIs = pd.read_pickle(umls_dir+'SUIs.pickle')

# na_filter = False only applies when reading the CSV (it keeps the :TYPE 'NA' from being cast
# to NaN); read_pickle has no such option and returns the values as they were stored.
UMLS_CODEs_SUIs = pd.read_pickle(umls_dir+ 'CODE-SUIs.pickle')#,na_filter = False) 

# Drop rows that contain empty string, ('').
UMLS_CODEs_SUIs = UMLS_CODEs_SUIs[UMLS_CODEs_SUIs[':END_ID'].astype(bool)]

UMLS_CUI_SUIs = pd.read_pickle(umls_dir+'CUI-SUIs.pickle')

print('Loading UMLS files took',(time.time()-t0)/60,'minutes')

Loading UMLS files took 1.0659369985262552 minutes


### Concatenate

In [13]:
# Append each aggregated table to its UMLS counterpart (UMLS rows first).
# Only the UMLS side is de-duplicated here; rows that occur in both the UMLS table and the
# aggregated table are not removed by this cell.
t0 = time.time()


concat_CUIs = pd.concat([UMLS_CUIs.drop_duplicates(),CUIs_all])

concat_CUI_CUIs = pd.concat([UMLS_CUI_CUIs.drop_duplicates(),CUI_CUIs_all])

concat_CODEs =  pd.concat([UMLS_CODEs.drop_duplicates(),CODEs_all])

concat_CUI_CODEs  = pd.concat([UMLS_CUI_CODEs.drop_duplicates(),CUI_CODEs_all])

concat_SUIs = pd.concat([UMLS_SUIs.drop_duplicates(),SUIs_all])

concat_CODE_SUIs = pd.concat([UMLS_CODEs_SUIs.drop_duplicates(),CODE_SUIs_all])

# CUI-SUIs are not merged: no CUI_SUIs_all table is built in this notebook.
#concat_CUI_SUIs = pd.concat([UMLS_CUI_SUIs,CUI_SUIs_all]) 
print('Concatenating files took',(time.time()-t0)/60,'minutes')

Concatenating files took 3.1958942492802938 minutes


In [14]:
# Pair every de-duplicated UMLS table with the aggregated table to be appended to it
# (UMLS first), in the order CUIs, CUI-CUIs, CODEs, CUI-CODEs, SUIs, CODE-SUIs.
dfs = [(umls_table.drop_duplicates(), ours)
       for umls_table, ours in ((UMLS_CUIs, CUIs_all),
                                (UMLS_CUI_CUIs, CUI_CUIs_all),
                                (UMLS_CODEs, CODEs_all),
                                (UMLS_CUI_CODEs, CUI_CODEs_all),
                                (UMLS_SUIs, SUIs_all),
                                (UMLS_CODEs_SUIs, CODE_SUIs_all))]

In [18]:
%%time
def concat_func(t):
    """Concatenate a pair of DataFrames row-wise.

    Parameters
    ----------
    t : sequence
        ``t[0]`` and ``t[1]`` are the two DataFrames, packed into one argument so the
        function can be passed to ``map``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``t[0]`` followed by the rows of ``t[1]``.
    """
    # Index with [] - a tuple is not callable, so t(0) raises TypeError.
    return pd.concat([t[0],t[1]])

# set up your pool
# NOTE(review): the recorded run of this cell failed with "'i' format requires
# -2147483648 <= number <= 2147483647". Presumably the DataFrames sent to the worker
# processes are too large for multiprocessing to serialise - confirm before relying on this cell.
with Pool(processes=8) as pool: # or whatever your hardware can support

    # concatenate every (UMLS table, aggregated table) pair of `dfs` in a worker process
    df_list = pool.map(concat_func, dfs)


error: 'i' format requires -2147483648 <= number <= 2147483647

# There is already a '+' Term node in  UMLS! Now we have 2... KS73770739369183 and S0782579.  S0782579 is not a 'strandedness' Term though...

### Save the final version of the CSV files

In [14]:
# updated workflow (Using configuration file for loading and saving data)
# Write the six merged tables as CSV without the pandas row index.
t0 = time.time()

# Disabled alternative: save into the new_UMLS_CSVs folder under output_dir.
#concat_CUIs.to_csv(new_UMLS_CSVs_path+'CUIs.csv',index=False)
#concat_CUI_CUIs.to_csv(new_UMLS_CSVs_path+'CUI-CUIs.csv',index=False)
#concat_CODEs.to_csv(new_UMLS_CSVs_path+'CODEs.csv',index=False)
#concat_CUI_CODEs.to_csv(new_UMLS_CSVs_path+'CUI-CODEs.csv',index=False)
#concat_SUIs.to_csv(new_UMLS_CSVs_path+'SUIs.csv',index=False)
#concat_CODE_SUIs.to_csv(new_UMLS_CSVs_path+'CODE-SUIs.csv',index=False)

# Save just to neo4j db import folder 
# (umls_out_dir comes from get_paths; CUI-SUIs.csv is not written because no merged CUI-SUI table exists.)
concat_CUIs.to_csv(umls_out_dir+'CUIs.csv',index=False)
concat_CUI_CUIs.to_csv(umls_out_dir+'CUI-CUIs.csv',index=False)
concat_CODEs.to_csv(umls_out_dir+'CODEs.csv',index=False)
concat_CUI_CODEs.to_csv(umls_out_dir+'CUI-CODEs.csv',index=False)
concat_SUIs.to_csv(umls_out_dir+'SUIs.csv',index=False)
concat_CODE_SUIs.to_csv(umls_out_dir+'CODE-SUIs.csv',index=False)

print('Saving files took',(time.time()-t0)/60,'minutes')

In [87]:
%%time
# Re-write only the CUI-CODEs and SUIs tables to the neo4j import folder.
for csv_name, master_table in (('CUI-CODEs.csv', concat_CUI_CODEs),
                               ('SUIs.csv', concat_SUIs)):
    master_table.to_csv(umls_out_dir + csv_name, index=False)


NameError: name 'concat_CUI_CODEs' is not defined

# Save here 

In [71]:
# Disabled code: everything below is a single string literal, so none of these
# to_csv calls run. It records three alternative destinations for the six master
# CSVs (two Neo4j dbms import folders and a desktop folder), all given as
# hard-coded absolute paths on the author's machine.
'''ALTERNATIVE PATHs to save the build

UMLS_KF_2_path = '/Users/stearb/Library/Application Support/com.Neo4j.Relate/Data/dbmss/dbms-02dbd2af-2446-47a9-91d7-4d9a3a435ce0/import/'
desktop_path = '/Users/stearb/desktop/hubmap-kg/final_build_csv_data/'

# UMLS base64 database path
UMLS_base64_path = '/Users/stearb/Library/Application Support/com.Neo4j.Relate/Data/dbmss/dbms-bebc96c2-11d9-4c97-8636-9a630b41457e/import/'

concat_CUIs.to_csv(UMLS_base64_path+'CUIs.csv',index=False)
concat_CUI_CUIs.to_csv(UMLS_base64_path+'CUI-CUIs.csv',index=False)
concat_CODEs.to_csv(UMLS_base64_path+'CODEs.csv',index=False)
concat_CUI_CODEs.to_csv(UMLS_base64_path+'CUI-CODEs.csv',index=False)
concat_SUIs.to_csv(UMLS_base64_path+'SUIs.csv',index=False)
concat_CODE_SUIs.to_csv(UMLS_base64_path+'CODE-SUIs.csv',index=False)
concat_CUIs.to_csv(UMLS_KF_2_path+'CUIs.csv',index=False)
concat_CUI_CUIs.to_csv(UMLS_KF_2_path+'CUI-CUIs.csv',index=False)
concat_CODEs.to_csv(UMLS_KF_2_path+'CODEs.csv',index=False)
concat_CUI_CODEs.to_csv(UMLS_KF_2_path+'CUI-CODEs.csv',index=False)
concat_SUIs.to_csv(UMLS_KF_2_path+'SUIs.csv',index=False)
concat_CODE_SUIs.to_csv(UMLS_KF_2_path+'CODE-SUIs.csv',index=False)
#concat_CUI_SUIs.to_csv('/Users/stearb/desktop/hubmap-kg/final_build_csv_data/CUI-SUIs.csv',index=False)


concat_CUIs.to_csv(desktop_path+'CUIs.csv',index=False)
concat_CUI_CUIs.to_csv(desktop_path+'CUI-CUIs.csv',index=False)
concat_CODEs.to_csv(desktop_path+'CODEs.csv',index=False)
concat_CUI_CODEs.to_csv(desktop_path+'CUI-CODEs.csv',index=False)
concat_SUIs.to_csv(desktop_path+'SUIs.csv',index=False)
concat_CODE_SUIs.to_csv(desktop_path+'CODE-SUIs.csv',index=False)
#concat_CUI_SUIs.to_csv('/Users/stearb/desktop/hubmap-kg/final_build_csv_data/CUI-SUIs.csv',index=False)'''

## Calculate the number of new nodes and new relationships of each type that we will be adding to the graph, and do some final checks (asserts)

In [17]:
# Scratch arithmetic: the sum (1839188) equals the row count recorded below for
# each 'median_expression_*' relationship type and for SAB 'GTEX EXP'.
# NOTE(review): the two addends are presumably the sizes of two expression batches -- confirm.
1019564+ 819624

1839188

CUIs_all

In [187]:
# Print the total number of CUIs we're adding
print(f'# of KF CUIs: {CUIs_all.shape[0]}')

# Check that all CUIs we're adding are KF CUIs (start with 'K').
# str.startswith anchors the match at the first character; the previous
# str.contains('K') also accepted a 'K' anywhere in the ID.
# na=False makes a missing ID count as a failure instead of being skipped.
assert CUIs_all['CUI:ID'].str.startswith('K', na=False).all()

# Check that there are no duplicates
assert not CUIs_all['CUI:ID'].duplicated().any()

# of KF CUIs: 3401693


CUI_CUIs_all

In [188]:
# Check that all KF CUIs in CUI_CUIs_all are in CUIs_all

# Keep only the relationships whose ':START_ID' and ':END_ID' are both KF CUIs
# (IDs starting with 'K'); rows involving a UMLS CUI (starting with 'C') are excluded.
# str.startswith matches the stated prefix rule, whereas str.contains('K')
# matched a 'K' at any position.
cca_allKF = CUI_CUIs_all[
    CUI_CUIs_all[':START_ID'].str.startswith('K')
    & CUI_CUIs_all[':END_ID'].str.startswith('K')
]
print(len(cca_allKF))

# Both endpoints of every KF-to-KF relationship must be defined in CUIs_all.
assert cca_allKF[':START_ID'].isin(CUIs_all['CUI:ID']).all()
assert cca_allKF[':END_ID'].isin(CUIs_all['CUI:ID']).all()

# Cell output: number of relationships per ':TYPE'.
Counter(CUI_CUIs_all[':TYPE'])

1367006


Counter({'has_mouse_ortholog': 66754,
         'has_human_ortholog': 66754,
         'gene_associated_with_disease': 234042,
         'disease_has_associated_gene': 234042,
         'SCO': 14165,
         'has_mouse_phenotype': 1218,
         'has_human_phenotype': 1218,
         'median_expression_in_gene': 1839188,
         'median_expression_in_tissue': 1839188,
         'in_gene': 884757,
         'in_tissue': 884757,
         'in_variant': 884757})

In [22]:
# Show a row from each of the relationship types
#CUI_CUIs_all.drop_duplicates(':TYPE')

# ortholog         :   mouse gene <---> human gene
# has_phenotype    :   mouse gene <---> mouse phenotype
# SCO              :   MP child   <---> MP adult
# hpo_mp_crosswalk :   HPO term   <---> MP term
# in_gene          :   eQTL       <---> human gene
# in_tissue        :   eQTL       <---> (human) tissue

#### CUI_CODEs_all

In [61]:
# (rows, columns) of the new CUI-CODEs table; in the recorded run the row count
# (3401693) equals the number of KF CUIs printed by the CUIs_all check.
CUI_CODEs_all.shape

(3401693, 2)

#### CODEs

In [64]:
# Number of rows in CODEs_all for each 'SAB' value.
Counter(CODEs_all.loc[:, 'SAB'])

Counter({'HGNC HCOP': 27357,
         'MP': 14274,
         'GTEX EXP': 1839188,
         'GTEX EQTL': 884757,
         'DBSNP 151': 636117})

In [71]:
# Tally the first space-separated token of every 'CodeID:ID' value.
Counter(
    id_tokens[0]
    for id_tokens in CODEs_all['CodeID:ID'].str.split(' ')
)

Counter({'HCOP': 27357, 'MP': 14274, 'GTEX': 2723945, 'dbSNP_151': 636117})

In [21]:
#CODEs_all[CODEs_all['CodeID:ID'].str.contains('KC')]

In [None]:
#### Check that the Gene to Gene and MP to HPO relationships are  1 to  1 (at the CUI-CODE level). 
#This will ensure that there are no collisions with CUIs.  
#The genes overlap in the ortholog step and the genotype/phenotype step.  
#The MP terms overlap in the genotype/phenotype step and HPO MP mapping step.

##### Ortholog file (CUI-CODE gene mappings)
#orthologs = pd.read_csv('/Users/stearb/desktop/R03_local/data/UI_check/orthologs_uicheck.csv')

# Dont need to check human genes CUIs, CODEs, we used the UMLS HGNC CUIs and CODEs
#mouse_genes_ortho = orthologs[['CUI_mouse','CODE_mouse']]
#assert mouse_genes_ortho.nunique()['CUI_mouse']  ==  mouse_genes_ortho.nunique()['CODE_mouse'] 

##### Genotype to Phenotype files  (CUI-CODE gene mappings)
#genopheno = pd.read_csv('/Users/stearb/desktop/R03_local/data/UI_check/MASTER_G2P.csv')
#mouse_pheno_g2p = genopheno[['CUI_mp_term','CODE_mp_term']]
#mouse_gene_g2p = genopheno[['CODE_mouse_gene','CUI_mouse_gene']]

##### HPO to MP mapping files (CUI-CODE MP mappings)
#hpo2mp = pd.read_csv('/Users/stearb/desktop/R03_local/data/UI_check/hpo_mp_mappings.csv')
#mouse_pheno_hpo2mp = hpo2mp[['CUI_MP','CODE_MP']]

##### Mammalian Phenotype Ontology files  (CUI-CODE MP mappings)
#mp_ont = pd.read_csv('/Users/stearb/desktop/R03_local/data/UI_check/mp_ontology_uicheck.csv')
#mouse_pheno_mpOnt = mp_ont[['CUI','MP_term']]

##### Compare mouse gene CUIs and CODEs
#cui_mouse_genes = pd.DataFrame(np.concatenate([mouse_genes_ortho.values,
#                                               mouse_pheno_g2p.values,mouse_pheno_mpOnt.values]),columns=['CUI','CODE'])
# Check that there is an equal number of unique mouse gene CUIs and CODEs. We need to
# check this because the same CUI may appear multiple times, but every occurrence must map to the same CODE,
# i.e., if the same CUI maps to different CODEs, we have a collision
#assert cui_mouse_genes.nunique()['CUI'] == cui_mouse_genes.nunique()['CODE']

#### Compare MP term CUIs and CODEs
#assert mouse_pheno_g2p.nunique()['CUI_mp_term'] == mouse_pheno_g2p.nunique()['CODE_mp_term']
#assert mouse_pheno_hpo2mp.nunique()['CUI_MP'] == mouse_pheno_hpo2mp.nunique()['CODE_MP']
#assert mouse_pheno_mpOnt.nunique()['CUI'] == mouse_pheno_mpOnt.nunique()['MP_term']
# Combine MP CUIs/CODEs from genophenotype, phenomapping and mp_ontology steps together.
#cui_mp_terms  = pd.DataFrame(np.concatenate([mouse_pheno_g2p.values,
#                                             mouse_pheno_hpo2mp.values,
#                                             mouse_pheno_mpOnt.values]),columns=['CUI','CODE'])
# If the mapping is 1 to 1 both columns should have the same number of unique values
#assert cui_mp_terms.nunique()['CUI'] == cui_mp_terms.nunique()['CODE']

#### Overlap in (mp_term) CUIs and CODEs should be identical from the 3 lists
# Overlap in CODEs
#venn3([set(mouse_pheno_g2p['CODE_mp_term']),set(mouse_pheno_hpo2mp['CODE_MP']),set(mouse_pheno_mpOnt['MP_term'])]) ; plt.show()
# Overlap in CUIs
#venn3([set(mouse_pheno_g2p['CUI_mp_term']),set(mouse_pheno_hpo2mp['CUI_MP']),set(mouse_pheno_mpOnt['CUI'])]) ; plt.show()
#### Must also load some form of metadata here as well, so we can determine whether we have multiple instances of the same CUI or an actual collision (can't do that with the CUI alone).
### The reason we don't have to compare the CUI-CODE 1 to 1 mapping for all of the CUI lists (like we did for the mouse gene CUIs list and the MP CUIs list) is because the other lists (GTEx and )

In [None]:
# Disabled sanity checks: the block below is a string literal, so nothing in it
# is executed. It records the NaN counts that were expected in the UMLS tables
# (5 in UMLS_CODEs, 12 in UMLS_SUIs, 0 in the others) and the dropna calls used
# to remove them.
'''
assert UMLS_CUIs.isna().sum().sum() == 0
assert UMLS_CUI_CUIs.isna().sum().sum() == 0
assert UMLS_CODEs.isna().sum().sum() == 5; 
UMLS_CODEs.dropna(inplace=True); 
assert UMLS_CODEs.isna().sum().sum() == 0

assert UMLS_CUI_CODEs.isna().sum().sum() == 0
assert UMLS_SUIs.isna().sum().sum() == 12; 
UMLS_SUIs.dropna(inplace=True); 
assert UMLS_SUIs.isna().sum().sum() == 0

assert UMLS_CODEs_SUIs.isna().sum().sum() == 0; 
#UMLS_CODEs_SUIs.dropna(inplace=True); 
#assert UMLS_CODEs_SUIs.isna().sum().sum() == 0
assert UMLS_CUI_SUIs.isna().sum().sum() == 0'''


# Disabled code: the block below is a string literal, so none of these writes
# run. It holds the to_pickle calls that save the UMLS DataFrames under
# umls_dir; the load cell reads CODE-SUIs.pickle and CUI-SUIs.pickle from that
# same directory.
'''UMLS_CUIs.to_pickle(umls_dir+'CUIs.pickle')

UMLS_CUI_CUIs.to_pickle(umls_dir+'CUI-CUIs.pickle')

UMLS_CODEs.to_pickle(umls_dir+'CODEs.pickle')

UMLS_CUI_CODEs.to_pickle(umls_dir+'CUI-CODEs.pickle')

UMLS_SUIs.to_pickle(umls_dir+'SUIs.pickle')

# Need to use na_filter = False to prevent the :TYPE 'NA' from being cast to NaN
UMLS_CODEs_SUIs.to_pickle(umls_dir+ 'CODE-SUIs.pickle')#,na_filter = False) 

# Drop rows that contain empty string, ('').
#UMLS_CODEs_SUIs = UMLS_CODEs_SUIs[UMLS_CODEs_SUIs[':END_ID'].astype(bool)]

UMLS_CUI_SUIs.to_pickle(umls_dir+'CUI-SUIs.pickle')'''