In [1]:
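#Objective - look up the ICD9 (plus OMIM and SNOMED CT) codes for each GWAS Catalog mapped-trait URL via the EBI OLS API, then apply the latest phecode mapping
#Inputs - gwas_catalog_v1.0.2-associations.tsv, phecode_icd9_rolled.csv
#Outputs - GWASCatalog_ICD9_OMIM_SNOMEDCT_Map.csv (ICD9/OMIM/SNOMED CT lookup table), GWASCatalog_ICD9_Mapped.csv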

In [1]:
import pandas as pd
import numpy as np
from urllib import request
import urllib, re, gc, requests
from pathos.helpers import mp
from itertools import chain

In [2]:
# Read the GWAS Catalog associations table (tab-separated) ...
associations_path = 'gwas_catalog_v1.0.2-associations.tsv'
GWASCat = pd.read_table(associations_path)

# ... and keep only the rows whose MAPPED_TRAIT_URI is present, i.e. whose
# string form is not 'nan'.
has_trait_uri = GWASCat.MAPPED_TRAIT_URI.astype(str) != 'nan'
GWASCat = GWASCat[has_trait_uri]

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
#workaround to str.split() - @https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows/40449726
def explode(df, lst_cols, fill_value=''):
    """Unnest list-valued cells so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one or more columns whose cells are lists.
    lst_cols : str or list of str
        Column(s) to unnest. A single name is wrapped into a list. The number
        of repetitions per row is taken from the first listed column.
    fill_value : scalar, default ''
        Passed to ``DataFrame.fillna`` on the result when at least one row
        holds an empty list. The fill is applied to the whole frame, so
        missing values in the other columns are replaced as well.

    Returns
    -------
    pandas.DataFrame
        Same columns, in the same order, as ``df``. Expanded rows come first
        with a fresh 0..n-1 index; rows whose list was empty are appended
        after them (keeping their original index labels) with ``fill_value``
        in the unnested column(s).
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # number of elements held by each row's list
    lens = df[lst_cols[0]].str.len()

    # Repeat every scalar column once per list element, then lay the list
    # elements out next to them. Plain flattening is used instead of
    # np.concatenate, which would coerce an empty list to a float64 array
    # and force a dtype promotion against the other rows.
    expanded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{
        col: [item for cell in df[col] for item in cell]
        for col in lst_cols
    })

    empty_rows = lens == 0
    if empty_rows.any():
        # Rows with an empty list contribute nothing above; add them back so
        # they are not lost. DataFrame.append was removed in pandas 2.0, so
        # pd.concat is used for the same row-wise append.
        expanded = pd.concat(
            [expanded, df.loc[empty_rows, idx_cols]], sort=False
        ).fillna(fill_value)

    return expanded.loc[:, df.columns]

In [None]:
def ICDDictMapJSON(URLList, timeout=60): #JSON read more reliable than scrape?
    """Collect ICD9 cross-references for trait URIs from the EBI OLS (EFO) API.

    Parameters
    ----------
    URLList : iterable of str
        Trait URI strings. One entry may hold several comma-separated URIs;
        each is queried separately after spaces are removed.
    timeout : float, default 60
        Seconds passed to ``requests.get`` as ``timeout`` so that a stalled
        connection raises instead of blocking the worker indefinitely.

    Returns
    -------
    dict
        Maps each entry of ``URLList`` (unchanged) to a de-duplicated list of
        its ``ICD9_definition_citation`` values with the ``ICD9:`` prefix
        removed. Entries for which no citation is found map to ``[]``.

    Raises
    ------
    requests.RequestException, ValueError
        Network failures, timeouts and non-JSON responses are not swallowed.
    """
    DictOut = dict()
    for url in URLList:
        # Multiple URLs may exist.. split into list and find all ICDs within
        codes = set()
        for suburl in url.split(','):
            suburl = suburl.replace(' ', '')
            # Hand the IRI over as a query parameter so requests URL-encodes it.
            urlget = requests.get(
                'https://www.ebi.ac.uk/ols/api/ontologies/efo/terms',
                params={'iri': suburl},
                timeout=timeout,
            ).json()
            try:
                # Deeply nested response: first returned term -> annotation -> citations
                icds = urlget['_embedded']['terms'][0]['annotation']['ICD9_definition_citation']
                icds = [re.sub(r'ICD9:', '', x) for x in icds]
            except (KeyError, IndexError, TypeError):
                # The response lacks the expected path (e.g. no term returned
                # or no ICD9 citation on it): nothing to record for this URI.
                continue
            codes.update(icds)
        DictOut[url] = list(codes)

    return(DictOut)

In [None]:
def OMIMDictMapJSON(URLList, timeout=60):
    """Collect OMIM cross-references for trait URIs from the EBI OLS (EFO) API.

    Parameters
    ----------
    URLList : iterable of str
        Trait URI strings. One entry may hold several comma-separated URIs;
        each is queried separately after spaces are removed.
    timeout : float, default 60
        Seconds passed to ``requests.get`` as ``timeout`` so that a stalled
        connection raises instead of blocking the worker indefinitely.

    Returns
    -------
    dict
        Maps each entry of ``URLList`` (unchanged) to a de-duplicated list of
        its ``OMIM_definition_citation`` values with the ``OMIM:`` prefix
        removed. Entries for which no citation is found map to ``[]``.

    Raises
    ------
    requests.RequestException, ValueError
        Network failures, timeouts and non-JSON responses are not swallowed.
    """
    DictOut = dict()
    for url in URLList:
        # Multiple URLs may exist.. split into list and find all OMIM ids within
        codes = set()
        for suburl in url.split(','):
            suburl = suburl.replace(' ', '')
            # Hand the IRI over as a query parameter so requests URL-encodes it.
            urlget = requests.get(
                'https://www.ebi.ac.uk/ols/api/ontologies/efo/terms',
                params={'iri': suburl},
                timeout=timeout,
            ).json()
            try:
                # Deeply nested response: first returned term -> annotation -> citations
                omims = urlget['_embedded']['terms'][0]['annotation']['OMIM_definition_citation']
                omims = [re.sub(r'OMIM:', '', x) for x in omims]
            except (KeyError, IndexError, TypeError):
                # The response lacks the expected path (e.g. no term returned
                # or no OMIM citation on it): nothing to record for this URI.
                continue
            codes.update(omims)
        DictOut[url] = list(codes)

    return(DictOut)

In [6]:
def SNODictMapJSON(URLList, timeout=60):
    """Collect SNOMED CT cross-references for trait URIs from the EBI OLS (EFO) API.

    Parameters
    ----------
    URLList : iterable of str
        Trait URI strings. One entry may hold several comma-separated URIs;
        each is queried separately after spaces are removed.
    timeout : float, default 60
        Seconds passed to ``requests.get`` as ``timeout`` so that a stalled
        connection raises instead of blocking the worker indefinitely.

    Returns
    -------
    dict
        Maps each entry of ``URLList`` (unchanged) to a de-duplicated list of
        its ``SNOMEDCT_definition_citation`` values with the ``SNOMEDCT:``
        prefix removed. Entries for which no citation is found map to ``[]``.

    Raises
    ------
    requests.RequestException, ValueError
        Network failures, timeouts and non-JSON responses are not swallowed.
    """
    DictOut = dict()
    for url in URLList:
        # Multiple URLs may exist.. split into list and find all SNOMED CT ids within
        codes = set()
        for suburl in url.split(','):
            suburl = suburl.replace(' ', '')
            # Hand the IRI over as a query parameter so requests URL-encodes it.
            urlget = requests.get(
                'https://www.ebi.ac.uk/ols/api/ontologies/efo/terms',
                params={'iri': suburl},
                timeout=timeout,
            ).json()
            try:
                # Deeply nested response: first returned term -> annotation -> citations
                snos = urlget['_embedded']['terms'][0]['annotation']['SNOMEDCT_definition_citation']
                snos = [re.sub(r'SNOMEDCT:', '', x) for x in snos]
            except (KeyError, IndexError, TypeError):
                # The response lacks the expected path (e.g. no term returned
                # or no SNOMED CT citation on it): nothing to record for this URI.
                continue
            codes.update(snos)
        DictOut[url] = list(codes)

    return(DictOut)

In [104]:
# Number of worker processes, and of URL chunks handed to them
CoreNum = 30

# Unique trait URI strings to look up; entries whose string form is 'nan' are dropped
URLs = list(set(GWASCat.MAPPED_TRAIT_URI))
URLs = [x for x in URLs if str(x)!='nan']

ICDDictList = []
URLSplit = list(np.array_split(URLs, CoreNum))

# The context manager releases the worker processes even when a chunk raises
# (e.g. a network error inside ICDDictMapJSON); previously the pool was only
# closed on the success path and the workers were left running otherwise.
with mp.Pool(CoreNum) as pooler:
    # imap yields one dict per chunk, in chunk order
    ICDDictList.extend(pooler.imap(ICDDictMapJSON, URLSplit))

gc.collect()

1119

In [None]:
# Number of worker processes, and of URL chunks handed to them
CoreNum = 30

# Unique trait URI strings to look up; entries whose string form is 'nan' are dropped
URLs = list(set(GWASCat.MAPPED_TRAIT_URI))
URLs = [x for x in URLs if str(x)!='nan']

OMIMDictList = []
URLSplit = list(np.array_split(URLs, CoreNum))

# The context manager releases the worker processes even when a chunk raises
# (e.g. a network error inside OMIMDictMapJSON); previously the pool was only
# closed on the success path and the workers were left running otherwise.
with mp.Pool(CoreNum) as pooler:
    # imap yields one dict per chunk, in chunk order
    OMIMDictList.extend(pooler.imap(OMIMDictMapJSON, URLSplit))

gc.collect()

In [None]:
# Number of worker processes, and of URL chunks handed to them
CoreNum = 30

# Unique trait URI strings to look up; entries whose string form is 'nan' are dropped
URLs = list(set(GWASCat.MAPPED_TRAIT_URI))
URLs = [x for x in URLs if str(x)!='nan']

SNODictList = []
URLSplit = list(np.array_split(URLs, CoreNum))

# The context manager releases the worker processes even when a chunk raises
# (e.g. a network error inside SNODictMapJSON); previously the pool was only
# closed on the success path and the workers were left running otherwise.
with mp.Pool(CoreNum) as pooler:
    # imap yields one dict per chunk, in chunk order
    SNODictList.extend(pooler.imap(SNODictMapJSON, URLSplit))

gc.collect()

In [5]:
# Each *DictList holds one dict per worker chunk; flatten each list into a
# single URL -> codes dictionary (a later chunk wins if a key repeats).
FinalICDDict = {
    uri: codes for chunk in ICDDictList for uri, codes in chunk.items()
}
FinalOMIMDict = {
    uri: codes for chunk in OMIMDictList for uri, codes in chunk.items()
}
FinalSNODict = {
    uri: codes for chunk in SNODictList for uri, codes in chunk.items()
}

In [None]:
# Attach the looked-up code lists to every association row, keyed by the
# row's MAPPED_TRAIT_URI string.
trait_uris = GWASCat['MAPPED_TRAIT_URI']
for code_column, uri_to_codes in (
    ('ICD', FinalICDDict),
    ('OMIM', FinalOMIMDict),
    ('SNOMEDCT', FinalSNODict),
):
    GWASCat[code_column] = [uri_to_codes[uri] for uri in trait_uris]

In [None]:
# Unnest the three list-valued code columns one after another, so each row
# ends up with a single ICD, OMIM and SNOMEDCT value. The index is renumbered
# after every pass and the old index is discarded.
for list_column in ('ICD', 'OMIM', 'SNOMEDCT'):
    GWASCat = explode(GWASCat, [list_column], fill_value='')
    GWASCat = GWASCat.reset_index(drop=True)

In [None]:
# Write the fully unnested association table to disk.
output_csv = 'GWASCatalog_ICD9_OMIM_SNOMEDCT_Map.csv'
GWASCat.to_csv(output_csv)