In [1]:
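#Objective - pull the ICD9 codes from the pages behind the GWAS catalog MAPPED_TRAIT_URI links; the phecode mapping itself is applied afterwards in R (the Python phecode cells at the end are commented out)
#Inputs - gwas_catalog_v1.0.2-associations.tsv (phecode_icd9_rolled.csv is only referenced by the commented-out phecode cells)
#Outputs - GWASCatalog_ICD9_Mapped.csv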

In [10]:
import pandas as pd
import numpy as np
from urllib import request
import urllib, re, gc, requests
from pathos.helpers import mp

In [8]:
# Load the GWAS catalog association table (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the warning tail saved in this cell's output looks like pandas'
# mixed-type DtypeWarning - confirm against a fresh run.
GWASCat = pd.read_table('gwas_catalog_v1.0.2-associations.tsv', low_memory=False)

# Keep only associations that have a mapped-trait URI. `.notna()` tests for
# missing values directly instead of comparing their string form to 'nan', and
# `.copy()` gives an independent frame so that adding columns later does not
# write into a slice of the raw table.
GWASCat = GWASCat[GWASCat.MAPPED_TRAIT_URI.notna()].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Display the first rows of GWASCat as a quick visual check of the table
GWASCat.head()

Unnamed: 0,DATE ADDED TO CATALOG,PUBMEDID,FIRST AUTHOR,DATE,JOURNAL,LINK,STUDY,DISEASE/TRAIT,INITIAL SAMPLE SIZE,REPLICATION SAMPLE SIZE,...,PVALUE_MLOG,P-VALUE (TEXT),OR or BETA,95% CI (TEXT),PLATFORM [SNPS PASSING QC],CNV,MAPPED_TRAIT,MAPPED_TRAIT_URI,STUDY ACCESSION,GENOTYPING TECHNOLOGY
0,2016-02-03,25778476,Jun G,2015-03-17,Mol Psychiatry,www.ncbi.nlm.nih.gov/pubmed/25778476,A novel Alzheimer disease locus located near t...,Alzheimer's disease in APOE e4- carriers,"7,184 cases, 26,968 controls","718 European ancestry cases, 1,699 European an...",...,6.045757,,1.47,[1.26-1.71],NR [NR],N,Alzheimers disease,http://www.ebi.ac.uk/efo/EFO_0000249,GCST002817,Genome-wide genotyping array
1,2016-02-03,25778476,Jun G,2015-03-17,Mol Psychiatry,www.ncbi.nlm.nih.gov/pubmed/25778476,A novel Alzheimer disease locus located near t...,Alzheimer's disease in APOE e4- carriers,"7,184 cases, 26,968 controls","718 European ancestry cases, 1,699 European an...",...,5.522879,,1.2,[1.11-1.29],NR [NR],N,Alzheimers disease,http://www.ebi.ac.uk/efo/EFO_0000249,GCST002817,Genome-wide genotyping array
2,2016-02-03,25778476,Jun G,2015-03-17,Mol Psychiatry,www.ncbi.nlm.nih.gov/pubmed/25778476,A novel Alzheimer disease locus located near t...,Alzheimer's disease in APOE e4+ carriers,"10,352 cases, 9,207 controls","1,250 European ancestry cases, 536 European an...",...,8.39794,,1.22,[1.14-1.30],NR [NR],N,Alzheimers disease,http://www.ebi.ac.uk/efo/EFO_0000249,GCST002813,Genome-wide genotyping array
3,2016-02-03,25778476,Jun G,2015-03-17,Mol Psychiatry,www.ncbi.nlm.nih.gov/pubmed/25778476,A novel Alzheimer disease locus located near t...,Alzheimer's disease in APOE e4+ carriers,"10,352 cases, 9,207 controls","1,250 European ancestry cases, 536 European an...",...,9.0,,1.2,[1.13-1.27],NR [NR],N,Alzheimers disease,http://www.ebi.ac.uk/efo/EFO_0000249,GCST002813,Genome-wide genotyping array
4,2016-02-03,25778476,Jun G,2015-03-17,Mol Psychiatry,www.ncbi.nlm.nih.gov/pubmed/25778476,A novel Alzheimer disease locus located near t...,Alzheimer's disease in APOE e4+ carriers,"10,352 cases, 9,207 controls","1,250 European ancestry cases, 536 European an...",...,5.39794,,1.16,[1.09-1.23],NR [NR],N,Alzheimers disease,http://www.ebi.ac.uk/efo/EFO_0000249,GCST002813,Genome-wide genotyping array


In [10]:
#workaround to str.split() - @https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows/40449726
def explode(df, lst_cols, fill_value=''):
    """Expand list-valued column(s) so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    lst_cols : str or list of str
        Column(s) whose cells hold lists. Row lengths are taken from the first
        of these columns, so when several columns are given the lists in one
        row must all have the same length.
    fill_value : scalar, default ''
        Value written into the list column(s) for rows whose list is empty.

    Returns
    -------
    pandas.DataFrame
        Same columns, in the same order, as `df`, with a fresh 0..n-1 index.
        Rows produced from non-empty lists come first (in input order); rows
        whose list was empty follow, carrying `fill_value` in the list
        column(s). Missing values in the other columns are left untouched.

    Notes
    -----
    Every cell of the list column(s) must be a list-like; a missing value there
    makes the length computation yield NaN and the repeat step fail.
    """
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]

    # nothing to expand; also avoids flattening an empty sequence below
    if len(df) == 0:
        return df.copy()

    # all columns except `lst_cols` (note: `difference` returns them sorted,
    # the original column order is restored before returning)
    idx_cols = df.columns.difference(lst_cols)

    # number of elements in each row's list
    lens = df[lst_cols[0]].str.len()

    # repeat the scalar columns once per list element ...
    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    })
    # ... and flatten the lists themselves; plain Python flattening copes with
    # empty lists and does not coerce element types
    for col in lst_cols:
        exploded[col] = [item for cell in df[col] for item in cell]

    is_empty = lens == 0
    if not is_empty.any():
        return exploded.loc[:, df.columns]

    # rows with an empty list would vanish above: keep them once, with the
    # fill value in the list column(s) only
    leftovers = df.loc[is_empty, idx_cols].copy()
    for col in lst_cols:
        leftovers[col] = fill_value

    # pd.concat replaces DataFrame.append (removed in pandas 2.0); zero-row
    # pieces are skipped so they cannot influence the resulting dtypes
    pieces = [piece for piece in (exploded, leftovers) if len(piece)]
    return pd.concat(pieces, ignore_index=True).loc[:, df.columns]

In [3]:
def ICDDictMap(URLList, timeout=60):
    """Fetch each trait URL and collect the ICD9 codes mentioned on its page.

    Parameters
    ----------
    URLList : iterable of str
        Each entry is one URL or several comma-separated URLs (spaces are
        stripped from every part).
    timeout : float or None, default 60
        Seconds to wait on a connection before giving up, so a stalled server
        cannot block the calling worker indefinitely. None disables the limit.

    Returns
    -------
    dict
        Maps every entry of `URLList` (the full, unsplit string) to the list of
        ICD9 code strings found across ALL of its sub-URLs, in page order.
        Only codes written as ``ICD9:<digits>.<digits>`` are recognised.

    Raises
    ------
    urllib.error.URLError (or a subclass such as HTTPError)
        If a page cannot be retrieved; no partial result is returned.
    """
    icd_pattern = re.compile(r'ICD9:(\d+?\.\d+)')
    DictOut = dict()
    for url in URLList:
        codes = []
        #Multiple URLs may exist.. split into list and find all ICDs within
        for suburl in url.split(','):
            suburl = suburl.replace(' ', '')
            if not suburl:
                # e.g. a trailing comma - nothing to fetch
                continue
            # `with` closes the connection even if reading fails
            with urllib.request.urlopen(suburl, timeout=timeout) as response:
                page = response.read().decode('utf-8', errors='replace')
            # keep the codes of every sub-URL, not just the first one
            codes.extend(icd_pattern.findall(page))
        DictOut[url] = codes
    return DictOut

In [104]:
# Number of worker processes; the URL list is cut into this many chunks
CoreNum = 20

# Distinct, non-missing trait-URI strings (one string may hold several
# comma-separated URLs; ICDDictMap deals with the splitting)
URLs = [x for x in set(GWASCat.MAPPED_TRAIT_URI) if pd.notna(x)]

URLSplit = list(np.array_split(URLs, CoreNum))

# One dict per chunk, in chunk order. The context manager shuts the pool down
# even when a worker raises, so no worker processes are left behind.
with mp.Pool(CoreNum) as pooler:
    GWASCatDictList = list(pooler.imap(ICDDictMap, URLSplit))

gc.collect()

1119

In [5]:
#Merge the per-chunk dictionaries into one URL -> ICD9-list lookup
#(if a key shows up in several chunks, the value from the last one is kept)
FinalDict = {
    key: value
    for chunk in GWASCatDictList
    for key, value in chunk.items()
}

In [None]:
#Attach the list of ICD9 codes to every row via its trait URI
#(a URI missing from FinalDict raises KeyError)
GWASCat['ICD'] = list(map(FinalDict.__getitem__, GWASCat['MAPPED_TRAIT_URI']))

In [None]:
#One row per ICD9 code, then renumber the rows 0..n-1 without keeping the old index
GWASCat = explode(GWASCat, ['ICD'], fill_value='').reset_index(drop=True)

In [None]:
#Save the table; the row index is written as well (to_csv default), so the
#file starts with an unnamed index column
GWASCat.to_csv('GWASCatalog_ICD9_Mapped.csv')

In [None]:
#Slow/old approach below - the following cells are commented out and kept for reference only

In [41]:
#def ICDMap(DF):
#    url = DF.MAPPED_TRAIT_URI
#    urlopen = urllib.request.urlopen(url)
#    results = str(urlopen.read())
#    ICDs = re.findall('ICD9:(\d+?\.\d+)', results)
#    DF['ICDs'] = ICDs
#    return(DF)

In [44]:
#def ApplyWrap(DF):
#    return(DF.apply(ICDMap, axis = 1))

In [119]:
#mini = GWASCat[0:100]
#CoreNum = 4

#GWASCatList = []
#GWASSplit = list(np.array_split(mini, CoreNum))

#pooler=mp.Pool(CoreNum)

#for result in pooler.imap(ApplyWrap, GWASSplit):
#    GWASCatList.append(result)
    
#pooler.close()
#pooler.join()
#gc.collect()

In [80]:
#mini = GWASCat[0:1000]

In [180]:
#%%time
##Guaranteed to be faster.. dictionary of a url to ICD - 3min per 1000 rows
#URLMap = dict()
#URLs = list(set(mini.MAPPED_TRAIT_URI))
#URLs = [x for x in URLs if str(x)!='nan']
#for url in URLs:
    #Multiple URLs may exist.. split into list and find all ICDs within
#    Suburl = re.split('\,', url)
#    Suburl = [re.sub(' ', '', x) for x in Suburl]
#    minilist = []
#    for suburl in Suburl:
#        urlopen = urllib.request.urlopen(suburl)
#        results = str(urlopen.read())
#        ICDs = re.findall('ICD9:(\d+?\.\d+)', results)
#        minilist.append(ICDs)
#    URLMap[url] = minilist[0]

In [None]:
#The phecode mapping is done in R instead; the Python version below is commented out and kept for reference only

In [111]:
#Phecode map
#phemap = pd.read_csv('phecode_icd9_rolled.csv')
#phedict = dict(zip(phemap.ICD9, phemap.PheCode))

In [126]:
#def ICDPhecodeMap(DF):
#    phecodemap = []
#    for icd in DF.ICD:
#        try:
#            addme = phedict[icd]
#            phecodemap.append(addme)
#       except:
#            continue
    #phecodemap = [phedict[x] for x in DF.ICD]
#    DF['Phecode'] = phecodemap
#    return(DF)

In [118]:
#GWASCat = GWASCat.apply(ICDPhecodeMap, axis = 1)

In [None]:
#GWASCat.to_csv('GWASCatalog_ICD9_Phecode_Mapped.csv')