## Load UKBiobank phenotypes and Pubchem identifiers from SIDER curation

Match the SIDER side-effect terms to our UKB phenotypes.

Match the SIDER drug identifiers to our PubChem identifiers.

In [1]:
# Curated SIDER side-effect / UKB phenotype table: tab-separated, first column used as the row index.
se_curate = pd.read_csv(
    "intermediate_files/sider_se_ukb_match.txt",
    sep="\t",
    index_col=0,
)

In [61]:
# Preview the first five rows of the curated table.
se_curate.head(n=5)

Unnamed: 0,stitch_flat,stitch_stereo,UMLS_label,type,meddra,se_name,pubchem_cid,ukb,ae
8,CID100000085,CID000010917,C0002871,PT,C0002871,Anaemia,85,"Non-cancer illness code, self-reported: anaemia",1
14,CID100000085,CID000010917,C0003811,PT,C0003811,Arrhythmia,85,"Non-cancer illness code, self-reported: heart ...",1
18,CID100000085,CID000010917,C0004238,PT,C0004238,Atrial fibrillation,85,"Non-cancer illness code, self-reported: atrial...",1
20,CID100000085,CID000010917,C0004604,PT,C0004604,Back pain,85,Dorsalgia,1
22,CID100000085,CID000010917,C0006277,PT,C0006277,Bronchitis,85,Bronchitis,1


In [2]:
# Lookup table: lower-cased SIDER side-effect name (index) -> UKB phenotype label.
# The names are lower-cased BEFORE de-duplicating so that rows differing only by
# letter case collapse into one entry instead of leaving repeated index labels.
match_se = (
    se_curate.loc[:, ['ukb', 'se_name']]
    .assign(se_name=lambda pairs: pairs['se_name'].str.lower())
    .drop_duplicates()
    .set_index('se_name')
)
match_se.head()

Unnamed: 0_level_0,ukb
se_name,Unnamed: 1_level_1
anaemia,"Non-cancer illness code, self-reported: anaemia"
arrhythmia,"Non-cancer illness code, self-reported: heart ..."
atrial fibrillation,"Non-cancer illness code, self-reported: atrial..."
back pain,Dorsalgia
bronchitis,Bronchitis


## Curate Galeano

https://static-content.springer.com/esm/art%3A10.1038%2Fs41467-020-18305-y/MediaObjects/41467_2020_18305_MOESM2_ESM.pdf

Supplementary data were downloaded from https://paccanarolab.org/drug-signatures/ and saved in the folder "galeano".

In [13]:
# Galeano supplementary tables, read as tab-separated text from the local "galeano" folder.
s8 = pd.read_csv("galeano/Supplementary Data 8.txt", sep="\t")
s7 = pd.read_csv("galeano/Supplementary Data 7.txt", sep="\t")

# Build a second spelling of each generic name with dots turned into spaces.
# regex=False makes "." a literal dot on every pandas version; as a regular
# expression it would match (and blank out) every character.
# NOTE(review): presumably s8['GenericDrugName'] uses the space-separated spelling - confirm.
s7['generic2'] = s7['GenericName'].str.replace(".", " ", regex=False)

s7_dedup = s7.loc[:, ['CID', 'GenericName', 'generic2']].drop_duplicates()

# Attach a CID to every s8 row by looking its drug name up in the generic2 -> CID table.
# This raises if a name is missing from s7, or if a name maps to more than one CID
# (the lookup would then return more values than s8 has rows).
s8['cid'] = s7_dedup.set_index('generic2').loc[s8['GenericDrugName'], 'CID'].values

Our previous work matched the MedDRA terms to UK Biobank terms, so we can make predictions for the matched side effects.

In [65]:
# Keep only the rows whose side-effect term has a curated UKB match.
s8_match = s8.loc[s8['SideEffectTerm'].isin(match_se.index), :].copy()

# Attach the UKB label with a left join on the term. When every term is unique in
# match_se this gives the same rows, order and index as a positional assignment;
# when a term maps to several UKB labels it yields one row per label instead of
# failing with a length mismatch.
s8_match = s8_match.join(match_se['ukb'], on='SideEffectTerm')

s8_match.to_csv("evaluations/galeano_match_cid_ukb.txt", sep="\t")

## Drug voyager

We took all of our PubChem identifiers and used the [PubChem Identifier Exchange](https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange-help.html#inputid) to convert them to DrugBank identifiers so that they can be matched to DrugVoyager.

In [73]:
# DrugVoyager supplementary spreadsheet, fetched directly from the publisher's site.
dv_se = pd.read_excel(
    'https://static-content.springer.com/esm/art%3A10.1186%2Fs12859-017-1558-3/MediaObjects/12859_2017_1558_MOESM3_ESM.xlsx'
)

# Header-less identifier tables: column 0 is the lookup key, column 1 the value taken.
p2d = pd.read_table("pubchem2drugbank.txt", header=None)
ps2d = pd.read_table("pubchem_stereo_2_drugbank.txt", header=None)

# First pass: look up each row's pubchem_cid in p2d.
se_curate['db'] = p2d.set_index(0).loc[se_curate['pubchem_cid'], 1].values

# Integer form of stitch_stereo with its first three characters stripped.
se_curate['pubchem2'] = se_curate['stitch_stereo'].str.slice(start=3).map(int)

# Second pass: where the first lookup produced a null, fall back to the stereo-based lookup.
stereo_db = ps2d.set_index(0).loc[se_curate['pubchem2'], 1].values
se_curate['db'] = se_curate['db'].where(se_curate['db'].notna(), stereo_db)

# Side-effect lookup keyed by UMLS concept ID. A concept ID occupies several rows
# when it was curated to more than one distinct (se_name, ukb) pair.
se_df = se_curate.loc[:, ['UMLS_label', 'se_name', 'ukb']].drop_duplicates().set_index('UMLS_label')

# Row-aligned output lists: entry i of each list describes the same match, so
# every branch below must extend all four by the same number of items.
db_match = []
cuis_match = []
se_names = []
ukb = []

# Bookkeeping: raw lookup result per DrugVoyager row, plus the row labels that
# had no match (nones) or more than one match (mults).
se_find = []
nones = []
mults = []

for ix, row in dv_se.iterrows():
    cuis = row['Side Effect\n(UMLS)']
    db = row['Drug\n(DrugBank)']

    if cuis not in se_df.index:
        # No curated side effect for this concept: keep the pair with empty labels.
        se_find.append('')
        nones.append(ix)
        se_names.append('')
        ukb.append('')
        db_match.append(db)
        cuis_match.append(cuis)
        continue

    se = se_df.loc[cuis, :]
    if isinstance(se, pd.DataFrame):
        # Several curated rows share this concept ID: emit one output row per
        # curated row. The repeat count is the number of ROWS (len(se)); using
        # the column count would misalign the lists whenever it differs.
        n_hits = len(se)
        mults.append(ix)
        db_match += [db] * n_hits
        se_names += list(se['se_name'].values)
        ukb += list(se['ukb'].values)
        cuis_match += [cuis] * n_hits
    else:
        # Exactly one curated row: .loc returned it as a Series.
        db_match.append(db)
        se_names.append(se['se_name'])
        ukb.append(se['ukb'])
        cuis_match.append(cuis)
    se_find.append(se)
        
            

# Assemble the matched pairs into one table (built once; the lists are row-aligned).
se_dv_match = pd.DataFrame({'db': db_match, 'se_names': se_names, 'ukb': ukb, 'cui': cuis_match})

# Lookup from the 'db' identifier to pubchem_cid, taken from the curated table.
db2cid = se_curate.loc[:, ['pubchem_cid', 'db']].drop_duplicates().set_index('db')['pubchem_cid']

# Keep only drugs we can translate back to a CID. .copy() makes the filtered
# frame independent so adding the 'cid' column does not write to a slice.
se_dv_match = se_dv_match.loc[se_dv_match['db'].isin(set(db2cid.index)), :].copy()
# NOTE(review): this positional assignment assumes each 'db' value maps to a
# single pubchem_cid; a repeated 'db' in db2cid would return extra rows - confirm.
se_dv_match['cid'] = db2cid.loc[se_dv_match['db']].values

se_dv_match.to_csv("evaluations/drugvoyager_match.txt", sep="\t")