# Predictions evaluation

On reproduction/2-predictions:

    * Model (Log. Reg.): P: 0.966, R: 0.966, F: 0.964  10-fold cross-validation
    * Train set: shape = 1:10 size = 8306 dim = (8306, 201)
    * Predictions: size = 210706  

In [158]:
import pandas as pd

### Gold standard

    * source: https://raw.githubusercontent.com/dhimmel/indications/11d535ba0884ee56c3cd5756fdfb4985f313bd80/catalog/indications.tsv
    

In [159]:
# Load the indications catalog used in the graph and keep only the
# identifier, name and category columns as the gold standard.
df = pd.read_table('./gold-standard/indications.tsv')
gold_df = df.loc[:, ['doid_id', 'drugbank_id', 'disease', 'drug', 'category']].copy()
# size = rows * columns; shape = (rows, columns)
print(gold_df.size, gold_df.shape)
gold_df.head()

6940 (1388, 5)


Unnamed: 0,doid_id,drugbank_id,disease,drug,category
0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM
1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM
2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM
3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM
4,DOID:10652,DB00245,Alzheimer's disease,Benzatropine,SYM


In [160]:
# number of non-null values in each column, per indication category
gold_df.groupby('category').count()

Unnamed: 0_level_0,doid_id,drugbank_id,disease,drug
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DM,755,755,755,755
NOT,243,243,243,243
SYM,390,390,390,390


In [161]:
# True indications: the gold-standard rows whose category is "DM".
# reset_index() (without drop) keeps the original catalog row label in a
# new `index` column, which is why the frame has 6 columns.
indications_df = (
    gold_df
    .query('category == "DM"')
    .reset_index()
    .copy()
)
print(indications_df.shape)
print('diseases: {}, drugs: {}'.format(
    indications_df['disease'].nunique(),
    indications_df['drug'].nunique(),
))
indications_df.head()

(755, 6)
diseases: 77, drugs: 387


Unnamed: 0,index,doid_id,drugbank_id,disease,drug,category
0,0,DOID:10652,DB00843,Alzheimer's disease,Donepezil,DM
1,1,DOID:10652,DB00674,Alzheimer's disease,Galantamine,DM
2,2,DOID:10652,DB01043,Alzheimer's disease,Memantine,DM
3,3,DOID:10652,DB00989,Alzheimer's disease,Rivastigmine,DM
4,16,DOID:9206,DB00736,Barrett's esophagus,Esomeprazole,DM


In [162]:
# unique disease names among the true indications
print(indications_df['disease'].unique())

["Alzheimer's disease" "Barrett's esophagus" "Crohn's disease"
 "Graves' disease" 'Kawasaki disease' "Paget's disease of bone"
 'acquired immunodeficiency syndrome' 'alcohol dependence'
 'allergic rhinitis' 'alopecia areata' 'amyotrophic lateral sclerosis'
 'ankylosing spondylitis' 'asthma' 'atherosclerosis' 'atopic dermatitis'
 'azoospermia' 'bone cancer' 'brain cancer' 'breast cancer'
 'cervical cancer' 'chronic obstructive pulmonary disease' 'colon cancer'
 'coronary artery disease' 'dilated cardiomyopathy' 'epilepsy syndrome'
 'esophageal cancer' 'focal segmental glomerulosclerosis'
 'germ cell cancer' 'gestational diabetes' 'glaucoma' 'gout'
 'head and neck cancer' 'hematologic cancer' 'hepatitis B' 'hypertension'
 'hypothyroidism' 'kidney cancer' 'leprosy' 'liver cancer' 'lung cancer'
 'lymphatic system cancer' 'malaria' 'malignant glioma' 'melanoma'
 'metabolic syndrome X' 'migraine' 'multiple sclerosis' 'muscle cancer'
 'nephrolithiasis' 'nicotine dependence' 'obesity' 'ocular 

In [163]:
# unique drug names among the true indications
print(indications_df['drug'].unique())

['Donepezil' 'Galantamine' 'Memantine' 'Rivastigmine' 'Esomeprazole'
 'Omeprazole' 'Azathioprine' 'Balsalazide' 'Mercaptopurine' 'Mesalazine'
 'Prednisone' 'Sulfasalazine' 'Methimazole' 'Propylthiouracil'
 'Acetylsalicylic acid' 'Alendronate' 'Etidronic acid' 'Pamidronate'
 'Risedronate' 'Tiludronate' 'Zoledronate' 'Abacavir' 'Amprenavir'
 'Delavirdine' 'Didanosine' 'Efavirenz' 'Indinavir' 'Lamivudine'
 'Lopinavir' 'Nelfinavir' 'Nevirapine' 'Ritonavir' 'Saquinavir' 'Stavudine'
 'Zidovudine' 'Acamprosate' 'Citalopram' 'Disulfiram' 'Naltrexone'
 'Betamethasone' 'Cetirizine' 'Cyproheptadine' 'Desloratadine'
 'Dexamethasone' 'Dimenhydrinate' 'Diphenhydramine' 'Flunisolide'
 'Hydrocortisone' 'Loratadine' 'Methylprednisolone' 'Montelukast'
 'Olopatadine' 'Prednisolone' 'Triamcinolone' 'Riluzole' 'Methotrexate'
 'Aminophylline' 'Arformoterol' 'Beclomethasone' 'Budesonide' 'Ciclesonide'
 'Cromoglicic acid' 'Dyphylline' 'Fluticasone Propionate'
 'Fluticasone furoate' 'Formoterol' 'Indacaterol' 

### Select instances

The selection of indications should be diverse, i.e. the diseases should belong to different disease classes and have different amounts of drug information available for the training step.


In [164]:
# group by disease: number of indicated drugs per disease, largest first
# (the counts range from 68 down to 1)
(
    indications_df
    .groupby('disease')['drug']
    .count()
    .sort_values(ascending=False)
)

disease
hypertension                          68
hematologic cancer                    51
asthma                                37
breast cancer                         29
coronary artery disease               28
epilepsy syndrome                     25
type 2 diabetes mellitus              22
psoriasis                             21
glaucoma                              21
prostate cancer                       21
ulcerative colitis                    16
lung cancer                           16
atopic dermatitis                     16
allergic rhinitis                     15
osteoporosis                          15
rheumatoid arthritis                  15
acquired immunodeficiency syndrome    14
kidney cancer                         13
systemic lupus erythematosus          13
peripheral nervous system neoplasm    12
multiple sclerosis                    11
obesity                               11
urinary bladder cancer                11
testicular cancer                     11
malaria 

In [165]:
# 1. hypertension DOID:10763 (68 drugs)
# True-indication rows for this disease, selected by its DOID.
indications_df.query('doid_id == "DOID:10763"')

Unnamed: 0,index,doid_id,drugbank_id,disease,drug,category
338,628,DOID:10763,DB01193,hypertension,Acebutolol,DM
339,629,DOID:10763,DB00594,hypertension,Amiloride,DM
340,630,DOID:10763,DB00381,hypertension,Amlodipine,DM
341,631,DOID:10763,DB01076,hypertension,Atorvastatin,DM
342,632,DOID:10763,DB00542,hypertension,Benazepril,DM
343,633,DOID:10763,DB00436,hypertension,Bendroflumethiazide,DM
344,634,DOID:10763,DB01244,hypertension,Bepridil,DM
345,635,DOID:10763,DB00195,hypertension,Betaxolol,DM
346,636,DOID:10763,DB00887,hypertension,Bumetanide,DM
347,637,DOID:10763,DB00796,hypertension,Candesartan,DM


In [166]:
# 2. epilepsy DOID:1826 (25 drugs)
# True-indication rows for this disease, selected by its DOID.
indications_df.query('doid_id == "DOID:1826"')

Unnamed: 0,index,doid_id,drugbank_id,disease,drug,category
211,472,DOID:1826,DB00819,epilepsy syndrome,Acetazolamide,DM
212,473,DOID:1826,DB01351,epilepsy syndrome,Amobarbital,DM
213,474,DOID:1826,DB00564,epilepsy syndrome,Carbamazepine,DM
214,475,DOID:1826,DB00349,epilepsy syndrome,Clobazam,DM
215,476,DOID:1826,DB01068,epilepsy syndrome,Clonazepam,DM
216,477,DOID:1826,DB00829,epilepsy syndrome,Diazepam,DM
217,478,DOID:1826,DB00949,epilepsy syndrome,Felbamate,DM
218,479,DOID:1826,DB01320,epilepsy syndrome,Fosphenytoin,DM
219,480,DOID:1826,DB00996,epilepsy syndrome,Gabapentin,DM
220,481,DOID:1826,DB06218,epilepsy syndrome,Lacosamide,DM


In [167]:
# 3. malaria DOID:12365 (11 drugs)
# True-indication rows for this disease, selected by its DOID.
indications_df.query('doid_id == "DOID:12365"')

Unnamed: 0,index,doid_id,drugbank_id,disease,drug,category
454,808,DOID:12365,DB06697,malaria,Artemether,DM
455,809,DOID:12365,DB01190,malaria,Clindamycin,DM
456,810,DOID:12365,DB00250,malaria,Dapsone,DM
457,811,DOID:12365,DB00254,malaria,Doxycycline,DM
458,812,DOID:12365,DB00806,malaria,Pentoxifylline,DM
459,813,DOID:12365,DB01131,malaria,Proguanil,DM
460,814,DOID:12365,DB00205,malaria,Pyrimethamine,DM
461,815,DOID:12365,DB00908,malaria,Quinidine,DM
462,816,DOID:12365,DB01346,malaria,Quinidine barbiturate,DM
463,817,DOID:12365,DB00468,malaria,Quinine,DM


In [168]:
# 4. thyroid cancer DOID:1781 (4 drugs)
# True-indication rows for this disease, selected by its DOID.
indications_df.query('doid_id == "DOID:1781"')

Unnamed: 0,index,doid_id,drugbank_id,disease,drug,category
693,1311,DOID:1781,DB00997,thyroid cancer,Doxorubicin,DM
694,1312,DOID:1781,DB00445,thyroid cancer,Epirubicin,DM
695,1313,DOID:1781,DB00398,thyroid cancer,Sorafenib,DM
696,1314,DOID:1781,DB05294,thyroid cancer,Vandetanib,DM


In [169]:
# 5. obesity DOID:9970 (11 drugs)
# True-indication rows for this disease, selected by its DOID.
indications_df.query('doid_id == "DOID:9970"')

Unnamed: 0,index,doid_id,drugbank_id,disease,drug,category
501,918,DOID:9970,DB00865,obesity,Benzphetamine,DM
502,919,DOID:9970,DB01156,obesity,Bupropion,DM
503,920,DOID:9970,DB00501,obesity,Cimetidine,DM
504,921,DOID:9970,DB00937,obesity,Diethylpropion,DM
505,922,DOID:9970,DB01577,obesity,Methamphetamine,DM
506,923,DOID:9970,DB01083,obesity,Orlistat,DM
507,924,DOID:9970,DB01579,obesity,Phendimetrazine,DM
508,925,DOID:9970,DB00191,obesity,Phentermine,DM
509,926,DOID:9970,DB00397,obesity,Phenylpropanolamine,DM
510,927,DOID:9970,DB01105,obesity,Sibutramine,DM


### Predictions

In [170]:
# read predictions
# The CSV has no header row, so columns are named by position below.
predictions_df = pd.read_csv('./reproduction/2-predictions/predictions_mapped.csv', sep=',', header=None)
predictions_df = predictions_df.rename(
    columns={
        0: 'drug',
        1: 'disease',
        2: 'actual',
        3: 'predicted',
        4: 'error',
        5: 'prediction'
    }
)
# .copy() makes the column subset an independent frame, so the assignment
# below does not write into a slice of the raw table (SettingWithCopyWarning).
predictions_df = predictions_df[['drug', 'disease', 'predicted', 'prediction']].copy()
# The raw label is split on ':'; a second field equal to 't' becomes 'true',
# anything else becomes 'false'.
predictions_df['predicted'] = predictions_df['predicted'].apply(
    lambda x: 'true' if x.split(':')[1] == 't' else 'false'
)

# Include entity names alongside IDs
# Mapping disease names
url = 'https://raw.githubusercontent.com/dhimmel/disease-ontology/75050ea2d4f60e745d3f3578ae03560a2cc0e444/data/slim-terms.tsv'
disease_df = pd.read_table(url)
# .copy() for the same reason as above: the `doid` column is rewritten next.
disease_df = disease_df[['doid', 'name', 'pathophysiology']].copy()
# The predictions use '_' where this table uses ':' in disease IDs
# (e.g. DOID_10652), so normalise the key before merging.
disease_df['doid'] = disease_df['doid'].apply(lambda y: y.replace(':', '_'))
disease_df = disease_df.rename(columns={'doid': 'disease', 'name': 'disease_name', 'pathophysiology': 'disease_pathophysiology'})

# Mapping drug names
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank-slim.tsv'
compound_df = pd.read_table(url)
compound_df = compound_df[['drugbank_id', 'name', 'categories']]
compound_df = compound_df.rename(columns={'drugbank_id': 'drug', 'name': 'drug_name', 'categories': 'drug_categories'})

# mapping names to predictions dataframe
# Left merges keep every prediction; IDs missing from a lookup table get NaN names.
n_predictions = len(predictions_df)
predictions_df = pd.merge(predictions_df, compound_df, how='left', on='drug')
predictions_df = pd.merge(predictions_df, disease_df, how='left', on='disease')
# The merges must only add columns: extra rows would mean a lookup table
# holds duplicate keys and the per-disease counts below would be inflated.
assert len(predictions_df) == n_predictions, 'name lookup duplicated prediction rows'

# explore true
print('TRUE INDICATIONS')
print(indications_df.shape)
print('diseases: {}, drugs: {}'.format(indications_df.disease.nunique(), indications_df.drug.nunique()))
print()

# explore predictions
print('PREDICTIONS')
print(predictions_df.shape)
print('diseases: {}, drugs: {}'.format(predictions_df.disease.nunique(), predictions_df.drug.nunique()))
predictions_df.head()

TRUE INDICATIONS
(755, 6)
diseases: 77, drugs: 387

PREDICTIONS
(210706, 8)
diseases: 137, drugs: 1538


Unnamed: 0,drug,disease,predicted,prediction,drug_name,drug_categories,disease_name,disease_pathophysiology
0,DB00843,DOID_10652,False,0.958,Donepezil,,Alzheimer's disease,degenerative
1,DB00843,DOID_9206,False,0.98,Donepezil,,Barrett's esophagus,neoplastic
2,DB00843,DOID_8778,False,0.932,Donepezil,,Crohn's disease,immunologic
3,DB00843,DOID_12361,False,0.98,Donepezil,,Graves' disease,immunologic
4,DB00843,DOID_13378,False,0.991,Donepezil,,Kawasaki disease,immunologic


In [171]:
# 1. hypertension DOID:10763 (68 drugs)
# Compare the true indications of this disease with the drugs the model
# labels as true for it. Drugs are matched by name.
# Each selection is computed once and reused below.
disease_indications = indications_df.query('doid_id == "DOID:10763"')
disease_predictions = predictions_df.query('disease == "DOID_10763"')
true_predictions = disease_predictions.query('predicted == "true"')

print('Indications: %s' % len(disease_indications))

# predictions: count of true predictions and their share (in %) of all
# predictions for this disease; 0 when the disease has no predictions at all
true_share = round(len(true_predictions) * 100 / len(disease_predictions)) if len(disease_predictions) else 0
print('True predictions: %s (%s)' % (len(true_predictions), true_share))

# overlap
indi = set(disease_indications['drug'])
pred = set(true_predictions['drug_name'])
overlap = pred & indi
print('Indications predicted: {}\n'.format(len(overlap)))
# sorted() gives a stable order; bare sets of strings print in a different
# order in every kernel session
print(sorted(overlap))
print('\nIndications not predicted: {}'.format(sorted(indi - pred)))

Indications: 68
True predictions: 1154 (75)
Indications predicted: 58

{'Nifedipine', 'Terazosin', 'Bepridil', 'Acebutolol', 'Betaxolol', 'Carvedilol', 'Fosinopril', 'Triamterene', 'Verapamil', 'Timolol', 'Reserpine', 'Candesartan', 'Valsartan', 'Penbutolol', 'Guanadrel', 'Esmolol', 'Atorvastatin', 'Clonidine', 'Fenoldopam', 'Chlorothiazide', 'Metolazone', 'Prazosin', 'Trandolapril', 'Spironolactone', 'Eprosartan', 'Ethacrynic acid', 'Bumetanide', 'Enalapril', 'Isradipine', 'Lisinopril', 'Nicardipine', 'Methyldopa', 'Pindolol', 'Torasemide', 'Benazepril', 'Furosemide', 'Ramipril', 'Doxazosin', 'Irbesartan', 'Mecamylamine', 'Moexipril', 'Eplerenone', 'Amlodipine', 'Perindopril', 'Hydrochlorothiazide', 'Losartan', 'Guanethidine', 'Hydroflumethiazide', 'Chlorthalidone', 'Metoprolol', 'Amiloride', 'Felodipine', 'Nadolol', 'Nisoldipine', 'Indapamide', 'Nebivolol', 'Propranolol', 'Telmisartan'}

Indications not predicted: {'Bendroflumethiazide', 'Guanabenz', 'Guanfacine', 'Olmesartan', 'Mino

In [172]:
# 2. epilepsy DOID:1826 (25 drugs)
# Compare the true indications of this disease with the drugs the model
# labels as true for it. Drugs are matched by name.
# Each selection is computed once and reused below.
disease_indications = indications_df.query('doid_id == "DOID:1826"')
disease_predictions = predictions_df.query('disease == "DOID_1826"')
true_predictions = disease_predictions.query('predicted == "true"')

print('Indications: %s' % len(disease_indications))

# predictions: count of true predictions and their share (in %) of all
# predictions for this disease; 0 when the disease has no predictions at all
true_share = round(len(true_predictions) * 100 / len(disease_predictions)) if len(disease_predictions) else 0
print('True predictions: %s (%s)' % (len(true_predictions), true_share))

# overlap
indi = set(disease_indications['drug'])
pred = set(true_predictions['drug_name'])
overlap = pred & indi
print('Indications predicted: {}\n'.format(len(overlap)))
# sorted() gives a stable order; bare sets of strings print in a different
# order in every kernel session
print(sorted(overlap))
print('\nIndications not predicted: {}'.format(sorted(indi - pred)))

Indications: 25
True predictions: 1092 (71)
Indications predicted: 19

{'Trimethadione', 'Amobarbital', 'Pregabalin', 'Rufinamide', 'Fosphenytoin', 'Clobazam', 'Levetiracetam', 'Midazolam', 'Clonazepam', 'Gabapentin', 'Felbamate', 'Acetazolamide', 'Topiramate', 'Lacosamide', 'Phenobarbital', 'Vigabatrin', 'Valproic Acid', 'Propofol', 'Diazepam'}

Indications not predicted: {'Carbamazepine', 'Phenytoin', 'Primidone', 'Lamotrigine', 'Oxcarbazepine', 'Zonisamide'}


In [173]:
# 3. malaria DOID:12365 (11 drugs)
# Compare the true indications of this disease with the drugs the model
# labels as true for it. Drugs are matched by name.
# Each selection is computed once and reused below.
disease_indications = indications_df.query('doid_id == "DOID:12365"')
disease_predictions = predictions_df.query('disease == "DOID_12365"')
true_predictions = disease_predictions.query('predicted == "true"')

print('Indications: %s' % len(disease_indications))

# predictions: count of true predictions and their share (in %) of all
# predictions for this disease; 0 when the disease has no predictions at all
true_share = round(len(true_predictions) * 100 / len(disease_predictions)) if len(disease_predictions) else 0
print('True predictions: %s (%s)' % (len(true_predictions), true_share))

# overlap
indi = set(disease_indications['drug'])
pred = set(true_predictions['drug_name'])
overlap = pred & indi
print('Indications predicted: {}\n'.format(len(overlap)))
# sorted() gives a stable order; bare sets of strings print in a different
# order in every kernel session
print(sorted(overlap))
print('\nIndications not predicted: {}'.format(sorted(indi - pred)))

Indications: 11
True predictions: 987 (64)
Indications predicted: 11

{'Quinine', 'Dapsone', 'Clindamycin', 'Quinidine barbiturate', 'Artemether', 'Sulfadiazine', 'Doxycycline', 'Proguanil', 'Quinidine', 'Pyrimethamine', 'Pentoxifylline'}

Indications not predicted: set()


In [174]:
# 4. thyroid cancer DOID:1781 (4 drugs)
# Compare the true indications of this disease with the drugs the model
# labels as true for it. Drugs are matched by name.
# Each selection is computed once and reused below.
disease_indications = indications_df.query('doid_id == "DOID:1781"')
disease_predictions = predictions_df.query('disease == "DOID_1781"')
true_predictions = disease_predictions.query('predicted == "true"')

print('Indications: %s' % len(disease_indications))

# predictions: count of true predictions and their share (in %) of all
# predictions for this disease; 0 when the disease has no predictions at all
true_share = round(len(true_predictions) * 100 / len(disease_predictions)) if len(disease_predictions) else 0
print('True predictions: %s (%s)' % (len(true_predictions), true_share))

# overlap
indi = set(disease_indications['drug'])
pred = set(true_predictions['drug_name'])
overlap = pred & indi
print('Indications predicted: {}\n'.format(len(overlap)))
# sorted() gives a stable order; bare sets of strings print in a different
# order in every kernel session
print(sorted(overlap))
print('\nIndications not predicted: {}'.format(sorted(indi - pred)))

Indications: 4
True predictions: 964 (63)
Indications predicted: 4

{'Epirubicin', 'Vandetanib', 'Doxorubicin', 'Sorafenib'}

Indications not predicted: set()


In [175]:
# 5. obesity DOID:9970 (11 drugs)
# Compare the true indications of this disease with the drugs the model
# labels as true for it. Drugs are matched by name.
# Each selection is computed once and reused below.
disease_indications = indications_df.query('doid_id == "DOID:9970"')
disease_predictions = predictions_df.query('disease == "DOID_9970"')
true_predictions = disease_predictions.query('predicted == "true"')

print('Indications: %s' % len(disease_indications))

# predictions: count of true predictions and their share (in %) of all
# predictions for this disease; 0 when the disease has no predictions at all
true_share = round(len(true_predictions) * 100 / len(disease_predictions)) if len(disease_predictions) else 0
print('True predictions: %s (%s)' % (len(true_predictions), true_share))

# overlap
indi = set(disease_indications['drug'])
pred = set(true_predictions['drug_name'])
overlap = pred & indi
print('Indications predicted: {}\n'.format(len(overlap)))
# sorted() gives a stable order; bare sets of strings print in a different
# order in every kernel session
print(sorted(overlap))
print('\nIndications not predicted: {}'.format(sorted(indi - pred)))

Indications: 11
True predictions: 1062 (69)
Indications predicted: 8

{'Orlistat', 'Methamphetamine', 'Phenylpropanolamine', 'Phentermine', 'Topiramate', 'Diethylpropion', 'Cimetidine', 'Sibutramine'}

Indications not predicted: {'Phendimetrazine', 'Benzphetamine', 'Bupropion'}


### Discussion

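In almost all cases, most of the known indications are among the drugs predicted as true for the disease (hypertension 58/68, epilepsy 19/25, malaria 11/11, thyroid cancer 4/4, obesity 8/11). However, the model also labels 63–75% of all candidate drugs as true for each of these diseases, so this overlap alone says little about how specific the predictions are.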

### NGLY1 deficiency DOID:0060728

In [185]:
# predictions
# Drugs the model labels as true for DOID_0060728; selected once and reused
# for the count, the name list and the displayed table.
ngly1_true = predictions_df.query('disease == "DOID_0060728" & predicted == "true"')
print('True predictions: %s\n' % len(ngly1_true))

pred = set(ngly1_true['drug_name'])
# Sorted for an order that is stable across kernel sessions. key=str keeps
# the sort working if a drug has no mapped name (drug_name comes from a left
# merge, so it can be NaN).
print('Predictions: {}'.format(sorted(pred, key=str)))
ngly1_true

True predictions: 158

Predictions: {'Eltrombopag', 'Isoniazid', 'Porfimer', 'Dextrothyroxine', 'Sulpiride', 'Cefpiramide', 'Trabectedin', 'Chlorzoxazone', 'Vancomycin', 'Ifosfamide', 'Ramelteon', 'Rifaximin', 'Stepronin', 'Remifentanil', 'Radium Ra 223 Dichloride', 'Fludiazepam', 'Bethanechol', 'Thiotepa', 'Vilazodone', 'Gestodene', 'Telithromycin', 'Vigabatrin', 'Orciprenaline', 'L-Lysine', 'Arbutamine', 'Calcium Chloride', 'Magnesium Sulfate', 'Bentoquatam', 'Minocycline', 'Digoxin', 'Fidaxomicin', 'Hexylcaine', 'Tazarotene', 'Remoxipride', 'Glycerol Phenylbutyrate', 'Mitomycin', 'Nonoxynol-9', 'Ethinamate', 'Deserpidine', 'Silver sulfadiazine', 'Ketazolam', 'Permethrin', 'Dimercaprol', 'Mazindol', 'Novobiocin', 'Azilsartan medoxomil', 'Methyl aminolevulinate', 'Rifampicin', 'Estropipate', 'Cisapride', 'Cefradine', 'Lucanthone', 'Sucralfate', 'Clodronate', 'Methacycline', 'Halothane', 'Dyphylline', 'Cephaloglycin', 'Latanoprost', 'Sulfamethazine', 'Azapropazone', 'Ibutilide', 'Niflu

Unnamed: 0,drug,disease,predicted,prediction,drug_name,drug_categories,disease_name,disease_pathophysiology
8630,DB00651,DOID_0060728,true,1.000,Dyphylline,Phosphodiesterase Inhibitors|Bronchodilator Ag...,,
10137,DB00816,DOID_0060728,true,1.000,Orciprenaline,Sympathomimetics|Adrenergic beta-2 Receptor Ag...,,
13836,DB00773,DOID_0060728,true,0.629,Etoposide,"Antineoplastic Agents, Phytogenic",,
15343,DB00544,DOID_0060728,true,0.999,Fluorouracil,Immunosuppressive Agents|Antimetabolites|Antim...,,
15754,DB00441,DOID_0060728,true,0.673,Gemcitabine,Antiviral Agents|Immunosuppressive Agents|Enzy...,,
17398,DB04572,DOID_0060728,true,0.994,Thiotepa,,,
24522,DB01080,DOID_0060728,true,0.838,Vigabatrin,Enzyme Inhibitors|Anticonvulsants|GABA Agents,,
25207,DB01181,DOID_0060728,true,0.744,Ifosfamide,,,
27125,DB00654,DOID_0060728,true,0.868,Latanoprost,,,
27536,DB01214,DOID_0060728,true,0.723,Metipranolol,Antihypertensive Agents|Sympatholytics|Anti-Ar...,,
