## Top predictions analysis

In [1]:
import pandas

In [2]:
# Load the curated top-predictions workbook and preview its first two rows
top_pred_df = pandas.read_excel(io='predictions/top-predictions.xlsx')
top_pred_df.head(n=2)

Unnamed: 0,compound_name,disease_name,category,prediction,compound_percentile,disease_percentile,prior_prob,n_trials
0,Pamidronate,osteoporosis,DM,0.886899,1.0,1.0,0.03893,0
1,Alendronate,osteoporosis,DM,0.884991,1.0,0.99935,0.03893,68


In [3]:
# Row count of the top-predictions table
top_pred_df.shape[0]

3980

In [4]:
# Load the full, tab-separated predictions table and preview the two rows
# with the highest prediction score
pred_df = pandas.read_csv('predictions/probabilities.tsv', sep='\t')
pred_df.sort_values(by='prediction', ascending=False).head(n=2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,category,status,prior_prob,prediction,training_prediction,compound_percentile,disease_percentile,n_trials,status_trials,status_drugcentral
139518,DB00282,Pamidronate,DOID:11476,osteoporosis,DM,1,0.03893,0.886899,0.977108,1.0,1.0,,,
138453,DB00630,Alendronate,DOID:11476,osteoporosis,DM,1,0.03893,0.884991,0.976682,1.0,0.99935,,,


In [5]:
# Row count of the full predictions table
pred_df.shape[0]

209168

In [6]:
# How many rows of the full predictions table carry status == 1
int(pred_df['status'].eq(1).sum())

755

In [7]:
# Narrow the full predictions table to the name, identifier and status
# columns (pred_df is rebound to this five-column subset)
id_status_columns = ['compound_id', 'compound_name', 'disease_id', 'disease_name', 'status']
pred_df = pred_df[id_status_columns]
pred_df.head(n=2)

Unnamed: 0,compound_id,compound_name,disease_id,disease_name,status
0,DB01048,Abacavir,DOID:10652,Alzheimer's disease,0
1,DB05812,Abiraterone,DOID:10652,Alzheimer's disease,0


In [8]:
# Attach compound_id, disease_id and status to every top prediction by
# matching on the (compound_name, disease_name) pair.
top_id_df = pandas.merge(top_pred_df, pred_df, how='left', on=['compound_name', 'disease_name'])
# A left join silently duplicates left rows when a name pair occurs more than
# once on the right; fail loudly rather than let the counts below be inflated.
assert len(top_id_df) == len(top_pred_df), 'merge changed the number of top predictions'
top_id_df.head(2)

Unnamed: 0,compound_name,disease_name,category,prediction,compound_percentile,disease_percentile,prior_prob,n_trials,compound_id,disease_id,status
0,Pamidronate,osteoporosis,DM,0.886899,1.0,1.0,0.03893,0,DB00282,DOID:11476,1
1,Alendronate,osteoporosis,DM,0.884991,1.0,0.99935,0.03893,68,DB00630,DOID:11476,1


In [9]:
# Reorder columns: each identifier sits next to its name, status comes before
# the scores. Columns are picked by name instead of by position so the result
# does not depend on the column order produced by the merge, and so the cell
# can be re-run safely. n_trials is not listed and is therefore dropped,
# exactly as the previous positional selection did.
cols = [
    'compound_name', 'compound_id',
    'disease_name', 'disease_id',
    'status',
    'category', 'prediction', 'compound_percentile', 'disease_percentile', 'prior_prob',
]
top_id_df = top_id_df[cols]
top_id_df.head(2)

Unnamed: 0,compound_name,compound_id,disease_name,disease_id,status,category,prediction,compound_percentile,disease_percentile,prior_prob
0,Pamidronate,DB00282,osteoporosis,DOID:11476,1,DM,0.886899,1.0,1.0,0.03893
1,Alendronate,DB00630,osteoporosis,DOID:11476,1,DM,0.884991,1.0,0.99935,0.03893


In [10]:
# Write the annotated top predictions as a tab-separated file.
# The path is handed to pandas directly so pandas opens the file itself
# (UTF-8, no newline translation) rather than inheriting the platform
# defaults of a text-mode open(), which can corrupt line endings on Windows.
top_id_df.to_csv('./top-predictions/top-predictions.tsv', sep='\t', float_format='%.5g', index=False)

In [11]:
# Rewrite disease IDs by replacing ':' with '_' (e.g. 'DOID:11476' -> 'DOID_11476').
# NOTE: top_df is a second name for the same DataFrame object as top_id_df,
# not a copy, so the column assignment below changes top_id_df as well.
top_df = top_id_df
top_df['disease_id'] = top_df.disease_id.str.replace(':','_')
top_df.head(2)

Unnamed: 0,compound_name,compound_id,disease_name,disease_id,status,category,prediction,compound_percentile,disease_percentile,prior_prob
0,Pamidronate,DB00282,osteoporosis,DOID_11476,1,DM,0.886899,1.0,1.0,0.03893
1,Alendronate,DB00630,osteoporosis,DOID_11476,1,DM,0.884991,1.0,0.99935,0.03893


In [12]:
# Write top_id_df to the TSV again, replacing any file already at this path.
# The path is handed to pandas directly so pandas opens the file itself
# (UTF-8, no newline translation) rather than inheriting the platform
# defaults of a text-mode open(), which can corrupt line endings on Windows.
top_id_df.to_csv('./top-predictions/top-predictions.tsv', sep='\t', float_format='%.5g', index=False)

## Statistics

### Top predictions dataset

In [13]:
# Row count of the annotated top-predictions table
top_df.shape[0]

3980

In [14]:
# Top predictions as a rounded percentage of all predictions
top_share = round(len(top_df) * 100 / len(pred_df))
print('{}%'.format(top_share))

2%


In [15]:
# How many rows of the top-predictions table carry status == 1
int(top_df['status'].eq(1).sum())

586

In [16]:
# Rounded percentage of all status == 1 rows (full table) that also appear
# among the top predictions
n_status_top = len(top_df[top_df.status == 1])
n_status_all = len(pred_df[pred_df.status == 1])
print('{}%'.format(round(n_status_top * 100 / n_status_all)))

78%


In [17]:
# First ten top predictions whose status is not 1
top_df.loc[top_df['status'].ne(1)].head(n=10)

Unnamed: 0,compound_name,compound_id,disease_name,disease_id,status,category,prediction,compound_percentile,disease_percentile,prior_prob
4,Ibandronate,DB00710,Paget's disease of bone,DOID_5408,0,,0.802573,1.0,1.0,0.007231
15,Olsalazine,DB01250,Crohn's disease,DOID_8778,0,,0.665331,1.0,1.0,0.007231
18,Paricalcitol,DB00910,osteoporosis,DOID_11476,0,,0.603481,1.0,0.996749,0.0
20,Ethotoin,DB00754,epilepsy syndrome,DOID_1826,0,,0.588531,1.0,0.99935,0.0
22,Quazepam,DB01589,epilepsy syndrome,DOID_1826,0,,0.569974,1.0,0.9987,0.0
23,Alprazolam,DB00404,epilepsy syndrome,DOID_1826,0,,0.564904,1.0,0.998049,0.0
24,Paricalcitol,DB00910,psoriasis,DOID_8893,0,,0.564575,0.992647,1.0,0.0
25,Zidovudine,DB00495,hepatitis B,DOID_2043,0,,0.56046,1.0,1.0,0.004753
26,Tafluprost,DB08819,glaucoma,DOID_1686,0,,0.555917,1.0,1.0,0.0
29,Carboprost Tromethamine,DB00429,glaucoma,DOID_1686,0,,0.551495,1.0,0.99935,0.0


In [None]:
# distribution of 