In [2]:
import pandas as pd
import os

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.metrics import matthews_corrcoef

import altair as alt

In [13]:
### making an assumption that "target actual is the binary value"

eval_results = {}

datasets = ["bace", "clintox", "deepchem_Lipophilicity", "HIV", "sol_del", "tox21"]

data_map={
    'HIV.csv': {'target':'HIV_active','structure':'smiles'},
    'bace.csv':{'target':'active','structure':'mol'},
    'tox21.csv':{'target':'NR-AhR','structure':'smiles'},
    'clintox.csv':{'target':'CT_TOX','structure':'smiles'},
    'sol_del.csv':{'target':'binned_sol','structure':'smiles'},
    'deepchem_Lipophilicity.csv':{'target':'drug_like','structure':'smiles'}   
}

pred_threshold = 0.5

for file in os.scandir(os.path.join("Complex_Models", "Predictions")):
    target = [data_map[dataset]['target'] for dataset in data_map if dataset[:-4] in file.name][0]
    dataset = [dataset for dataset in data_map if dataset[:-4] in file.name][0]
    df = pd.read_csv(file.path)
    y_actual = df["target_actual"].to_list()
    y_pred = df[target].apply(lambda x: 1 if x > pred_threshold else 0).to_list()

    eval_results[file.name] = {
        "dataset" : dataset,
        "accuracy_score": accuracy_score(y_actual, y_pred),
        "f1_score": f1_score(y_actual, y_pred),
        "roc_auc_score": roc_auc_score(y_actual, y_pred),
        "log_loss": log_loss(y_actual, y_pred),
        "matthews_corrcoef": matthews_corrcoef(y_actual, y_pred)
    }
    
df = pd.DataFrame(eval_results).T
df = df.reset_index()
df = df.rename(columns = {"index": "result_file"})
df['split_id'] = df['result_file'].apply(lambda x: "train" if "train" in x else "validate")
df['split_method'] = df['result_file'].apply(lambda x: "custer" if "cluster" in x else "random")
df['dataset'] = df['dataset'].apply(lambda x: "deepchem_lipophilicity.csv" if x == "deepchem_Lipophilicity.csv" else x)
df  

Unnamed: 0,result_file,dataset,accuracy_score,f1_score,roc_auc_score,log_loss,matthews_corrcoef,split_id,split_method
0,bace-cluster-train-pred-hyperopt.csv,bace.csv,0.986791,0.989994,0.982927,0.456231,0.970685,train,custer
1,bace-cluster-train-pred.csv,bace.csv,0.931624,0.948417,0.919875,2.361658,0.847389,train,custer
2,bace-cluster-validate-pred-hyperopt.csv,bace.csv,0.761062,0.847458,0.635542,8.252759,0.315091,validate,custer
3,bace-cluster-validate-pred.csv,bace.csv,0.699115,0.802326,0.582665,10.392336,0.176219,validate,custer
4,bace-random-train-pred.csv,bace.csv,0.987558,0.99073,0.981974,0.42973,0.972117,train,random
5,bace-random-validate-pred.csv,bace.csv,0.845815,0.892966,0.791892,5.32545,0.627845,validate,random
6,clintox-cluster-train-pred.csv,clintox.csv,0.980907,0.882353,0.924665,0.659457,0.87242,train,custer
7,clintox-cluster-validate-pred.csv,clintox.csv,0.968326,0.0,0.5,1.093988,0.0,validate,custer
8,clintox-random-train-pred.csv,clintox.csv,0.984076,0.901961,0.938835,0.549986,0.893502,train,random
9,clintox-random-validate-pred.csv,clintox.csv,0.932432,0.285714,0.664136,2.333737,0.260579,validate,random


In [14]:
# pvt = pd.pivot(df, index = ["dataset", "split_method"], columns = ['split_id'], values = ["accuracy_score", "f1_score", "roc_auc_score", "log_loss"])

# pvt.columns.names = [None, None]

# pvt

In [15]:
# need to come back and optimize the visualizations, but thinking something like multiple facets for each score type
# I also need to rotate text to avoid overlap

vis_df = df.melt(['result_file', 'dataset', 'split_id', 'split_method'], value_name = "score", var_name= "metric")

vis_df['dataset'] = vis_df['dataset'].str[:-4]
vis_df['dataset'] = vis_df['dataset'].str.replace('deepchem_', '')


vis_df.sample(5)


Unnamed: 0,result_file,dataset,split_id,split_method,metric,score
44,sol_del-cluster-train-pred.csv,sol_del,train,custer,f1_score,0.993151
90,deepchem_Lipophilicity-random-train-pred.csv,lipophilicity,train,random,log_loss,0.067725
33,clintox-cluster-validate-pred.csv,clintox,validate,custer,f1_score,0.0
20,sol_del-random-train-pred.csv,sol_del,train,random,accuracy_score,0.995825
35,clintox-random-validate-pred.csv,clintox,validate,random,f1_score,0.285714


In [16]:
stack = None
chart = None

for i, metric in enumerate(vis_df['metric'].unique()):
    for j, split_type in enumerate(vis_df['split_method'].unique()):
        chart_title = None
        y_title = None
        x_labels = False

        if i == 0: 
            chart_title = "\n" + split_type + " split"

        if i ==0 and j == 0:
            chart_title = ["Summary of All Chemprop Model Results", "", chart_title]
        
        if i == len(vis_df['metric'].unique()) - 1:
            x_labels = True

        if j == 0: y_title = metric

        plot = alt.Chart(vis_df[(vis_df['metric'] == metric) & (vis_df['split_method'] == split_type)]
        ).mark_bar(opacity = 0.7).encode(
                y = alt.Y("score:Q", title = y_title),
                x = alt.X("split_id:N", title= None, axis = alt.Axis(labels = x_labels)),
                color = alt.Color("split_id:N", title = None),
                column =  alt.Row("dataset:N", title = chart_title, header=alt.Header(titleFontSize=14))
                ).properties(width = 30, height = 100)

    
        if not stack:
            stack = plot
        else:
            stack = alt.hconcat(
                stack,
                plot).resolve_scale(
                    y='shared')
            # stack = stack | plot
            # stack.resolve_scale(y = 'shared')

    if not chart:
            chart = stack
    else:
        chart = chart & stack
    stack = None

chart


In [17]:
### write out dataframe to csv for later comparison with simple model results
vis_df["Model"] = 'Chemprop'
vis_df.to_csv(os.path.join("Evaluation", "Chemprop_Results.csv"), index = False)

In [18]:
### Now we just need to pull together the model train time info and look at this

### extract tuples of dataset names and log file paths
log_paths = [( x.name, os.path.join(x.path, "quiet.log")) for x in os.scandir("Complex_Models") if x.name != "Predictions"]
time_data = {
    "model_dataset": {},
    "elapsed_time (hh:mm:ss)": {}
}

i = 0
for dataset, path in log_paths:
    with open(path, 'r') as f:
        elapsed_time = f.readlines()[-1].split('=')[-1].strip()
    
    time_data["model_dataset"][i] = dataset
    time_data["elapsed_time (hh:mm:ss)"][i] = elapsed_time
    
    i += 1

time_df = pd.DataFrame(time_data)

time_df.to_csv(os.path.join("evaluation", "Chemprop_train_time.csv"), index=False)
time_df

### I want to add dataset size in here for some additional context
### Maybe even normalize train time by dataset size?

Unnamed: 0,model_dataset,elapsed_time (hh:mm:ss)
0,bace,0:03:51
1,bace-cluster,0:04:02
2,bace-cluster-hyperopt,0:05:16
3,bace-random,0:04:04
4,clintox-cluster,0:03:22
5,clintox-random,0:03:21
6,deepchem_Lipophilicity-cluster,0:09:10
7,deepchem_Lipophilicity-random,0:09:33
8,HIV-cluster,4:58:22
9,HIV-random,4:54:50
