In [None]:
import argparse
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, matthews_corrcoef, average_precision_score

from utils import ecfp


def get_predictions(chid, n_estimators=100, n_jobs=8, n_runs=10, random_state=None):
    """Score a random forest on repeated stratified 50/50 splits of one assay.

    Reads ``./assays/processed/{chid}.csv``, turns its ``smiles`` column into
    fingerprints with ``ecfp`` and uses its ``label`` column as the target.
    For each of ``n_runs`` runs, a ``RandomForestClassifier`` is fitted on one
    half of the data and evaluated on the other half.

    Args:
        chid: Assay identifier; selects the CSV file to read.
        n_estimators: Number of trees passed to the random forest.
        n_jobs: Number of parallel jobs passed to the random forest.
        n_runs: Number of independent split/fit/score repetitions.
        random_state: Optional integer seed. When given, a single seeded
            ``numpy.random.RandomState`` drives every split and every forest,
            so repeated calls return the same metrics while the individual
            runs still differ from each other. When ``None`` (the default) no
            seed is passed on and behaviour is unchanged from before.

    Returns:
        A ``defaultdict(list)`` with the keys ``'ROC AUC'``, ``'AP'`` and
        ``'Fraction positive'``, each mapping to a list of ``n_runs`` values.
    """
    # read data and calculate ecfp fingerprints
    assay_file = f'./assays/processed/{chid}.csv'
    print(f'Reading data from: {assay_file}')
    df = pd.read_csv(assay_file)
    X = np.array(ecfp(df.smiles))
    y = np.array(df.label)

    # One shared generator: each split/forest consumes fresh random numbers,
    # yet the whole sequence is reproducible for a given seed.
    rng = None if random_state is None else np.random.RandomState(random_state)

    metrics = defaultdict(list)
    for _ in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, stratify=y, random_state=rng)
        clf = RandomForestClassifier(
            n_estimators=n_estimators, n_jobs=n_jobs, random_state=rng)
        clf.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the score for the positive
        # class -- assumes labels are binary 0/1; TODO confirm.
        y_pred = clf.predict_proba(X_test)[:, 1]

        metrics['ROC AUC'].append(roc_auc_score(y_test, y_pred))
        metrics['AP'].append(average_precision_score(y_test, y_pred))
        # Mean of the held-out labels; equals the positive fraction for 0/1 labels.
        metrics['Fraction positive'].append(y_test.mean())
    return metrics

In [None]:
# Root of the stored results; every entry name found in its 'graph_ga'
# subdirectory is used as an assay id.
dir_results = './results/goal_directed_paper'
graph_ga_dir = os.path.join(dir_results, 'graph_ga')
chids = os.listdir(graph_ga_dir)

# Evaluate each assay, keep its per-run metrics and print the run averages.
all_metrics = {}
for chid in chids:
    all_metrics[chid] = metrics = get_predictions(chid)
    for name, values in metrics.items():
        print(f"{name}: {np.mean(values)}")

Reading data from: ./assays/processed/CHEMBL1909140.csv
ROC AUC: 0.8531470159278104
AP: 0.5272357535818218
Fraction positive: 0.07007125890736342
Reading data from: ./assays/processed/CHEMBL3888429.csv
ROC AUC: 0.7844886363636363
AP: 0.4434026928686059
Fraction positive: 0.20958083832335328
Reading data from: ./assays/processed/CHEMBL1909203.csv
ROC AUC: 0.7154551122194514
AP: 0.15249578045341433
Fraction positive: 0.047505938242280284


In [None]:
# One summary row per assay: its id, then the mean of every metric over the
# runs, then the standard deviation of every metric.
proc = []
for assay, metrics in all_metrics.items():
    means = {f"{k}_mean": np.mean(v) for k, v in metrics.items()}
    stds = {f"{k}_std": np.std(v) for k, v in metrics.items()}
    entry = {"AssayID": assay, **means, **stds}
    proc.append(entry)

# The spread of the positive fraction is dropped and its mean is reported
# under the plain name 'Fraction positive'.
df = (
    pd.DataFrame(proc)
    .drop(columns=['Fraction positive_std'])
    .rename(columns={'Fraction positive_mean': 'Fraction positive'})
)
df

Unnamed: 0,AssayID,ROC AUC_mean,AP_mean,Fraction positive,ROC AUC_std,AP_std
0,CHEMBL1909140,0.853147,0.527236,0.070071,0.028708,0.085707
1,CHEMBL3888429,0.784489,0.443403,0.209581,0.027934,0.046769
2,CHEMBL1909203,0.715455,0.152496,0.047506,0.079936,0.052771


In [None]:
# Write the summary table next to the other results (row index included).
performance_path = os.path.join(dir_results, 'performance.csv')
df.to_csv(performance_path)

In [None]:
# List the summary columns whose name ends in '_mean'.
mean_cols = filter(lambda col: col.endswith('_mean'), df.columns)
for mean_col in mean_cols:
    print(mean_col)

ROC AUC_mean
matthews_corrcoef_1_mean
matthews_corrcoef_2_mean
matthews_corrcoef_5_mean
matthews_corrcoef_7_mean
AP_mean
Fraction positive_mean
