In [1]:
import sys

import pandas as pd
import numpy as np
from math import sqrt
from sklearn.metrics import roc_auc_score

def get_csv_results_reg(predict_path, csv_path):
    """
    Construct both mean aggregated df and regular df of smi_name, predict value and target value for a regression task.
    Aggregation is based on smi_name.
    """
    predict = pd.read_pickle(predict_path)
    smi_list, predict_list, target_list = [], [], []
    for batch in predict:
        sz = batch["bsz"]
        for i in range(sz):
            yhat = batch["predict"][i].cpu()
            y = batch["target"][i].cpu()
            
            smi_list.append(batch["smi_name"][i])
            predict_list.append(yhat.detach().item())
            target_list.append(y.detach().item())
            
    predict_df = pd.DataFrame({"SMILES": smi_list, "predict": predict_list, "target": target_list})
    predict_df_agg = predict_df.groupby("SMILES").mean()

    predict_df.to_csv(csv_path,index=False)
    predict_df_agg.to_csv(csv_path.replace("test", "test_agg"),index=False)

    return predict_df, predict_df_agg

In [132]:
def get_csv_results_cf(predict_path, csv_path):
    predict = pd.read_pickle(predict_path)
    smi_list, prob_list, target_list = [], [], []

    atoms_counter = 0
    for batch in predict:
        sz = batch["bsz"]

        y = batch["target"].cpu().reshape_as(batch["prob"].cpu())
        y_hat = batch["prob"].cpu() 
        print(y)
        roc_auc_score(y, y_hat)
        
        
        sys.exit()
        #for i in range(sz):
        #    
        #    prob = batch["prob"][i].cpu()
        #    t = batch["target"][i].cpu()
        #    print(roc_auc_score(t, prob))
        #    sys.exit()

            #smi_list.append(batch["smi_name"][i])
            #predict_list.append(yhat.detach().item())
            #target_list.append(y.detach().item())
        
        atoms_counter += sz

            
dataset = "clintox"
predict_path = f"/workspace/Uni-Mol/unimol/results/{dataset}_test.out.pkl"
csv_path = f"/workspace/Uni-Mol/unimol/results/{dataset}_test.out.csv"
get_csv_results_cf(predict_path, csv_path)

tensor([[1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0]])


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [3]:
#datasets = ["freesolv_no_hydrogen", "esol", "lipo", "qm7_no_hydrogen"]
#datasets = ["freesolv_no_hydrogen", "freesolv_no_hydrogen2"]
#datasets = ["freesolv", "freesolv2", "freesolv3"]
datasets = ["freesolv32bit", "freesolv32bit_grad"]

for dataset in datasets:
    predict_path = f"/workspace/Uni-Mol/unimol/results/{dataset}_test.out.pkl"
    csv_path = f"/workspace/Uni-Mol/unimol/results/{dataset}_test.out.csv"
    predict_df, predict_df_agg = get_csv_results_reg(predict_path, csv_path)
    if dataset not in ["qm7", "qm7_no_hydrogen"] : 
        print(f"{dataset} RMSE: {sqrt(((predict_df['predict'] - predict_df['target']) ** 2).mean()):.4}")
        print(f"{dataset} Aggregated RMSE: {sqrt(((predict_df_agg['predict'] - predict_df_agg['target']) ** 2).mean()):.4}")
    else:
        print(f"{dataset} MAE: {np.abs(predict_df['predict'] - predict_df['target']).mean():.4}")
        print(f"{dataset} Aggregated MAE: {np.abs(predict_df_agg['predict'] - predict_df_agg['target']).mean():.4}")

freesolv32bit RMSE: 1.86
freesolv32bit Aggregated RMSE: 1.845
freesolv32bit_grad RMSE: 1.655
freesolv32bit_grad Aggregated RMSE: 1.619


In [92]:
import lmdb
import numpy as np
import os
import sys
import pickle

def read_lmdb(lmdb_path):
    env = lmdb.open(
        lmdb_path,
        subdir=False,
        readonly=True,
        lock=False,
        readahead=False,
        meminit=False,
        max_readers=256,
    )
    txn = env.begin()
    keys = list(txn.cursor().iternext(values=False))

    targets = []
    smi = []
    #print(f"#atoms: {len(keys)}")
    for idx in keys:
        datapoint_pickled = txn.get(idx)
        data = pickle.loads(datapoint_pickled)
        targets.append(data["target"])
        smi.append(data['smi'])
    
    return targets, smi
    
        

path = "/workspace/Uni-Mol/unimol/data/molecular_property_prediction/clintox/test.lmdb"
targets, smi = read_lmdb(path)

print(targets)


[(1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (0, 1), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0), (1, 0),