In [1]:
import torch
import pandas as pd
from dgl.data.utils import load_graphs
import numpy as np
import dgl
from list_dataset import PDBbindDataset
from torch.utils.data import DataLoader
import dgl
import numpy as np
import torch
from options import prepare_train_args
import argparse
import sys
from fcn import ECIF_GNN

from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from math import sqrt
from scipy import stats

An experimental feature for CUDA allocations is turned on for better allocation
pattern resulting in better memory usage for minibatch GNN training workloads.
See https://pytorch.org/docs/stable/notes/cuda.html#optimizing-memory-usage-with-pytorch-cuda-alloc-conf,
and set the environment variable `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:False`



In [2]:
# Checkpoint evaluated throughout this notebook.
path_model='./pth/Downstream-test.pth'

# Choose the device BEFORE loading so the checkpoint can be mapped onto it.
# "cuda:1" is only a valid ordinal when at least two GPUs are visible, so fall
# back to "cuda:0" on single-GPU hosts and to the CPU when CUDA is unavailable.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model=ECIF_GNN(dropout=0)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only (or differently numbered) machine.
# strict=False is kept from the original call: keys missing from / unexpected
# in the checkpoint are tolerated instead of raising.
model.load_state_dict(torch.load(path_model, map_location=device), strict=False)
# Inference-only notebook: place the model on the device and switch to
# evaluation mode once, up front.
model.to(device)
model.eval()

# CASF_2016

In [3]:
# --- Inputs: pre-processed CASF-2016 ids and ligand/protein graph files -----
CASF_2016='./data/CASF_2016'
ids_CASF_2016="%s/out_id_CASF_2016_5A.npy"%(CASF_2016)
ligs_CASF_2016="%s/out_ligand_CASF_2016_5A.bin"%(CASF_2016)
prots_CASF_2016="%s/out_protein_CASF_2016_5A.bin"%(CASF_2016)

pdbids = np.load(ids_CASF_2016)
# Only the first element returned by load_graphs (the graphs) is used.
graphsl = load_graphs(ligs_CASF_2016)[0]
graphsp = load_graphs(prots_CASF_2016)[0]

# --- Inference: one complex at a time -----------------------------------------
# Moving the model and switching to eval mode are loop-invariant, so do them once.
model.to(device)
model.eval()
prediction=[]
# No gradients are needed for evaluation; no_grad avoids building the autograd
# graph for every forward pass.
with torch.no_grad():
    for index,i in enumerate(pdbids):
        bgl = dgl.batch([graphsl[index]]).to(device)
        bgp = dgl.batch([graphsp[index]]).to(device)
        # The same ligand/protein pair is passed for both input slots; only the
        # fifth returned value (pred) is used here.
        output_2,output_3,output_0,output_1,pred,_,_=model(bgl, bgp,bgl, bgp)
        prediction.append(float(pred))

# --- Ground truth: parse the CoreSet index file ---------------------------------
CASF_2016_target='./data/CASF_2016/CoreSet.dat'
contents = []
with open(CASF_2016_target, 'r') as f:
    for line in f:
        if line.startswith("#"):
            continue  # comment / header line
        splitted_elements = line.split()
        # Keep only rows with exactly six fields; the first five are used as
        # the table columns (the sixth is dropped).
        if len(splitted_elements) == 6:
            contents.append(splitted_elements[:5])
df = pd.DataFrame(contents, columns=(
                'pdb_code', 'resl', 'release_year',
                'logKa', 'Ka'))
df = df.set_index('pdb_code')

# Look up the experimental value for each predicted complex, in prediction
# order, and convert it to float immediately so every metric below works on
# numeric data instead of relying on implicit string casting.
LogKas = [float(df.loc[i,'logKa']) for i in pdbids]

dataframe = pd.DataFrame(
    {'prediction': prediction, 'logKa': LogKas},
    index=pd.Index(pdbids, name='pdbids'),
)

# --- Metrics ----------------------------------------------------------------
# All metrics use the same unrounded predictions (previously Pearson alone was
# computed on predictions rounded to two decimals).
Pearsonr= "{:.3f}".format(stats.pearsonr(LogKas,prediction)[0])
print('Pearson correlation:',Pearsonr)
r2=r2_score(dataframe['logKa'],dataframe['prediction'])
print('R2',r2)

MSE=mean_squared_error(dataframe['logKa'],dataframe['prediction'])
print("MSE:",MSE)
RMSE="{:.3f}".format(sqrt(MSE))
print("RMSE:",RMSE)
MAE="{:.4f}".format(mean_absolute_error(dataframe['logKa'],dataframe['prediction']))
print('MAE:',MAE)

Pearson correlation: 0.842
R2 0.6990563723980021
MSE: 1.4177247383460598
RMSE: 1.191
MAE: 0.9558


# CASF_2013

In [4]:
# Start from a freshly constructed model with the checkpoint weights.
model=ECIF_GNN(dropout=0)
# map_location lets a GPU-saved checkpoint load on whichever device is in use;
# strict=False (as in the original call) tolerates missing/unexpected keys.
model.load_state_dict(torch.load(path_model, map_location=device), strict=False)

# --- Inputs: pre-processed CASF-2013 ids and ligand/protein graph files -----
CASF_2013='./data/CASF_2013'
ids_CASF_2013="%s/out_id_CASF_2013_5A.npy"%(CASF_2013)
ligs_CASF_2013="%s/out_ligand_CASF_2013_5A.bin"%(CASF_2013)
prots_CASF_2013="%s/out_protein_CASF_2013_5A.bin"%(CASF_2013)

pdbids = np.load(ids_CASF_2013)
# Only the first element returned by load_graphs (the graphs) is used.
graphsl = load_graphs(ligs_CASF_2013)[0]
graphsp = load_graphs(prots_CASF_2013)[0]

# --- Inference: one complex at a time -----------------------------------------
# Device placement and eval mode are loop-invariant, so set them once.
model.to(device)
model.eval()
prediction=[]
# Evaluation needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for index,i in enumerate(pdbids):
        bgl = dgl.batch([graphsl[index]]).to(device)
        bgp = dgl.batch([graphsp[index]]).to(device)
        # The same ligand/protein pair is passed for both input slots; only the
        # fifth returned value (pred) is used here.
        output_2,output_3,output_0,output_1,pred,_,_=model(bgl, bgp,bgl, bgp)
        prediction.append(float(pred))

# --- Ground truth: parse the core-set index file --------------------------------
CASF_2013_target='./data/CASF_2013/INDEX_core_data.2013'
contents = []
with open(CASF_2013_target, 'r') as f:
    for line in f:
        if line.startswith("#"):
            continue  # comment / header line
        splitted_elements = line.split()
        # Keep only rows with exactly eight fields; field index 5 is skipped
        # and the remaining seven become the table columns.
        if len(splitted_elements) == 8:
            contents.append(splitted_elements[:5] + splitted_elements[6:])
df = pd.DataFrame(contents, columns=(
                'pdb_code', 'resolution', 'release_year',
                '-logKd/Ki', 'Kd/Ki', 'reference', 'ligand_name'))
df = df.set_index('pdb_code')

# Look up the experimental value for each predicted complex, in prediction
# order, converting to float right away so the metrics operate on numeric data
# rather than on strings.
LogKas = [float(df.loc[i,'-logKd/Ki']) for i in pdbids]

dataframe = pd.DataFrame(
    {'prediction': prediction, '-logKd/Ki': LogKas},
    index=pd.Index(pdbids, name='pdbids'),
)

# --- Metrics ----------------------------------------------------------------
# All metrics use the same unrounded predictions (previously Pearson alone was
# computed on predictions rounded to two decimals).
Pearsonr= "{:.3f}".format(stats.pearsonr(LogKas,prediction)[0])
print('Pearson correlation:',Pearsonr)
r2=r2_score(dataframe['-logKd/Ki'],dataframe['prediction'])
print('R2',r2)

MSE=mean_squared_error(dataframe['-logKd/Ki'],dataframe['prediction'])
print("MSE:",MSE)
RMSE="{:.3f}".format(sqrt(MSE))
print("RMSE:",RMSE)
MAE="{:.4f}".format(mean_absolute_error(dataframe['-logKd/Ki'],dataframe['prediction']))
print('MAE:',MAE)

Pearson correlation: 0.815
R2 0.6504438405297757
MSE: 1.7982874405871023
RMSE: 1.341
MAE: 1.1115
