In [None]:
import numpy as np
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import matplotlib.pyplot as plt
import glob

In [None]:
# Location of the affinity CSV that is loaded with pandas further down;
# replace the placeholder with the path to your own file.
fp = "path_to_your_data.csv"

In [None]:
# Read the whole dataset, then pull out the 'affinity' column of the rows whose
# 'split' value is 'test' as a NumPy array (the reference values for the plots).
df = pd.read_csv(fp)
test_set_values = df.loc[df['split'] == 'test', 'affinity'].values

In [None]:
# Compare every saved prediction file with the test-set affinities: one scatter
# panel per file plus the Pearson and Spearman correlation of its predictions.
# glob returns matches in arbitrary order, so sort them to keep panels reproducible.
model_results = sorted(glob.glob('test-result*.txt'))
if not model_results:
    raise FileNotFoundError(
        "No prediction files matching 'test-result*.txt' were found in the working directory"
    )

# squeeze=False makes plt.subplots always return a 2-D array of Axes. Without it a
# single result file yields a bare Axes object and indexing into it fails.
fig, axes_grid = plt.subplots(len(model_results), 1, figsize=(5, 20), squeeze=False)
ax = axes_grid[:, 0]  # single column -> 1-D array with one Axes per result file

for panel, model_result in zip(ax, model_results):
    # The 3rd and 4th dash-separated fields of the file name are used as the
    # protein and ligand kernel names (presumably 'test-result-<protein>-<ligand>.txt').
    name_parts = model_result.split('-')
    model_param = name_parts[2], name_parts[3].split('.')[0]
    print(model_param)
    result = np.loadtxt(model_result)
    # Sanity check: both sizes must agree for the scatter and correlations below.
    print(len(result), test_set_values.shape)
    panel.scatter(test_set_values, result)
    panel.set_title('Protein kernel: {}, Ligand kernel: {}'.format(model_param[0], model_param[1]))
    panel.set_xlabel('True affinity')
    panel.set_ylabel('Predicted affinity')
    # Identity line: points lying on it are perfect predictions.
    panel.plot([0,12], [0,12], 'k--', lw=4)
    # calculate the correlation coefficient
    print("Pearson correlation coefficient: {}".format(pearsonr(test_set_values, result)[0]))
    print("Spearman correlation coefficient: {}".format(spearmanr(test_set_values, result)[0]))
    print("-" * 20)

plt.show()

In [None]:
# Held-out evaluation set: read the CSV and discard every row that contains
# at least one missing value.

from rdkit import Chem

other_fp = "file_to_other_data.csv"
df2 = pd.read_csv(other_fp).dropna(axis=0, how="any")

In [None]:
def convert_to_isomeric(smiles):
    """
    Convert a SMILES string to the isomeric SMILES string written by RDKit.

    Parameters
    ----------
    smiles : str
        SMILES string to parse.

    Returns
    -------
    str
        Output of ``Chem.MolToSmiles`` called with ``isomericSmiles=True``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    m = Chem.MolFromSmiles(smiles)
    # MolFromSmiles reports a parse failure by returning None instead of raising;
    # handing that None to MolToSmiles would fail with an opaque argument error
    # that does not say which input was bad.
    if m is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(smiles))
    return Chem.MolToSmiles(m, isomericSmiles=True)

In [None]:
# Add a 'ligands' column holding the isomeric SMILES of every entry in 'smiles'.
df2['ligands'] = df2['smiles'].map(convert_to_isomeric)

In [None]:
# load the best DeepDTA model to predict for these test sets
from model import DeepDTA
import torch, json
from torchsummary import summary


def _encode_sequence(sequence, vocab, length):
    """Map each character of ``sequence`` to its id in ``vocab`` and right-pad
    with ``vocab['dummy']`` until the list holds ``length`` entries."""
    # NOTE(review): a sequence longer than `length` is neither padded nor truncated,
    # and a character missing from `vocab` raises KeyError -- confirm that this
    # matches the preprocessing the model was trained with.
    return [vocab[ch] for ch in sequence] + [vocab['dummy']] * (length - len(sequence))


# character -> integer-id vocabularies used to encode the inputs;
# CHANGE TO YOUR OWN PATH OF YOUR BEST MODEL
with open('ligand_dict.json') as fh:  # context managers close the files promptly
    ligand_dict = json.load(fh)
with open('protein_dict.json') as fh:
    protein_dict = json.load(fh)
# padded lengths for the ligand SMILES and the protein sequence respectively
smilelen, seqlen = 200, 2000

# load model
model = DeepDTA(len(protein_dict)+1, len(ligand_dict)+1, 32, 8, 8)
# The model and its inputs live on the CPU here, so map the checkpoint to the CPU
# as well; otherwise a checkpoint saved from a GPU fails to load on a CPU-only machine.
model.load_state_dict(torch.load('deepdta_retrain.pt', map_location='cpu'))
model.eval()  # inference mode (affects layers such as dropout / batch norm)

df2_result = []
with torch.no_grad():  # no gradients are needed for prediction
    # Iterate the two columns directly instead of building a row Series per index.
    for ligand, protein in zip(df2['ligands'], df2['proteins']):
        # unsqueeze(0) adds a leading batch dimension of size 1
        protein = torch.tensor(_encode_sequence(protein, protein_dict, seqlen)).unsqueeze(0)
        ligand = torch.tensor(_encode_sequence(ligand, ligand_dict, smilelen)).unsqueeze(0)
        result = model(protein, ligand)
        df2_result.append(result.item())

df2_result = np.array(df2_result)
ground_truth = df2['affinity'].values

In [None]:
# Scatter of predicted against measured affinity for the held-out set,
# followed by the Pearson and Spearman correlation of the predictions.
plt.scatter(ground_truth, df2_result)
# Identity line: points lying on it are perfect predictions.
plt.plot([4,8], [4,8], 'k--', lw=4)
# Label the axes so the figure can be read on its own.
plt.xlabel('True affinity')
plt.ylabel('Predicted affinity')
print("Pearson correlation coefficient: {}".format(pearsonr(ground_truth, df2_result)[0]))
print("Spearman correlation coefficient: {}".format(spearmanr(ground_truth, df2_result)[0]))