In [None]:
import numpy as np
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem

In [None]:
# Path of the CSV file to load
fp = 'file_to_other_data.csv'
# Read the table first, then discard every row that contains a missing value
df = pd.read_csv(fp)
df = df.dropna()

In [None]:
def convert_to_isomeric(smiles):
    """
    Convert a SMILES string to an isomeric SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit.

    Returns
    -------
    str
        The SMILES written by ``Chem.MolToSmiles`` with ``isomericSmiles=True``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` (``Chem.MolFromSmiles`` returned None).
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None instead of raising;
    # fail here with the offending string rather than deep inside MolToSmiles.
    if mol is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(smiles))
    return Chem.MolToSmiles(mol, isomericSmiles=True)

In [None]:
# Pass every entry of the 'smiles' column through convert_to_isomeric and
# store the outcome in the 'ligands' column
df['ligands'] = df['smiles'].map(convert_to_isomeric)

In [None]:
# load the best DeepDTA model to predict for these test sets
from model import DeepDTA
import torch, json
from torchsummary import summary


# Integer-encode (label-encode, not one-hot) the ligand SMILES and protein sequences
# with the character vocabularies stored as JSON; CHANGE TO YOUR OWN PATH OF YOUR BEST MODEL
with open('ligand_dict.json') as ligand_file:
    ligand_dict = json.load(ligand_file)
with open('protein_dict.json') as protein_file:
    protein_dict = json.load(protein_file)
# Fixed input lengths fed to the model: (ligand SMILES, protein sequence)
smilelen, seqlen = 200, 2000


def _encode_padded(sequence, vocab, max_len):
    """
    Encode ``sequence`` character by character with ``vocab`` into a (1, max_len) tensor.

    Inputs longer than ``max_len`` are truncated; shorter inputs are right-padded
    with ``vocab['dummy']``. A character missing from ``vocab`` raises KeyError.
    """
    tokens = [vocab[ch] for ch in sequence[:max_len]]
    tokens += [vocab['dummy']] * (max_len - len(tokens))
    return torch.tensor(tokens).unsqueeze(0)


# load model
model = DeepDTA(len(protein_dict)+1, len(ligand_dict)+1, 32, 8, 8) # change the kernel size if needed
# map_location='cpu' lets a checkpoint saved from a GPU load on a CPU-only machine.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('deepdta_retrain.pt', map_location='cpu'))
model.eval()

# Predict one (protein, ligand) pair at a time; gradients are not needed for inference.
df_result = []
with torch.no_grad():
    for smiles, sequence in zip(df['ligands'], df['proteins']):
        ligand = _encode_padded(smiles, ligand_dict, smilelen)
        protein = _encode_padded(sequence, protein_dict, seqlen)
        result = model(protein, ligand)
        df_result.append(result.item())

# Predictions and reference values as arrays, in the row order of df
df_result = np.array(df_result)
ground_truth = df['affinity'].values

In [None]:
# Parity plot: predicted vs. reference affinity, with the y = x line for reference.
fig, ax = plt.subplots()
ax.scatter(ground_truth, df_result)
# Span the identity line over the actual data range instead of a fixed [4, 8] window,
# so it stays visible and meaningful whatever the affinity scale of the data set is.
axis_min = min(ground_truth.min(), df_result.min())
axis_max = max(ground_truth.max(), df_result.max())
ax.plot([axis_min, axis_max], [axis_min, axis_max], 'k--', lw=4)
ax.set_xlabel('Ground truth affinity')
ax.set_ylabel('Predicted affinity')

# Linear (Pearson) and rank (Spearman) correlation between reference and prediction
pearson_r = pearsonr(ground_truth, df_result)[0]
spearman_r = spearmanr(ground_truth, df_result)[0]
print("Pearson correlation coefficient: {}".format(pearson_r))
print("Spearman correlation coefficient: {}".format(spearman_r))