In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Per-pose annotation table: each row is one docked pose identified by
# (pocket, protein, ligand, rank), with its RMSD values, docking score
# and annotation flag.
annotated_csv = "annotated.csv"
df = pd.read_csv(annotated_csv)
df

Unnamed: 0,pocket,protein,ligand,rank,rmsd,obrmsd,flexobrmsd,fmaxrmsd,score,group,annotation
0,ABL1,1FPU,3QRK,0,1.12474,1.12474,0.75627,1.09286,-11.82862,1,1
1,ABL1,1FPU,3QRK,10,7.46179,7.46179,1.38199,2.77094,-11.55337,1,0
2,ABL1,1FPU,3QRK,11,8.08863,8.08863,1.44861,2.51482,-11.47044,1,0
3,ABL1,1FPU,3QRK,12,8.47072,8.47072,1.60736,2.34249,-11.35890,1,0
4,ABL1,1FPU,3QRK,13,8.45633,8.45633,1.61014,2.35856,-11.32845,1,0
...,...,...,...,...,...,...,...,...,...,...,...
165586,XIAP,5C84,4KMP,5,9.60539,9.60539,1.43189,2.28256,-7.57331,63,0
165587,XIAP,5C84,4KMP,6,8.41238,8.41238,1.30168,2.39061,-7.51172,63,0
165588,XIAP,5C84,4KMP,7,6.88340,6.88340,0.95547,1.40152,-7.40088,63,0
165589,XIAP,5C84,4KMP,8,5.19961,5.19961,2.08915,2.97329,-7.32203,63,0


In [3]:
def load_scores(df, model, prefix):
    """
    Load the CNN scores of the three test folds of a model and merge them into ``df``.

    The files ``../training/{model}/{prefix}test{0,1,2}.out`` are read as
    space-separated tables without header. Only the score column (0) and the
    pose-name column are kept; the pose identifiers are parsed from the pose
    name and used to inner-join the scores with ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pose table containing the columns ``pocket``, ``protein``, ``ligand``
        and ``rank``.
    model : str
        Name of the model directory under ``../training``.
    prefix : str
        Prefix of the output files, which also selects the column layout.
        Supported values are ``"nc"`` and ``"cluster"``.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged with the scores, i.e. with an additional ``CNNscore``
        column. Poses without a score are dropped (inner join).

    Raises
    ------
    ValueError
        If ``prefix`` is not a supported layout.
    """
    # Columns to discard for each supported layout; what remains is the
    # score (column 0) and the pose name (column 3 or 4).
    columns_to_drop = {
        "nc": [1, 2, 4, 5, 6, 7],  # No affinity column
        "cluster": [1, 2, 3, 5, 6, 7, 8],
    }
    if prefix not in columns_to_drop:
        # Fail early and explicitly instead of hitting an unbound variable
        # after the files have already been read.
        raise ValueError(
            f"Unknown prefix {prefix!r}; expected one of {sorted(columns_to_drop)}"
        )

    score = pd.concat(
        [
            pd.read_csv(f"../training/{model}/{prefix}test{fold}.out", sep=" ", header=None)
            for fold in range(3)
        ],
        ignore_index=True,  # unique row labels across the three folds
    )

    score = (
        score.drop(columns=columns_to_drop[prefix])
        .rename(columns={0: "CNNscore", 3: "name", 4: "name"})
        .dropna()  # Last row contains NaN (it is actually a comment)
    )

    # NOTE(review): a non-numeric comment row presumably makes pandas parse the
    # whole score column as strings; convert it so that sorting by CNNscore is
    # numeric rather than lexicographic. This is a no-op for numeric columns.
    score["CNNscore"] = pd.to_numeric(score["CNNscore"])

    def getid(name):
        """Parse (pocket, protein, ligand, rank) from a pose file path."""
        namesplit = name.split("/")

        # Second path component is the pocket
        pocket = namesplit[1]

        # File name fields are separated by underscores
        split = namesplit[-1].split("_")
        protein = split[0]
        ligand = split[2]

        # Last field is one leading character followed by the rank and the
        # ".gninatypes" extension
        rank = int(split[-1][1:].replace(".gninatypes", ""))

        return (pocket, protein, ligand, rank)

    keys = ["pocket", "protein", "ligand", "rank"]
    ids = pd.DataFrame(
        [getid(name) for name in score["name"]], columns=keys, index=score.index
    )
    score = score.drop(columns="name").join(ids)

    return df.merge(score, on=keys)

In [4]:
#df_score = load_scores(df, model, prefix)
#df_score

In [5]:
def topN(df, nmax, by=["pocket"], n_pockets_expected=91):
    """
    Compute the TopN success rate for N = 1..nmax, averaged over groups.

    For every group of ``df`` (defined by ``by``) and every protein within
    that group, the poses are ranked in three ways:

    * smina: by ``score``, ascending;
    * gnina: by ``CNNscore``, descending;
    * best: by ``rmsd``, ascending.

    A protein counts as a success at N when at least one of its N top-ranked
    poses has ``annotation == 1``. Successes are turned into a percentage of
    proteins per group, and the percentages are then averaged over groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Poses with the columns ``protein``, ``score``, ``CNNscore``, ``rmsd``,
        ``annotation`` and the column(s) named in ``by``.
    nmax : int
        Largest N to evaluate.
    by : list of str
        Column(s) defining the groups to average over (never mutated here).
    n_pockets_expected : int or None
        Expected number of groups, used as a sanity check. ``None`` disables
        the check. The default matches the dataset of this notebook.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nmax, 4)`` with columns N, smina, gnina, best
        (percentages).

    Raises
    ------
    AssertionError
        If the number of groups differs from ``n_pockets_expected``.
    ValueError
        If the check is disabled and ``df`` contains no group at all.
    """

    def found_within_top(ranked):
        """Boolean array whose entry n-1 tells if the n first poses contain a good one."""
        good = ranked["annotation"].to_numpy()[:nmax] == 1
        # Running "any": True from the first good pose onwards
        found = np.logical_or.accumulate(good)
        if found.size < nmax:
            # Fewer poses than nmax: larger N look at the same poses, so the
            # last value simply carries over.
            last = found[-1] if found.size else False
            found = np.concatenate([found, np.full(nmax - found.size, last)])
        return found

    n_pockets = 0

    top_smina = np.zeros(nmax)
    top_gnina = np.zeros(nmax)
    top_best = np.zeros(nmax)

    for _, group in df.groupby(by):
        n_smina = np.zeros(nmax, dtype=int)
        n_gnina = np.zeros(nmax, dtype=int)
        n_best = np.zeros(nmax, dtype=int)

        n_targets = 0
        for _, tgroup in group.groupby("protein"):
            # At least one good pose amongst the top N, for each ranking
            n_smina += found_within_top(tgroup.sort_values(by="score", ascending=True))
            n_gnina += found_within_top(tgroup.sort_values(by="CNNscore", ascending=False))
            n_best += found_within_top(tgroup.sort_values(by="rmsd", ascending=True))

            n_targets += 1

        # Accumulate the percentage of successful targets of this group
        top_smina += n_smina / n_targets * 100
        top_gnina += n_gnina / n_targets * 100
        top_best += n_best / n_targets * 100

        n_pockets += 1

    # One pocket has been removed from the training set
    # for lack of actives
    if n_pockets_expected is not None and n_pockets != n_pockets_expected:
        # Raised explicitly so the check also runs under `python -O`
        raise AssertionError(
            f"Expected {n_pockets_expected} groups, found {n_pockets}"
        )
    if n_pockets == 0:
        raise ValueError("Cannot compute TopN: no groups found in df")

    # Return TopN of targets, averaged per pocket
    return np.column_stack(
        [
            np.arange(1, nmax + 1),
            top_smina / n_pockets,
            top_gnina / n_pockets,
            top_best / n_pockets,
        ]
    )

In [6]:
# Largest N for which TopN is evaluated (passed as `nmax` to topN below)
nmax = 10

In [9]:
# Compute TopN for the "nc" outputs of each "-noaffinity-nostratified" model
# and save one CSV with all poses and one CSV without the rank 0 poses.
for model in ["default2017", "default2018", "dense"]:
    prefix = "nc"
    modelname = f"{model}-noaffinity-nostratified"

    df_score = load_scores(df, modelname, prefix)

    for crystal in ["", "nocrystal_"]:
        if crystal == "":
            t = topN(df_score, nmax)
        else:
            # "nocrystal_": exclude rank 0 poses (presumably the crystal pose).
            # The previous test against "nocrystal" never matched, so the
            # unfiltered result was silently written to the nocrystal_ file.
            t = topN(df_score[df_score["rank"] != 0], nmax)

        df_top = pd.DataFrame(t, columns=["N", "smina", "gnina", "best"])
        df_top.to_csv(f"TopN/{crystal}{modelname}.csv", index=False)

In [None]:
# Compute TopN for the "cluster" outputs of each "-noaffinity" model and save
# one CSV with all poses and one CSV without the rank 0 poses.
for model in ["default2017", "default2018", "dense"]:
    prefix = "cluster"
    modelname = f"{model}-noaffinity"

    df_score = load_scores(df, modelname, prefix)

    for crystal in ["", "nocrystal_"]:
        if crystal == "":
            t = topN(df_score, nmax)
        else:
            # "nocrystal_": exclude rank 0 poses (presumably the crystal pose).
            # The previous test against "nocrystal" never matched, so the
            # unfiltered result was silently written to the nocrystal_ file.
            t = topN(df_score[df_score["rank"] != 0], nmax)

        df_top = pd.DataFrame(t, columns=["N", "smina", "gnina", "best"])
        df_top.to_csv(f"TopN/{crystal}{modelname}.csv", index=False)

In [None]:
def plot(top):
    """
    Draw the smina, gnina and best TopN curves on the current axes.

    ``top`` is indexed as a 2D array: column 0 provides the x values (N) and
    columns 1, 2 and 3 the smina, gnina and best percentages respectively.
    """
    ax = plt.gca()
    n_values = top[:, 0]

    # (legend label, matplotlib format string) for columns 1, 2 and 3
    curves = (("smina", "o-"), ("gnina", "o-"), ("best", "k--"))
    for column, (label, fmt) in enumerate(curves, start=1):
        ax.plot(n_values, top[:, column], fmt, label=label)

    ax.legend()
    ax.set_xlabel("N")
    ax.set_ylabel("TopN (%)")
    # NOTE(review): argument-less plot call draws nothing; possibly
    # plt.show() was intended — confirm before changing.
    ax.plot()

In [None]:
# TopN tables (crystal poses included, non-stratified models), one gnina
# column per model placed side by side.
topn_files = {
    "default2017": "TopN/default2017-noaffinity-nostratified.csv",
    "default2018": "TopN/default2018-noaffinity-nostratified.csv",
    "dense": "TopN/dense-noaffinity-nostratified.csv",
}

frames = []
for position, (column_name, csv_path) in enumerate(topn_files.items()):
    table = pd.read_csv(csv_path, index_col=0)
    if position > 0:
        # Keep the smina/best columns of the first file only
        table = table.drop(columns=["smina", "best"])
    frames.append(table.rename(columns={"gnina": column_name}))

crystal_nostratified = pd.concat(frames, axis=1)
crystal_nostratified

In [None]:
# Plot the TopN curves (crystal poses included, non-stratified models) and
# save the figure as PDF and PNG.
ax = sns.lineplot(data=crystal_nostratified)
# Explicit axis labels so the saved figure can be read on its own
ax.set(xlabel="N", ylabel="TopN (%)")
plt.savefig("plot/TopN-crystal-nostratified.pdf")
plt.savefig("plot/TopN-crystal-nostratified.png")

In [None]:
# TopN tables (crystal poses included, stratified models), one gnina column
# per model placed side by side.
topn_files = {
    "default2017": "TopN/default2017-noaffinity.csv",
    "default2018": "TopN/default2018-noaffinity.csv",
    "dense": "TopN/dense-noaffinity.csv",
}

frames = []
for position, (column_name, csv_path) in enumerate(topn_files.items()):
    table = pd.read_csv(csv_path, index_col=0)
    if position > 0:
        # Keep the smina/best columns of the first file only
        table = table.drop(columns=["smina", "best"])
    frames.append(table.rename(columns={"gnina": column_name}))

crystal_stratified = pd.concat(frames, axis=1)
crystal_stratified

In [None]:
# Plot the TopN curves (crystal poses included, stratified models) and save
# the figure as PDF and PNG.
ax = sns.lineplot(data=crystal_stratified)
# Explicit axis labels so the saved figure can be read on its own
ax.set(xlabel="N", ylabel="TopN (%)")
plt.savefig("plot/TopN-crystal-stratified.pdf")
plt.savefig("plot/TopN-crystal-stratified.png")

In [None]:
# TopN tables (nocrystal_ files, non-stratified models), one gnina column per
# model placed side by side.
topn_files = {
    "default2017": "TopN/nocrystal_default2017-noaffinity-nostratified.csv",
    "default2018": "TopN/nocrystal_default2018-noaffinity-nostratified.csv",
    "dense": "TopN/nocrystal_dense-noaffinity-nostratified.csv",
}

frames = []
for position, (column_name, csv_path) in enumerate(topn_files.items()):
    table = pd.read_csv(csv_path, index_col=0)
    if position > 0:
        # Keep the smina/best columns of the first file only
        table = table.drop(columns=["smina", "best"])
    frames.append(table.rename(columns={"gnina": column_name}))

nocrystal_nostratified = pd.concat(frames, axis=1)
nocrystal_nostratified

In [None]:
# Plot the TopN curves (nocrystal_ files, non-stratified models) and save the
# figure as PDF and PNG.
ax = sns.lineplot(data=nocrystal_nostratified)
# Explicit axis labels so the saved figure can be read on its own
ax.set(xlabel="N", ylabel="TopN (%)")
plt.savefig("plot/TopN-nocrystal-nostratified.pdf")
plt.savefig("plot/TopN-nocrystal-nostratified.png")

In [None]:
# TopN tables (nocrystal_ files, stratified models), one gnina column per
# model placed side by side.
topn_files = {
    "default2017": "TopN/nocrystal_default2017-noaffinity.csv",
    "default2018": "TopN/nocrystal_default2018-noaffinity.csv",
    "dense": "TopN/nocrystal_dense-noaffinity.csv",
}

frames = []
for position, (column_name, csv_path) in enumerate(topn_files.items()):
    table = pd.read_csv(csv_path, index_col=0)
    if position > 0:
        # Keep the smina/best columns of the first file only
        table = table.drop(columns=["smina", "best"])
    frames.append(table.rename(columns={"gnina": column_name}))

nocrystal_stratified = pd.concat(frames, axis=1)
nocrystal_stratified

In [None]:
# Plot the TopN curves (nocrystal_ files, stratified models) and save the
# figure as PDF and PNG.
ax = sns.lineplot(data=nocrystal_stratified)
# Explicit axis labels so the saved figure can be read on its own
ax.set(xlabel="N", ylabel="TopN (%)")
plt.savefig("plot/TopN-nocrystal-stratified.pdf")
plt.savefig("plot/TopN-nocrystal-stratified.png")