In [1]:
import pandas as pd
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

def replace_first_space(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            first_space_index = line.find(' ')
            modified_line = line[:first_space_index] + ';' + line[first_space_index + 1:]
            outfile.write(modified_line)

def eli_lilly_filter(flagreason: str):
    rules = [
        "no_interesting_atoms",
        "too_few_rings",
        "too_many_rings",
        "biotin",
        "too_many_aromatic_rings_in_ring_system",
        "ring_system_too_large",
        "ring_system_too_large_with_aromatic",
        "fmoc",
        "positive",
        "negative",
        "too_long_carbon_chain",
        "diphosphate",
        "quaternary_amine",
        "sulfonic_acid",
        "quaternary_amine",
        "crown_2_2_cyclic",
        "crown_3_3_cyclic",
        "crown_2_3_cyclic",
        "crown_2_2",
        "crown_3_3",
        "crown_2_3",
        "phenylenediamine",
        "LongCChain"
             ]
    for i in rules:
        if i in flagreason:
            return None
    return 0

In [2]:
replace_first_space("./original_outputs/bad0.smi", "./parsed_outputs/parsed_smi_0.smi")
replace_first_space("./original_outputs/bad1.smi", "./parsed_outputs/parsed_smi_1.smi")
replace_first_space("./original_outputs/bad2.smi", "./parsed_outputs/parsed_smi_2.smi")
replace_first_space("./original_outputs/bad3.smi", "./parsed_outputs/parsed_smi_3.smi")
replace_first_space("./original_outputs/good.smi", "./parsed_outputs/good.smi")

In [3]:
df0 = pd.read_csv("./parsed_outputs/parsed_smi_0.smi",
                  delimiter=";", names=["SMILES", "FlagReason"])
df1 = pd.read_csv("./parsed_outputs/parsed_smi_1.smi",
                  delimiter=";", names=["SMILES", "FlagReason"])
df2 = pd.read_csv("./parsed_outputs/parsed_smi_2.smi",
                  delimiter=";", names=["SMILES", "FlagReason"])
df3 = pd.read_csv("./parsed_outputs/parsed_smi_3.smi",
                  delimiter=";", names=["SMILES", "FlagReason"])
df4 = pd.read_csv("./parsed_outputs/good.smi",
                  delimiter=";", names=["SMILES", "FlagReason"])

df_pos = pd.concat([df0, df1, df2, df3])
df_pos["pred"] = 1
df_neg = df4
df_neg["pred"] = 0
df = pd.concat([df_pos, df_neg])
df

Unnamed: 0,SMILES,FlagReason,pred
0,C#CC1=CC=C(OCC2=N[N]C=C2C2=CC=CC3=C2NC(=C3CCCO...,TP1 abnormal_valence,1
1,C1=CC(=S)SS1,TP1 no_interesting_atoms,1
2,C1=CSC(=S)S1,TP1 no_interesting_atoms,1
3,C1C2=C(C(=S)SS2)CC1,TP1 no_interesting_atoms,1
4,OB(O)C1=CC(=C(C2=CNC3=NC=C(C4=CN=CC(=C4)C(=O)N...,TP1 non_allowed_atom,1
...,...,...,...
176,NC(=O)N1C2=CC=CC=C2[C@@H]2O[C@@H]2C2=CC=CC=C12,: D(50) het_3mem_ring_fused,0
177,C1=CC=C2C(=C1)C(=C1CCN(C)CC1)C1=CC=CC=C1[C@H]1...,: D(50) het_3mem_ring_fused,0
178,C1=CC=C2C(=C1)N=CC(=O)N2,C1=CC=C2C(=C1)N=CC(=O)N2,0
179,C#CCN1C(=O)C=NC2=CC=CC=C12,: D(50) acetylene,0


In [4]:
df["remove"] = df.FlagReason.apply(eli_lilly_filter)
df = df.dropna().drop("remove", axis=1)
df["InChI"] = df.SMILES.apply(lambda x: Chem.MolToInchi(Chem.MolFromSmiles(x)))
df

Unnamed: 0,SMILES,FlagReason,pred,InChI
0,C#CC1=CC=C(OCC2=N[N]C=C2C2=CC=CC3=C2NC(=C3CCCO...,TP1 abnormal_valence,1,InChI=1S/C35H26N3O5/c1-2-22-15-16-25(18-24(22)...
4,OB(O)C1=CC(=C(C2=CNC3=NC=C(C4=CN=CC(=C4)C(=O)N...,TP1 non_allowed_atom,1,InChI=1S/C23H21BN4O5/c1-28(2)23(30)15-4-13(8-2...
5,OB1OC2=C(C(=O)O)C=CC=C2C[C@@H]1NC(=O)CC1CCC(NC...,TP1 non_allowed_atom,1,InChI=1S/C19H28BN3O5/c21-8-9-22-14-6-4-12(5-7-...
6,OB1O[C@H](CN)C2=C1C(=CC=C2Cl)OCCO,TP1 non_allowed_atom,1,InChI=1S/C10H13BClNO4/c12-6-1-2-7(16-4-3-14)10...
7,OB1OCC2=C1C(=CC(=C2)N(C1=C(C2CC2)C=C2C(=C(C3=C...,TP1 non_allowed_atom,1,InChI=1S/C27H23BClFN2O6S/c1-31-27(33)24-20-11-...
...,...,...,...,...
176,NC(=O)N1C2=CC=CC=C2[C@@H]2O[C@@H]2C2=CC=CC=C12,: D(50) het_3mem_ring_fused,0,InChI=1S/C15H12N2O2/c16-15(18)17-11-7-3-1-5-9(...
177,C1=CC=C2C(=C1)C(=C1CCN(C)CC1)C1=CC=CC=C1[C@H]1...,: D(50) het_3mem_ring_fused,0,InChI=1S/C21H21NO/c1-22-12-10-14(11-13-22)19-1...
178,C1=CC=C2C(=C1)N=CC(=O)N2,C1=CC=C2C(=C1)N=CC(=O)N2,0,InChI=1S/C8H6N2O/c11-8-5-9-6-3-1-2-4-7(6)10-8/...
179,C#CCN1C(=O)C=NC2=CC=CC=C12,: D(50) acetylene,0,InChI=1S/C11H8N2O/c1-2-7-13-10-6-4-3-5-9(10)12...


In [5]:
test_data_df = pd.read_csv("../data/SMILES_test/test_data_all.csv")
test_data_df["InChI"] = test_data_df.SMILES.apply(lambda x: Chem.MolToInchi(Chem.MolFromSmiles(x)))
test_data_df = test_data_df[["InChI", "warhead_category", "covalent"]]

In [6]:
df_final = df.merge(test_data_df, how="left", on=["InChI"])
# [["InChI", "covalent", "FlagReason", "pred"]]
# df_final.pred = df_final.pred.fillna(0)
# df_final = df_final.dropna()
df_final = df_final.dropna()
df_final

Unnamed: 0,SMILES,FlagReason,pred,InChI,warhead_category,covalent
0,C#CC1=CC=C(OCC2=N[N]C=C2C2=CC=CC3=C2NC(=C3CCCO...,TP1 abnormal_valence,1,InChI=1S/C35H26N3O5/c1-2-22-15-16-25(18-24(22)...,alkyne,1.0
1,OB(O)C1=CC(=C(C2=CNC3=NC=C(C4=CN=CC(=C4)C(=O)N...,TP1 non_allowed_atom,1,InChI=1S/C23H21BN4O5/c1-28(2)23(30)15-4-13(8-2...,boronic,1.0
2,OB1OC2=C(C(=O)O)C=CC=C2C[C@@H]1NC(=O)CC1CCC(NC...,TP1 non_allowed_atom,1,InChI=1S/C19H28BN3O5/c21-8-9-22-14-6-4-12(5-7-...,boronic,1.0
3,OB1O[C@H](CN)C2=C1C(=CC=C2Cl)OCCO,TP1 non_allowed_atom,1,InChI=1S/C10H13BClNO4/c12-6-1-2-7(16-4-3-14)10...,boronic,1.0
4,OB1OCC2=C1C(=CC(=C2)N(C1=C(C2CC2)C=C2C(=C(C3=C...,TP1 non_allowed_atom,1,InChI=1S/C27H23BClFN2O6S/c1-31-27(33)24-20-11-...,boronic,1.0
...,...,...,...,...,...,...
558,NC(=O)N1C2=CC=CC=C2[C@@H]2O[C@@H]2C2=CC=CC=C12,: D(50) het_3mem_ring_fused,0,InChI=1S/C15H12N2O2/c16-15(18)17-11-7-3-1-5-9(...,noncovalentdecoy,0.0
559,C1=CC=C2C(=C1)C(=C1CCN(C)CC1)C1=CC=CC=C1[C@H]1...,: D(50) het_3mem_ring_fused,0,InChI=1S/C21H21NO/c1-22-12-10-14(11-13-22)19-1...,noncovalentdecoy,0.0
560,C1=CC=C2C(=C1)N=CC(=O)N2,C1=CC=C2C(=C1)N=CC(=O)N2,0,InChI=1S/C8H6N2O/c11-8-5-9-6-3-1-2-4-7(6)10-8/...,noncovalentdecoy,0.0
561,C#CCN1C(=O)C=NC2=CC=CC=C12,: D(50) acetylene,0,InChI=1S/C11H8N2O/c1-2-7-13-10-6-4-3-5-9(10)12...,noncovalentdecoy,0.0


In [7]:
from sklearn.metrics import precision_score, recall_score
(
    precision_score(df_final.covalent, df_final.pred),
    recall_score(df_final.covalent, df_final.pred),
)




(0.8695652173913043, 0.8695652173913043)

In [16]:
import tensorflow as tf
from helpers import encoder
model_file = "../saved_models/GCNII"
model = tf.keras.models.load_model(model_file)
gnn_pred = model.predict(encoder(df_final.InChI.values))
df_final["gnn_pred"] = tf.round(gnn_pred)



In [17]:
(
    precision_score(df_final.covalent, df_final.gnn_pred),
    recall_score(df_final.covalent, df_final.gnn_pred),
)

(0.8915254237288136, 0.7146739130434783)