In [1]:
import pandas as pd
from rdkit import Chem

def replace_first_space(input_file, output_file):
    """Copy a text file, replacing the first space on each line with ';'.

    Only the first space of a line is converted; any later spaces are kept,
    so the output can be split into exactly two fields on ';'.  Lines that
    contain no space at all (including blank lines) are copied unchanged.

    Args:
        input_file: Path of the text file to read.
        output_file: Path of the file to write; it is created or overwritten.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # str.replace with count=1 touches only the first space and is a
            # no-op when there is none.  The earlier find()-based slicing used
            # the -1 "not found" result as an index, which wrote the line
            # content twice around a ';' for space-free lines.
            outfile.write(line.replace(' ', ';', 1))

def eli_lilly_filter(flagreason: str):
    """Decide whether a flag reason hits one of the excluded rule names.

    The check is a plain substring test of every rule name below against the
    reason text, so a rule embedded in a longer, ':'-joined reason matches too.

    Args:
        flagreason: Reason text attached to a flagged molecule.  Non-string
            values (for example the NaN pandas uses for an empty field) are
            accepted and treated as matching no rule.

    Returns:
        None when at least one excluded rule name occurs in ``flagreason``,
        otherwise 0.  The notebook stores the result in a column and relies
        on ``dropna()`` to discard the rows that received None.
    """
    excluded_rules = (
        "no_interesting_atoms",
        "too_few_rings",
        "too_many_aromatic_rings_in_ring_system",
        "ring_system_too_large",
        "ring_system_too_large_with_aromatic",
        "fmoc",
        "positive",
        "negative",
        "too_long_carbon_chain",
        "diphosphate",
        "quaternary_amine",  # was listed twice; one entry is sufficient
        "sulfonic_acid",
        "crown_2_2_cyclic",
        "crown_3_3_cyclic",
        "crown_2_3_cyclic",
        "crown_2_2",
        "crown_3_3",
        "crown_2_3",
    )
    # A missing reason reaches this function as a float NaN (or None); the
    # substring test below would raise TypeError on it, so bail out first.
    if not isinstance(flagreason, str):
        return 0
    if any(rule in flagreason for rule in excluded_rules):
        return None
    return 0

In [2]:
# Convert the four raw output files (bad0 .. bad3) so that the first space on
# every line becomes ';', writing each result under ./parsed_outputs/.
for file_no in range(4):
    replace_first_space(
        "./original_outputs/bad{}.smi".format(file_no),
        "./parsed_outputs/parsed_smi_{}.smi".format(file_no),
    )

In [3]:
# Read the four ';'-delimited files with the same two column names.  The
# individual frames keep their original names (df0 .. df3) and are then
# stacked in file order; the per-file row index is left as-is by concat.
df0, df1, df2, df3 = (
    pd.read_csv(
        "./parsed_outputs/parsed_smi_{}.smi".format(file_no),
        delimiter=";",
        names=["SMILES", "FlagReason"],
    )
    for file_no in range(4)
)
df = pd.concat([df0, df1, df2, df3])
df

Unnamed: 0,SMILES,FlagReason
0,C#CC1=CC=C(OCC2=N[N]C=C2C2=CC=CC3=C2NC(=C3CCCO...,TP1 abnormal_valence
1,C1=CC(=S)SS1,TP1 no_interesting_atoms
2,C1=CSC(=S)S1,TP1 no_interesting_atoms
3,C1C2=C(C(=S)SS2)CC1,TP1 no_interesting_atoms
4,OB(O)C1=CC(=C(C2=CNC3=NC=C(C4=CN=CC(=C4)C(=O)N...,TP1 non_allowed_atom
...,...,...
21,C1=NN2C=CC(=NC2=C1C(=O)NC1=CN(N=C1C(F)F)C1CCC(...,: D(130) too_many_rings:acetylene
22,OC(=O)[C@@H](CC1=CC=CC(=C1)CN(CC1=CC=CC(=C1)C[...,: D(250) positive:negative
23,OC[C@H]1O[C@@H](O[C@@H]2OC=C3C(=O)OCC[C@@]3(O)...,: D(110) acetal_1_in_ring:michael_demerited
24,C1=CC(=CC2=C1OCCO2)C1=NN(C(=O)C)[C@@H](C2=CC=C...,: D(100) nitro:acyl_hydrazone_cyclic


In [4]:
# Mark rows whose reason hits an excluded rule (the filter yields None for
# those) and derive an InChI string for every SMILES.  dropna() then discards
# every row holding a missing value, after which the helper column is removed
# and each surviving row is labelled as a positive prediction.
df = (
    df.assign(
        remove=lambda frame: frame["FlagReason"].apply(eli_lilly_filter),
        InChI=lambda frame: frame["SMILES"].apply(
            lambda smiles: Chem.MolToInchi(Chem.MolFromSmiles(smiles))
        ),
    )
    .dropna()
    .drop(columns="remove")
    .assign(pred=1)
)
df










































































































Unnamed: 0,SMILES,FlagReason,InChI,pred
0,C#CC1=CC=C(OCC2=N[N]C=C2C2=CC=CC3=C2NC(=C3CCCO...,TP1 abnormal_valence,InChI=1S/C35H26N3O5/c1-2-22-15-16-25(18-24(22)...,1
4,OB(O)C1=CC(=C(C2=CNC3=NC=C(C4=CN=CC(=C4)C(=O)N...,TP1 non_allowed_atom,InChI=1S/C23H21BN4O5/c1-28(2)23(30)15-4-13(8-2...,1
5,OB1OC2=C(C(=O)O)C=CC=C2C[C@@H]1NC(=O)CC1CCC(NC...,TP1 non_allowed_atom,InChI=1S/C19H28BN3O5/c21-8-9-22-14-6-4-12(5-7-...,1
6,OB1O[C@H](CN)C2=C1C(=CC=C2Cl)OCCO,TP1 non_allowed_atom,InChI=1S/C10H13BClNO4/c12-6-1-2-7(16-4-3-14)10...,1
7,OB1OCC2=C1C(=CC(=C2)N(C1=C(C2CC2)C=C2C(=C(C3=C...,TP1 non_allowed_atom,InChI=1S/C27H23BClFN2O6S/c1-31-27(33)24-20-11-...,1
...,...,...,...,...
18,C1=CC(=CC=C1Br)S(=O)(=O)N[C@H](C1=NNC(=O)O1)[C...,: D(114) acyl_isoamide_aromatic:bromine:hydraz...,InChI=1S/C21H18BrN3O4S/c1-13(17-8-4-6-14-5-2-3...,1
21,C1=NN2C=CC(=NC2=C1C(=O)NC1=CN(N=C1C(F)F)C1CCC(...,: D(130) too_many_rings:acetylene,InChI=1S/C44H49F2N11O6/c1-52-39-27(4-2-6-34(39...,1
23,OC[C@H]1O[C@@H](O[C@@H]2OC=C3C(=O)OCC[C@@]3(O)...,: D(110) acetal_1_in_ring:michael_demerited,InChI=1S/C16H22O10/c1-2-7-14(24-6-8-13(21)23-4...,1
24,C1=CC(=CC2=C1OCCO2)C1=NN(C(=O)C)[C@@H](C2=CC=C...,: D(100) nitro:acyl_hydrazone_cyclic,InChI=1S/C18H15N3O6/c1-11(22)20-18(12-2-5-14(6...,1


In [5]:
# Load the labelled test set; the InChI and covalent columns are used by the
# merge further down.
test_data_df = pd.read_csv(
    filepath_or_buffer="../data/SMILES_test/test_data_all.csv"
)
test_data_df

Unnamed: 0,SMILES,InChI,warhead_category,covalent
0,[H]C#CN([H])C(=O)c1nc(N2C([H])([H])C([H])([H])...,InChI=1S/C25H26N8O3/c1-2-26-24(36)20-13-22(28-...,aldehyde,1
1,[H]C#CN([H])C(=O)c1nc(N2C([H])([H])C([H])([H])...,InChI=1S/C25H26N8O3/c1-2-26-24(36)20-13-22(28-...,aldehyde,1
2,[H]Oc1c([H])c(OC([H])([H])[H])c(-c2c([H])n([H]...,InChI=1S/C23H20N4O4/c1-27(2)23(30)15-4-13(8-24...,aldehyde,1
3,[H]C(=O)c1c([H])c([H])c(S(=O)(=O)N2C([H])([H])...,InChI=1S/C13H17NO4S/c1-10-7-14(8-11(2)18-10)19...,aldehyde,1
4,[H]C(=O)c1nc2c(C(=O)N([H])c3nc([H])c(C(F)(F)F)...,"InChI=1S/C17H10F3N3O2/c18-17(19,20)11-5-7-14(2...",aldehyde,1
...,...,...,...,...
605,[H]C([H])([H])C1=NC([H])([H])C([H])([H])C1([H]...,"InChI=1S/C5H9N/c1-5-3-2-4-6-5/h2-4H2,1H3",noncovalentdecoy,0
606,[H]OC([H])([H])[C@@]1([H])O[C@]1(C([H])([H])[H...,InChI=1S/C10H18O2/c1-8(2)5-4-6-10(3)9(7-11)12-...,noncovalentdecoy,0
607,[H]OC([H])([H])/C([H])=C(\C([H])([H])[H])C([H]...,"InChI=1S/C10H18O2/c1-8(6-7-11)4-5-9-10(2,3)12-...",noncovalentdecoy,0
608,[H]OC([H])([H])C([H])([H])[C@]([H])(C([H])([H]...,"InChI=1S/C10H20O2/c1-8(6-7-11)4-5-9-10(2,3)12-...",noncovalentdecoy,0


In [6]:
# Left-join the flagged molecules onto the test set so every test row is kept.
# The flagged frame is first reduced to one row per InChI (the first occurrence
# and its FlagReason win): a repeated key on the right side would duplicate the
# matching test rows and distort the precision/recall computed from df_final.
# validate="many_to_one" makes pandas raise if the right-hand keys are ever not
# unique, instead of silently multiplying rows.
df_final = test_data_df.merge(
    df.drop_duplicates(subset="InChI"),
    how="left",
    on=["InChI"],
    validate="many_to_one",
)[["InChI", "covalent", "FlagReason", "pred"]]
# Test molecules without a match in the flagged frame get prediction 0.
df_final["pred"] = df_final["pred"].fillna(0)
df_final

Unnamed: 0,InChI,covalent,FlagReason,pred
0,InChI=1S/C25H26N8O3/c1-2-26-24(36)20-13-22(28-...,1,(1 matches to 'acetylene_heteroatom'),1.0
1,InChI=1S/C25H26N8O3/c1-2-26-24(36)20-13-22(28-...,1,(1 matches to 'acetylene_heteroatom'),1.0
2,InChI=1S/C23H20N4O4/c1-27(2)23(30)15-4-13(8-24...,1,(1 matches to 'aldehyde'),1.0
3,InChI=1S/C13H17NO4S/c1-10-7-14(8-11(2)18-10)19...,1,(1 matches to 'aldehyde'),1.0
4,"InChI=1S/C17H10F3N3O2/c18-17(19,20)11-5-7-14(2...",1,(1 matches to 'aldehyde'),1.0
...,...,...,...,...
605,"InChI=1S/C5H9N/c1-5-3-2-4-6-5/h2-4H2,1H3",0,,0.0
606,InChI=1S/C10H18O2/c1-8(2)5-4-6-10(3)9(7-11)12-...,0,(1 matches to 'het_3mem_ring'),1.0
607,"InChI=1S/C10H18O2/c1-8(6-7-11)4-5-9-10(2,3)12-...",0,(1 matches to 'het_3mem_ring'),1.0
608,"InChI=1S/C10H20O2/c1-8(6-7-11)4-5-9-10(2,3)12-...",0,(1 matches to 'het_3mem_ring'),1.0


In [7]:
from sklearn.metrics import precision_score, recall_score

# (precision, recall) of the pred column measured against the covalent labels.
tuple(
    metric(df_final.covalent, df_final.pred)
    for metric in (precision_score, recall_score)
)




(0.8666666666666667, 0.7665094339622641)