In [1]:
import pandas as pd

In [3]:
np_df = pd.read_csv("C:\\Users\\Kalki\\Documents\\code\\dfs\\experiments\\lstm-50-epochs-48-batch-size-no-penalty\\lstm-50-epochs-48-batch-size-no-penalty\\results\\samples\\generated_molecules.csv")

In [7]:
np_df["decoded_text"] = np_df["decoded_text"].apply(lambda x: x.replace(" ", ""))

In [8]:
np_df.head()

Unnamed: 0,sequence_id,decoded_text,sequence_length,generation_time,starter_text
0,0,[1*]C(=O)C([4*])C.[1*]C(=O)C(C)OCC(O)CO[3*]|[1...,257,1.141682,
1,1,[1*]C(=O)CCCCC[4*].[1*]C(=O)C(O)C(C)C(=O)OC(=O...,269,1.037681,
2,2,[1*]C(=O)CCCCC[4*].[1*]C(=O)C(C)NC(=O)CCCCCNCN...,297,1.432173,
3,3,[1*]C(=O)CCCCC[4*].[1*]C(=O)CNCNCC(=O)O[3*]|[3...,255,1.119519,
4,4,[1*]C(=O)CCC([1*])=O.[1*]C(=O)CCCCCCCCC(=O)NCN...,265,0.89967,


In [None]:
from rdkit import Chem
from rdkit.Chem import rdChemReactions

class StringValidator:
    """
    Example -> [5*]NCN[5*].*Nc1ccc(NCCC[4*])cc1|[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[#6]):1]-&!@[$([N&!D1&!$(N=*)&!$(N-[!#6&!#16&!#0&!#1])&!$([N&R]@[C&R]=O)]):2]|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5 

    Part 1 -> SMILES ->  [5*]NCN[5*].*Nc1ccc(NCCC[4*])cc1
    Part 2 -> Reaction SMARTS ->  [4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[#6]):1]-&!@[$([N&!D1&!$(N=*)&!$(N-[!#6&!#16&!#0&!#1])&!$([N&R]@[C&R]=O)]):2]
    Part 3 -> Weight Distribution -> |0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5

    Validation Layer:

    1. Grammar Validation
        - Check for valid SMILES, SMARTS syntax
        - Ensure correct use of atom and bond symbols
        - Ensure correct weight variation is assigned  
    2. Syntax Validation
        - Ensure right indexing from the reaction components
        - Check for correct use of brackets and parentheses and transformations
        - Check for placement of "|" in the final representation
    """

    def __init__(self, weight_variations=None):
        if weight_variations is None:
            weight_variations = [
                "|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5",
                "|0.5|0.5|<1-2:0.375:0.375<1-1:0.375:0.375<2-2:0.375:0.375<3-4:0.375:0.375<3-3:0.375:0.375<4-4:0.125:0.125<1-3:0.125:0.125<1-4:0.125:0.125<2-3:0.125:0.125<2-4:0.125:0.125"
            ]
        self.weight_variations = weight_variations

    @staticmethod
    def parse_parts(value: str):
        split_values = value.split("|")
        smiles_part = split_values[0]
        smarts_part = split_values[1]
        weight_part = "|" + "|".join(split_values[2:])
        return smiles_part, smarts_part, weight_part

    def _validate_smiles_grammar_candidate(self, str_part: str) -> None:
        try:
            Chem.MolFromSmiles(str_part)
        except Exception as e:
            print("Invalid SMILES:", str_part)
            raise e

    def _validate_smarts_reaction(self, str_part: str) -> None:
        rxn = rdChemReactions.ReactionFromSmarts(str_part) 
        if rxn is None:
            raise ValueError("Invalid Reaction SMARTS")

    def _validate_grammar(self, candidate: str) -> None:
        smiles_section, smarts_reaction_section, weight_val = StringValidator.parse_parts(candidate)
        if weight_val not in self.weight_variations:
            raise ValueError("Invalid Weight Variation")
        self._validate_smiles_grammar_candidate(smiles_section)
        self._validate_smarts_reaction(smarts_reaction_section)

    def _validate_syntax(self, candidate: str) -> None:
        ...

    def _validate_mapping(self, candidate: str) -> None:
        ...

    def validate(self, candidate: str):
        try:
            self._validate_grammar(candidate)
            # self._validate_syntax(candidate)
            # self._validate_mapping(candidate)
            return True
        except Exception as e:
            print("Validation Error:", e)
            return False

In [34]:
validator = StringValidator()
valid_count = 0
for idx, row in np_df.iterrows():
    candidate_str = row["decoded_text"]
    is_valid = validator.validate(candidate_str)
    if not is_valid:
        print(f"Invalid candidate at index {idx}: {candidate_str}")
    else:
        valid_count += 1
print(f"Total valid candidates: {valid_count}")

smiles section  [1*]C(=O)C([4*])C.[1*]C(=O)C(C)OCC(O)CO[3*]
smarts reaction section >> [1*]-[*:1].[3*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8])=O):1]-&!@[$([O&D2]-&!@[#0,#6,#1]):2]
weight val >> |0.5|0.5|<1-2:0.375:0.375<1-1:0.375:0.375<2-2:0.375:0.375<3-4:0.375:0.375<3-3:0.375:0.375<4-4:0.125:0.125<1-3:0.125:0.125<1-4:0.125:0.125<2-3:0.125:0.125<2-4:0.125:0.125
weight variations >> ['|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5', '|0.5|0.5|<1-2:0.375:0.375<1-1:0.375:0.375<2-2:0.375:0.375<3-4:0.375:0.375<3-3:0.375:0.375<4-4:0.125:0.125<1-3:0.125:0.125<1-4:0.125:0.125<2-3:0.125:0.125<2-4:0.125:0.125']
smiles section  [1*]C(=O)CCCCC[4*].[1*]C(=O)C(O)C(C)C(=O)OC(=O)C([3*])O
smarts reaction section >> [3*]-[*:1].[4*]-[*:2]>>[$([O&D2]-&!@[#0,#6,#1]):1]-&!@[$([C&!D1&!$(C=*)]-&!@[#6]):2]
weight val >> |0.5|0.5|<1-2:0.375:0.375<1-1:0.375:0.375<2-2:0.375:0.375<3-4:0.375:0.375<3-3:0.375:0.375<4-4:0.125:0.125<1-3:0.125:0.125<1-4:0.125:0.125<2-3:0.125:0.125<2-4:0.125:0.125
weight variations >>

In [35]:
validator.validate("[5*]NCN[5*].*Nc1ccc(NCCC[4*])cc1|[4*][*:1].[5*][*:2]>>[*:1][*:2]|[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[#6]):1]-&!@[$([N&!D1&!$(N=*)&!$(N-[!#6&!#16&!#0&!#1])&!$([N&R]@[C&R]=O)]):2]|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5")

smiles section  [5*]NCN[5*].*Nc1ccc(NCCC[4*])cc1
smarts reaction section >> [4*][*:1].[5*][*:2]>>[*:1][*:2]
weight val >> |[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[#6]):1]-&!@[$([N&!D1&!$(N=*)&!$(N-[!#6&!#16&!#0&!#1])&!$([N&R]@[C&R]=O)]):2]|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5
weight variations >> ['|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5', '|0.5|0.5|<1-2:0.375:0.375<1-1:0.375:0.375<2-2:0.375:0.375<3-4:0.375:0.375<3-3:0.375:0.375<4-4:0.125:0.125<1-3:0.125:0.125<1-4:0.125:0.125<2-3:0.125:0.125<2-4:0.125:0.125']
Validation Error: Invalid Weight Variation


False

In [28]:
|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5
|0.5|0.5|<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5

SyntaxError: invalid syntax (3036539251.py, line 1)