In [1]:
from rdkit import Chem
import joblib

In [2]:
at_repo = joblib.load("./finals/brics_iter_at.pkl")
smiles_repo = joblib.load("./finals/brics_iter_smiles.pkl")
raw_repo = joblib.load("./finals/brics_iter.pkl")

In [3]:
print("length of at_repo:", len(at_repo))
print("length of smiles_repo:", len(smiles_repo))
print("length of raw_repo:", len(raw_repo))

length of at_repo: 132500
length of smiles_repo: 7590
length of raw_repo: 106749


In [4]:
at_repo = [x for x in at_repo if type(x) is str]
smiles_repo = [x for x in smiles_repo if type(x) is str]
raw_repo = [x for x in raw_repo if type(x) is str]

In [5]:
print("length of at_repo:", len(at_repo))
print("length of smiles_repo:", len(smiles_repo))
print("length of raw_repo:", len(raw_repo))

length of at_repo: 122428
length of smiles_repo: 6970
length of raw_repo: 106056


In [8]:
def parser(str_val):
    values = str_val.split("|")
    return (values[2], values[4], values[6], values[7], values[10], values[11])

In [9]:
parser(at_repo[0])

('[3*]O[3*]',
 '[4*]CCCCCC(=O)[At]',
 '[3*][*:1].[4*][*:2]>>[*:1][*:2] ',
 'atomProp:1.molAtomMapNumber.1:3.molAtomMapNumber.2:4.molAtomMapNumber.1:5.molAtomMapNumber.2',
 '[3*]-[*:1].[4*]-[*:2]>>[$([O&D2]-&!@[#0,#6,#1]):1]-&!@[$([C&!D1&!$(C=*)]-&!@[#6]):2] ',
 'atomProp:1.molAtomMapNumber.1:3.molAtomMapNumber.2:4.molAtomMapNumber.1:5.molAtomMapNumber.2')

In [10]:
at_repo = list(map(parser, at_repo))
smiles_repo = list(map(parser, smiles_repo))
raw_repo = list(map(parser, raw_repo))

In [49]:
# every data in the repo datasets contain this form of data
# ('[3*]O[3*]',
#  '[4*]CCCCCC(=O)[At]',
#  '[3*][*:1].[4*][*:2]>>[*:1][*:2] ',
#  'atomProp:1.molAtomMapNumber.1:3.molAtomMapNumber.2:4.molAtomMapNumber.1:5.molAtomMapNumber.2',
#  '[3*]-[*:1].[4*]-[*:2]>>[$([O&D2]-&!@[#0,#6,#1]):1]-&!@[$([C&!D1&!$(C=*)]-&!@[#6]):2] ',
#  'atomProp:1.molAtomMapNumber.1:3.molAtomMapNumber.2:4.molAtomMapNumber.1:5.molAtomMapNumber.2')
# create pandas dataframe for each repo with columns : fragment, seed, reaction_smiles, reaction_smiles_outro, reaction_smarts, reaction_smarts_outro

import pandas as pd
at_df = pd.DataFrame(at_repo, columns=["fragment", "seed", "reaction_smiles", "reaction_smiles_outro", "reaction_smarts", "reaction_smarts_outro"])
smiles_df = pd.DataFrame(smiles_repo, columns=["fragment", "seed", "reaction_smiles", "reaction_smiles_outro", "reaction_smarts", "reaction_smarts_outro"])
raw_df = pd.DataFrame(raw_repo, columns=["fragment", "seed", "reaction_smiles", "reaction_smiles_outro", "reaction_smarts", "reaction_smarts_outro"])

In [50]:
at_df.shape, smiles_df.shape, raw_df.shape

((122428, 6), (6970, 6), (106056, 6))

In [51]:
def at_repo_converter(df):
    """
    This will act as a mapping function where you will take the fragment and seed columns value and remove the "[At]" text part and assign "*" in them 
    
    use this logic to remove the indexed wildcard 
    
    if re.search(r"\[\d+\*]", s):
        s = re.sub(r"\[\d+\*]", "*", s)
    
    and place a simple "*" in place of it.
    and join both the string values with a dot (.) in between. and add a column smiles and place the resultant value in it. and keep the rest of the columns as it is.
    """
    df["smiles"] = df["fragment"].str.replace(r"\[\d+\*]", "*", regex=True) + "." + df["seed"].str.replace(r"\[\d+\*]", "*", regex=True)
    df["smiles"] = df["smiles"].str.replace("[At]", "*", regex=False)
    return df

  """


In [77]:
def wildcard_converter(df):
    df["smiles"] = df["fragment"].str.replace(r"\[\d+\*]", "*", regex=True) + "." + df["seed"].str.replace(r"\[\d+\*]", "*", regex=True)
    return df

In [None]:
at_df = at_repo_converter(at_df)

In [58]:
def filter_non_single_atom(smiles_value):
    """
    This will filter the dataframe to remove the candidates which have a split that contains a single atom in the smiles column.
    for example.

    "*O*.*CCC*" should be removed as it contains a single atom "O" in the smiles column.
    "*CCCC*.*F*" should be removed as it contains a single atom "F" in the smiles column. 
    """
    splits = smiles_value.split(".")
    if len(splits) == 2 and any(len(split) == 3 for split in splits):
        return False 
    return True 

In [60]:
at_non_single_atom_df = at_df[at_df["smiles"].apply(filter_non_single_atom)]

In [62]:
def filter_exact_two_wildcards(smiles_value):
    """
    This will filter the dataframe to remove the candidates which have a split that contains exactly two wildcards in the smiles column.
    for example.

    "*O*.*CCC*" should be removed as it contains exactly two wildcards in the smiles column.
    "*CCCC*.*F*" should be removed as it contains exactly two wildcards in the smiles column. 
    """
    splits = smiles_value.split(".")
    if len(splits) == 2 and splits[0].count("*") == 2 and splits[1].count("*") == 2:
        return True 
    return False 

In [65]:
filter_exact_two_wildcards("*C(*)=O.*NCNCCCCCC(=O)*")

True

In [63]:
at_valid_poly = at_non_single_atom_df[at_non_single_atom_df["smiles"].apply(filter_exact_two_wildcards)]

In [64]:
at_valid_poly.head()

Unnamed: 0,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro,smiles
3,[5*]NCN[5*],[4*]CCCCCC(=O)[At],[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,*NCN*.*CCCCCC(=O)*
5,[1*]C([6*])=O,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,*C(*)=O.*NCNCCCCCC(=O)*
7,[1*]C(=O)C(C)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,*C(=O)C(C)*.*NCNCCCCCC(=O)*
8,[1*]C(=O)CCCCCCCCCCC(=O)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,*C(=O)CCCCCCCCCCC(=O)*.*NCNCCCCCC(=O)*
9,[1*]C(=O)C([4*])C,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,*C(=O)C(*)C.*NCNCCCCCC(=O)*


In [66]:
# ensure at_valid_poly "smiles" column should have unique values only
at_valid_poly = at_valid_poly.drop_duplicates(subset=["smiles"])

In [68]:
at_valid_poly.shape

(20447, 7)

In [70]:
# take smiles column at the frist place in the df
at_valid_poly = at_valid_poly[["smiles"] + [col for col in at_valid_poly.columns if col != "smiles"]]

In [71]:
at_valid_poly.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
3,*NCN*.*CCCCCC(=O)*,[5*]NCN[5*],[4*]CCCCCC(=O)[At],[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
5,*C(*)=O.*NCNCCCCCC(=O)*,[1*]C([6*])=O,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
7,*C(=O)C(C)*.*NCNCCCCCC(=O)*,[1*]C(=O)C(C)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
8,*C(=O)CCCCCCCCCCC(=O)*.*NCNCCCCCC(=O)*,[1*]C(=O)CCCCCCCCCCC(=O)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
9,*C(=O)C(*)C.*NCNCCCCCC(=O)*,[1*]C(=O)C([4*])C,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [72]:
at_valid_poly.to_csv("./finals/dfs/at_valid_poly.csv", index=False)

In [73]:
smiles_df.head()

Unnamed: 0,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
0,[1*]C(C)=O,[3*]O[3*],[1*][*:1].[3*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[3*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1,[1*]C(=O)C=C,[3*]O[3*],[1*][*:1].[3*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[3*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
2,[1*]C(=O)C(=C)C,[3*]O[3*],[1*][*:1].[3*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[3*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
3,[16*]c1ccc(N=C=O)cc1,[3*]O[3*],[16*][*:2].[3*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[3*]-[*:1].[16*]-[*:2]>>[$([O&D2]-&!@[#0,#6,#1...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
4,[16*]c1ccc([16*])cc1,[3*]O[3*],[16*][*:2].[3*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[3*]-[*:1].[16*]-[*:2]>>[$([O&D2]-&!@[#0,#6,#1...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [78]:
smiles_df = wildcard_converter(smiles_df)

In [80]:
smiles_non_single_atom_df = smiles_df[smiles_df["smiles"].apply(filter_non_single_atom)]

In [82]:
smiles_valid_poly = smiles_non_single_atom_df[smiles_non_single_atom_df["smiles"].apply(filter_exact_two_wildcards)]

In [84]:
# ensure at_valid_poly "smiles" column should have unique values only
smiles_valid_poly = smiles_valid_poly.drop_duplicates(subset=["smiles"])

In [86]:
# take smiles column at the frist place in the df
smiles_valid_poly = smiles_valid_poly[["smiles"] + [col for col in smiles_valid_poly.columns if col != "smiles"]]

In [87]:
smiles_valid_poly.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
337,*NCN*.*Oc1ccc(*)cc1,[5*]NCN[5*],[3*]Oc1ccc([16*])cc1,[16*][*:2].[5*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[5*]-[*:1].[16*]-[*:2]>>[$([N&!D1&!$(N=*)&!$(N...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
340,*c1ccc(*)cc1.*Oc1ccc(*)cc1,[16*]c1ccc([16*])cc1,[3*]Oc1ccc([16*])cc1,[16*][*:1].[16*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[16*]-[*:1].[16*]-[*:2]>>[$([c&$(c(:c):c)]):1]...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
345,*CCC*.*Oc1ccc(*)cc1,[4*]CCC[4*],[3*]Oc1ccc([16*])cc1,[3*][*:1].[4*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[3*]-[*:1].[4*]-[*:2]>>[$([O&D2]-&!@[#0,#6,#1]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
379,*c1ccc(*)cc1.*CCCOc1ccc(*)cc1,[16*]c1ccc([16*])cc1,[4*]CCCOc1ccc([16*])cc1,[16*][*:1].[16*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[16*]-[*:1].[16*]-[*:2]>>[$([c&$(c(:c):c)]):1]...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
385,*NCN*.*CCCOc1ccc(*)cc1,[5*]NCN[5*],[4*]CCCOc1ccc([16*])cc1,[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [88]:
smiles_valid_poly.shape

(265, 7)

In [89]:
smiles_valid_poly.to_csv("./finals/dfs/smiles_valid_poly.csv", index=False)

In [90]:
raw_df.head()

Unnamed: 0,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
0,*O[3*],[1*]C(C)=O,[1*][*:1].[3*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[3*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1,[3*]O[3*],[1*]C(C)=O,[1*][*:1].[3*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[3*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
2,*N[5*],[1*]C(C)=O,[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
3,[5*]N[5*],[1*]C(C)=O,[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
4,[5*]NCN[5*],[1*]C(C)=O,[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [91]:
raw_df = wildcard_converter(raw_df)

In [93]:
raw_non_single_atom_df = raw_df[raw_df["smiles"].apply(filter_non_single_atom)]

In [95]:
raw_valid_poly = raw_non_single_atom_df[raw_non_single_atom_df["smiles"].apply(filter_exact_two_wildcards)]

In [97]:
# ensure at_valid_poly "smiles" column should have unique values only
raw_valid_poly = raw_valid_poly.drop_duplicates(subset=["smiles"])

In [99]:
# take smiles column at the frist place in the df
raw_valid_poly = raw_valid_poly[["smiles"] + [col for col in raw_valid_poly.columns if col != "smiles"]]

In [100]:
raw_valid_poly.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
1934,*NCN*.*Cc1ccc(*)cc1,[5*]NCN[5*],[8*]Cc1ccc([16*])cc1,[16*][*:2].[5*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[5*]-[*:1].[16*]-[*:2]>>[$([N&!D1&!$(N=*)&!$(N...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1939,*c1ccc(*)cc1.*Cc1ccc(*)cc1,[16*]c1ccc([16*])cc1,[8*]Cc1ccc([16*])cc1,[16*][*:2].[8*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[8*]-[*:1].[16*]-[*:2]>>[$([C&!R&!D1&!$(C!-*)]...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1942,*C(*)=O.*Cc1ccc(*)cc1,[1*]C([6*])=O,[8*]Cc1ccc([16*])cc1,[16*][*:2].[6*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[6*]-[*:1].[16*]-[*:2]>>[$([C&D3&!R](=O)-&!@[#...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1946,*c1ccc(*)cc1.*C(=O)c1ccc(C*)cc1,[16*]c1ccc([16*])cc1,[1*]C(=O)c1ccc(C[8*])cc1,[16*][*:2].[8*][*:1]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[8*]-[*:1].[16*]-[*:2]>>[$([C&!R&!D1&!$(C!-*)]...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1951,*NCN*.*C(=O)c1ccc(C*)cc1,[5*]NCN[5*],[1*]C(=O)c1ccc(C[8*])cc1,[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [101]:
raw_valid_poly.shape

(14150, 7)

In [102]:
raw_valid_poly.to_csv("./finals/dfs/raw_valid_poly.csv", index=False)

In [103]:
# merge three dataframes into one
valid_poly = pd.concat([at_valid_poly, smiles_valid_poly, raw_valid_poly], ignore_index=True)

In [104]:
valid_poly.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
0,*NCN*.*CCCCCC(=O)*,[5*]NCN[5*],[4*]CCCCCC(=O)[At],[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1,*C(*)=O.*NCNCCCCCC(=O)*,[1*]C([6*])=O,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
2,*C(=O)C(C)*.*NCNCCCCCC(=O)*,[1*]C(=O)C(C)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
3,*C(=O)CCCCCCCCCCC(=O)*.*NCNCCCCCC(=O)*,[1*]C(=O)CCCCCCCCCCC(=O)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
4,*C(=O)C(*)C.*NCNCCCCCC(=O)*,[1*]C(=O)C([4*])C,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [105]:
valid_poly.shape

(34862, 7)

In [106]:
# ensure at_valid_poly "smiles" column should have unique values only
main_valid_poly = valid_poly.drop_duplicates(subset=["smiles"])

In [107]:
main_valid_poly.shape

(28624, 7)

In [110]:
# in the "smiles" column there are two components separated by a dot (.) like ABC.XYZ
# if you filter for unique values in the "smiles" column, you will get the unique values only but hypothetically ABC.XYZ AND XYZ.ABC are smae value
# define a filter to ensure the dot separated values are unique for all
main_valid_poly["smiles"] = main_valid_poly["smiles"].apply(lambda x: ".".join(sorted(x.split("."))))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_valid_poly["smiles"] = main_valid_poly["smiles"].apply(lambda x: ".".join(sorted(x.split("."))))


In [111]:
# ensure at_valid_poly "smiles" column should have unique values only
main_valid_poly = valid_poly.drop_duplicates(subset=["smiles"])

In [112]:
main_valid_poly.shape

(28624, 7)

In [113]:
main_valid_poly.to_csv("./finals/dfs/main_valid_poly.csv", index=False)

In [5]:
import pandas as pd

In [6]:
main_df = pd.read_csv("./finals/dfs/main_valid_poly.csv")
main_df.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
0,*NCN*.*CCCCCC(=O)*,[5*]NCN[5*],[4*]CCCCCC(=O)[At],[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1,*C(*)=O.*NCNCCCCCC(=O)*,[1*]C([6*])=O,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
2,*C(=O)C(C)*.*NCNCCCCCC(=O)*,[1*]C(=O)C(C)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
3,*C(=O)CCCCCCCCCCC(=O)*.*NCNCCCCCC(=O)*,[1*]C(=O)CCCCCCCCCCC(=O)[At],[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
4,*C(=O)C(*)C.*NCNCCCCCC(=O)*,[1*]C(=O)C([4*])C,[5*]NCNCCCCCC(=O)[At],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [7]:
# replace At with * in fragment and seed columns
main_df["fragment"] = main_df["fragment"].str.replace("[At]", "[*]", regex=False)
main_df["seed"] = main_df["seed"].str.replace("[At]", "[*]", regex=False)

In [8]:
main_df.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro
0,*NCN*.*CCCCCC(=O)*,[5*]NCN[5*],[4*]CCCCCC(=O)[*],[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
1,*C(*)=O.*NCNCCCCCC(=O)*,[1*]C([6*])=O,[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
2,*C(=O)C(C)*.*NCNCCCCCC(=O)*,[1*]C(=O)C(C)[*],[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
3,*C(=O)CCCCCCCCCCC(=O)*.*NCNCCCCCC(=O)*,[1*]C(=O)CCCCCCCCCCC(=O)[*],[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...
4,*C(=O)C(*)C.*NCNCCCCCC(=O)*,[1*]C(=O)C([4*])C,[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...


In [9]:
# create an indexed_smiles column by joining fragment and seed columns with a dot (.)
main_df["indexed_smiles"] = main_df["fragment"] + "." + main_df["seed"]

In [10]:
# create an indexed_smiles_rxn_smiles column by joining indexed_smiles and reaction_smiles columns with a |
main_df["indexed_smiles_rxn_smiles"] = main_df["indexed_smiles"] + "|" + main_df["reaction_smiles"]

In [11]:
# create an indexed_smiles_rxn_smarts column by joining indexed_smiles and reaction_smarts columns with a |
main_df["indexed_smiles_rxn_smarts"] = main_df["indexed_smiles"] + "|" + main_df["reaction_smarts"]

In [12]:
# create a column "complete_data" by joining indexed_smiles_rxn_smiles and reaction_smarts columns with a |
main_df["complete_data"] = main_df["indexed_smiles_rxn_smiles"] + "|" + main_df["reaction_smarts"]

In [15]:
main_df["complete_data"][0]

'[5*]NCN[5*].[4*]CCCCCC(=O)[*]|[4*][*:1].[5*][*:2]>>[*:1][*:2] |[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[#6]):1]-&!@[$([N&!D1&!$(N=*)&!$(N-[!#6&!#16&!#0&!#1])&!$([N&R]@[C&R]=O)]):2] '

In [16]:
main_df.to_csv("./finals/dfs/master_df.csv", index=False)

In [None]:
alternate_outro = "<1-3:0.5:0.5<1-4:0.5:0.5<2-3:0.5:0.5<2-4:0.5:0.5"
block_outro = "<1-2:0.375:0.375<1-1:0.375:0.375<2-2:0.375:0.375<3-4:0.375:0.375<3-3:0.375:0.375<4-4:0.125:0.125<1-3:0.125:0.125<1-4:0.125:0.125<2-3:0.125:0.125<2-4:0.125:0.125"
random_outro = "<1-3:0.25:0.25<1-4:0.25:0.25<2-3:0.25:0.25<2-4:0.25:0.25<1-2:0.25:0.25<3-4:0.25:0.25<1-1:0.25:0.25<2-2:0.25:0.25<3-3:0.25:0.25<4-4:0.25:0.25" 

In [18]:
mid_var = ["|0.5|0.5|", "|0.25|0.75|", "|0.75|0.25|"]

In [20]:
def convert_indexed(str_value):
    """
    *C(*)=O.*NCNCCCCCC(=O)*	 => [*:1]C([*:2])=O.[*:3]NCNCCCCCC(=O)[*:4]	
    """
    count = 1
    new_val = ""
    for s in str_value:
        if s == "*":
            new_val += f"[*:{count}]"
            count += 1
        else:
            new_val += s
    return new_val

In [25]:
dummy_df = main_df.copy()

In [26]:
dummy_df["smiles"] = dummy_df["smiles"].apply(convert_indexed)

In [27]:
dummy_df.head()

Unnamed: 0,smiles,fragment,seed,reaction_smiles,reaction_smiles_outro,reaction_smarts,reaction_smarts_outro,indexed_smiles,indexed_smiles_rxn_smiles,indexed_smiles_rxn_smarts,complete_data
0,[*:1]NCN[*:2].[*:3]CCCCCC(=O)[*:4],[5*]NCN[5*],[4*]CCCCCC(=O)[*],[4*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[4*]-[*:1].[5*]-[*:2]>>[$([C&!D1&!$(C=*)]-&!@[...,atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[5*]NCN[5*].[4*]CCCCCC(=O)[*],[5*]NCN[5*].[4*]CCCCCC(=O)[*]|[4*][*:1].[5*][*...,[5*]NCN[5*].[4*]CCCCCC(=O)[*]|[4*]-[*:1].[5*]-...,[5*]NCN[5*].[4*]CCCCCC(=O)[*]|[4*][*:1].[5*][*...
1,[*:1]C([*:2])=O.[*:3]NCNCCCCCC(=O)[*:4],[1*]C([6*])=O,[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[1*]C([6*])=O.[5*]NCNCCCCCC(=O)[*],[1*]C([6*])=O.[5*]NCNCCCCCC(=O)[*]|[1*][*:1].[...,[1*]C([6*])=O.[5*]NCNCCCCCC(=O)[*]|[1*]-[*:1]....,[1*]C([6*])=O.[5*]NCNCCCCCC(=O)[*]|[1*][*:1].[...
2,[*:1]C(=O)C(C)[*:2].[*:3]NCNCCCCCC(=O)[*:4],[1*]C(=O)C(C)[*],[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[1*]C(=O)C(C)[*].[5*]NCNCCCCCC(=O)[*],[1*]C(=O)C(C)[*].[5*]NCNCCCCCC(=O)[*]|[1*][*:1...,[1*]C(=O)C(C)[*].[5*]NCNCCCCCC(=O)[*]|[1*]-[*:...,[1*]C(=O)C(C)[*].[5*]NCNCCCCCC(=O)[*]|[1*][*:1...
3,[*:1]C(=O)CCCCCCCCCCC(=O)[*:2].[*:3]NCNCCCCCC(...,[1*]C(=O)CCCCCCCCCCC(=O)[*],[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[1*]C(=O)CCCCCCCCCCC(=O)[*].[5*]NCNCCCCCC(=O)[*],[1*]C(=O)CCCCCCCCCCC(=O)[*].[5*]NCNCCCCCC(=O)[...,[1*]C(=O)CCCCCCCCCCC(=O)[*].[5*]NCNCCCCCC(=O)[...,[1*]C(=O)CCCCCCCCCCC(=O)[*].[5*]NCNCCCCCC(=O)[...
4,[*:1]C(=O)C([*:2])C.[*:3]NCNCCCCCC(=O)[*:4],[1*]C(=O)C([4*])C,[5*]NCNCCCCCC(=O)[*],[1*][*:1].[5*][*:2]>>[*:1][*:2],atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,"[1*]-[*:1].[5*]-[*:2]>>[$([C&D3]([#0,#6,#7,#8]...",atomProp:1.molAtomMapNumber.1:3.molAtomMapNumb...,[1*]C(=O)C([4*])C.[5*]NCNCCCCCC(=O)[*],[1*]C(=O)C([4*])C.[5*]NCNCCCCCC(=O)[*]|[1*][*:...,[1*]C(=O)C([4*])C.[5*]NCNCCCCCC(=O)[*]|[1*]-[*...,[1*]C(=O)C([4*])C.[5*]NCNCCCCCC(=O)[*]|[1*][*:...


In [28]:
values = dummy_df["smiles"].values

In [29]:
wdg_master = []
for value in values:
    for mid in mid_var:
        for outro in [alternate_outro, block_outro, random_outro]:
            wdg_master.append(value + mid + outro)

In [30]:
wdg_master = set(wdg_master)  # ensure unique values

In [31]:
len(wdg_master)

257616

In [32]:
wdg_df = pd.DataFrame(wdg_master, columns=["smiles"])

In [35]:
wdg_df["smiles"][0]

'[*:1]C(=O)[*:2].[*:3]CCCNC(C)C(=O)NCN[*:4]|0.5|0.5||0.5|0.5|<1-3:0.25:0.25<1-4:0.25:0.25<2-3:0.25:0.25<2-4:0.25:0.25<1-2:0.25:0.25<3-4:0.25:0.25<1-1:0.25:0.25<2-2:0.25:0.25<3-3:0.25:0.25<4-4:0.25:0.25'

In [36]:
wdg_df.to_csv("./finals/dfs/wdg_df_master.csv", index=False)