# **1. CGRTool**

In [None]:
import sys

# Add the directory two levels up to the import path (presumably the
# repository root) so that the `syntemp` package can be imported.
sys.path.append("../../")
from syntemp.SynUtils.utils import load_database, save_database
import pandas as pd

# Load the USPTO_sampled.csv table that the ground-truth step works on.
uspto_3k = pd.read_csv("../../Data/AAM/aam_benchmark/USPTO_sampled.csv")

## 1.1 Generate unbiased ground truth

In [None]:
import pandas as pd
import re


def add_ground_truth_column(df):
    """Add per-mapper atom-map counts and a "Ground Truth" column to ``df``.

    For each of RXNMapper, GraphMapper and LocalMapper, the number of distinct
    atom-map labels (substrings matching ``:<digits>``) in the mapped SMILES is
    stored in ``<mapper>_count``. A mapping only contributes when its
    ``<mapper>_correct`` flag is present and truthy; a missing (NaN) flag is
    treated as "not confirmed" and yields a count of 0.

    The "Ground Truth" column holds the SMILES of the mapper with the highest
    positive count. Ties go to the first mapper in the order RXNMapper,
    GraphMapper, LocalMapper. The value is None when every count is zero.

    Args:
        df: DataFrame with the columns ``RXNMapper``, ``GraphMapper``,
            ``LocalMapper`` and the matching ``*_correct`` flag columns.

    Returns:
        The same DataFrame object, modified in place: three ``*_count``
        columns and the "Ground Truth" column are added.
    """
    mappers = ("RXNMapper", "GraphMapper", "LocalMapper")

    # Compiled once per call and reused for every row.
    atom_map_pattern = re.compile(r":\d+")

    def count_atom_mappings(smiles_string):
        # Missing SMILES carry no atom maps.
        if pd.isna(smiles_string):
            return 0
        return len(set(atom_map_pattern.findall(smiles_string)))

    def is_confirmed(flag):
        # NaN is truthy in Python, so a missing flag must be rejected
        # explicitly; otherwise an unverified mapping could win.
        return bool(pd.notna(flag) and flag)

    for mapper in mappers:
        df[mapper + "_count"] = [
            count_atom_mappings(smiles) if is_confirmed(flag) else 0
            for smiles, flag in zip(df[mapper], df[mapper + "_correct"])
        ]

    ground_truth = []
    count_rows = zip(*(df[mapper + "_count"] for mapper in mappers))
    smiles_rows = zip(*(df[mapper] for mapper in mappers))
    for counts, smiles in zip(count_rows, smiles_rows):
        # max() returns the first maximal position, which keeps the
        # RXNMapper > GraphMapper > LocalMapper priority on ties.
        best = max(range(len(mappers)), key=counts.__getitem__)
        ground_truth.append(smiles[best] if counts[best] > 0 else None)

    df["Ground Truth"] = ground_truth
    return df

In [None]:
# Add the *_count and "Ground Truth" columns, then write the table to CSV
# without the index column.
df = add_ground_truth_column(uspto_3k)
df.to_csv("../../Data/AAM/cgrtool_benchmark/USPTO_3K.csv", index=False)

## 1.2. Benchmark with CGRTool

In [31]:
# Load the "old" and "new" CGRTool benchmark tables; the first CSV column is
# used as the row index in both.
df_u1 = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_old.csv", index_col=0
)
df_u2 = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_new.csv", index_col=0
)

In [32]:
# Keep only the columns used by the accuracy summaries: the reference column,
# the three *_correct flags and the three CGRTool_* columns. The same
# selection is applied to both tables.
summary_columns = [
    "Ground Truth",
    "RXNMapper_correct",
    "GraphMapper_correct",
    "LocalMapper_correct",
    "CGRTool_rxnmapper",
    "CGRTool_graphmapper",
    "CGRTool_localmapper",
]

df_u1, df_u2 = (frame[summary_columns] for frame in (df_u1, df_u2))

In [33]:
# Percentage (rounded to 2 decimals) of rows in df_u1 whose *_correct flag is
# set, one row per mapper, in a single column named "Ground Truth (%)".
ground_data = pd.Series(
    {
        label: round(100 * df_u1[flag_column].sum() / len(df_u1), 2)
        for label, flag_column in (
            ("RXNMapper", "RXNMapper_correct"),
            ("Graphormer", "GraphMapper_correct"),
            ("LocalMapper", "LocalMapper_correct"),
        )
    },
    name="Ground Truth (%)",
).to_frame()
ground_data

Unnamed: 0,Ground Truth (%)
RXNMapper,93.53
Graphormer,95.1
LocalMapper,100.0


In [None]:
# Percentage (rounded to 2 decimals) of rows in df_u1 whose CGRTool_* value is
# set, one row per mapper, in a single column named "CGRTools 1 (%)".
cgrtool_old = pd.Series(
    {
        label: round(100 * df_u1[cgr_column].sum() / len(df_u1), 2)
        for label, cgr_column in (
            ("RXNMapper", "CGRTool_rxnmapper"),
            ("Graphormer", "CGRTool_graphmapper"),
            ("LocalMapper", "CGRTool_localmapper"),
        )
    },
    name="CGRTools 1 (%)",
).to_frame()
cgrtool_old

In [None]:
# Percentage (rounded to 2 decimals) of rows whose CGRTool_* value is set in
# the "new" CGRTool table, one row per mapper.
# The table loaded from uspto_3k_cgrtool_new.csv is called `df_u2` at this
# point in the notebook; `df_new` is only created in a later cell, so using it
# here fails on a fresh top-to-bottom run.
cgrtool_new = pd.DataFrame(
    [
        {
            "RXNMapper": round(
                100 * df_u2["CGRTool_rxnmapper"].sum() / len(df_u2), 2
            ),
            "Graphormer": round(
                100 * df_u2["CGRTool_graphmapper"].sum() / len(df_u2), 2
            ),
            "LocalMapper": round(
                100 * df_u2["CGRTool_localmapper"].sum() / len(df_u2), 2
            ),
        }
    ]
).T

cgrtool_new.rename(columns={0: "CGRTools 2 (%)"}, inplace=True)
cgrtool_new

In [None]:
# Place the three single-column accuracy tables side by side, aligned on the
# mapper index.
cgr_data = pd.concat([ground_data, cgrtool_old, cgrtool_new], axis=1)
cgr_data

In [None]:
from syntemp.SynAAM.aam_validator import AAMValidator

# Reload both benchmark tables with all columns (first CSV column as index).
df_old = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_old.csv", index_col=0
)
df_new = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_new.csv", index_col=0
)
# Validate the three mapper columns of the old table against the reference
# column with check_method="ITS".
# NOTE(review): the reference column is named "GroundTruth" here, while other
# cells of this notebook write/select "Ground Truth" (with a space) — confirm
# which name the CSV files actually contain.
results_old_its = AAMValidator.validate_smiles(
    data=df_old,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="ITS",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=False,
)


# Same validation for the new table.
results_new_its = AAMValidator.validate_smiles(
    data=df_new,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="ITS",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=False,
)

In [None]:
# Reload both benchmark tables (first CSV column as index).
df_old = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_old.csv", index_col=0
)
df_new = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_new.csv", index_col=0
)
# Validate the three mapper columns of the old table against the reference
# column with check_method="RC"; all other arguments match the "ITS" run.
# NOTE(review): "GroundTruth" vs. the "Ground Truth" column name used in other
# cells of this notebook — confirm against the CSV files.
results_old = AAMValidator.validate_smiles(
    data=df_old,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="RC",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=False,
)


# Same validation for the new table.
results_new = AAMValidator.validate_smiles(
    data=df_new,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="RC",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=False,
)

In [None]:
import numpy as np

# Sum the "results" list of the first entry from the ITS validation of the new
# table (presumably the number of reactions judged correct — TODO confirm).
np.sum(results_new_its[0][0]["results"])

In [None]:
# Same sum for the RC validation of the new table.
np.sum(results_new[0][0]["results"])

In [None]:
# Same sum for the RC validation of the old table.
np.sum(results_old[0][0]["results"])

In [None]:
# Tabulate the first element returned by the RC validation of the new table.
pd.DataFrame(results_new[0])

In [None]:
# Tabulate the first element returned by the RC validation of the old table.
pd.DataFrame(results_old[0])

In [None]:
# Element-wise mask: True where the ITS and RC "results" of the first entry
# differ for the new table.
pd.DataFrame(results_new_its[0][0]["results"]) != pd.DataFrame(
    results_new[0][0]["results"]
)

In [None]:
# Collect the positions at which the RC and ITS "results" of the first entry
# disagree for the new table, printing the RC value at each such position.
rc_flags = results_new[0][0]["results"]
its_flags = results_new_its[0][0]["results"]

wrong_index = []
for position, rc_flag in enumerate(rc_flags):
    if rc_flag != its_flags[position]:
        print(rc_flag)
        wrong_index.append(position)

In [None]:
# Show the positions where the RC and ITS results disagree.
wrong_index

In [None]:
# Show the raw "results" list of the first entry (RC check, new table).
results_new[0][0]["results"]

In [None]:
# Keep the mapper name and accuracy of each entry from the RC validation of
# the new table, and relabel the entry in row 1 as "Graphormer".
aam_new = pd.DataFrame(results_new[0])[["mapper", "accuracy"]]
# A single .loc assignment is used on purpose: chained indexing
# (frame["mapper"][1] = ...) may write to a temporary copy and leave the frame
# unchanged.
aam_new.loc[1, "mapper"] = "Graphormer"

In [None]:
# Build the "SynTemp 2 (%)" accuracy column from the RC validation of the new
# table, indexed by mapper name.
aam_new = pd.DataFrame(results_new[0])[["mapper", "accuracy"]]
# A single .loc assignment is used on purpose: chained indexing
# (frame["mapper"][1] = ...) may write to a temporary copy and leave the frame
# unchanged.
aam_new.loc[1, "mapper"] = "Graphormer"
aam_new.index = aam_new["mapper"].tolist()
aam_new = aam_new.drop(columns=["mapper"]).rename(
    columns={"accuracy": "SynTemp 2 (%)"}
)

In [None]:
# Show the accuracy table built from the new-table RC validation.
aam_new

In [None]:
# Build the "SynTemp 1 (%)" accuracy column from the RC validation of the old
# table, indexed by mapper name.
aam_old = pd.DataFrame(results_old[0])[["mapper", "accuracy"]]
# A single .loc assignment is used on purpose: chained indexing
# (frame["mapper"][1] = ...) may write to a temporary copy and leave the frame
# unchanged.
aam_old.loc[1, "mapper"] = "Graphormer"
aam_old.index = aam_old["mapper"].tolist()
aam_old = aam_old.drop(columns=["mapper"]).rename(
    columns={"accuracy": "SynTemp 1 (%)"}
)

In [None]:
# Join the CGRTool summary with both SynTemp accuracy columns, aligned on the
# mapper index.
benchmark = pd.concat([cgr_data, aam_old, aam_new], axis=1)

In [None]:
# Show the combined benchmark table.
benchmark

### 1.2.2. EEquaam

In [None]:
from syntemp.SynChemistry.balance_checker import BalanceReactionCheck

# Reload both benchmark tables (first CSV column as index).
df_old = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_old.csv", index_col=0
)
df_new = pd.read_csv(
    "../../Data/AAM/cgrtool_benchmark/uspto_3k_cgrtool_new.csv", index_col=0
)
# Run the balance check on the "GroundTruth" field of every record; only the
# first element of each returned pair is kept.
# presumably the first element holds the balanced reactions — TODO confirm.
check_balance = BalanceReactionCheck()
df_new_balance, _ = check_balance.dicts_balance_check(
    df_new.to_dict("records"), "GroundTruth"
)

df_old_balance, _ = check_balance.dicts_balance_check(
    df_old.to_dict("records"), "GroundTruth"
)

In [None]:
# Validate the mapper outputs of the balance-checked old records against
# "GroundTruth" with check_method="RC"; the second returned value is discarded.
results_old_aam, _ = AAMValidator.validate_smiles(
    data=df_old_balance,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="RC",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=False,
)


# Same validation for the balance-checked new records.
results_new_aam, _ = AAMValidator.validate_smiles(
    data=df_new_balance,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="RC",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=False,
)

In [30]:
# Tabulate the RC validation results for the balance-checked old records.
# Requires the cell that defines `results_old_aam` to have been run first.
pd.DataFrame(results_old_aam)

NameError: name 'results_old_aam' is not defined

In [None]:
# Tabulate the RC validation results for the balance-checked new records.
pd.DataFrame(results_new_aam)

In [None]:
# Validate the balance-checked old records with check_method="ITS" and
# ignore_tautomers=True (the earlier ITS/RC runs in this notebook pass
# ignore_tautomers=False); the second returned value is discarded.
results_old_eqquaam, _ = AAMValidator.validate_smiles(
    data=df_old_balance,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="ITS",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=True,
)


# Same validation for the balance-checked new records.
results_new_eqquaam, _ = AAMValidator.validate_smiles(
    data=df_new_balance,
    ground_truth_col="GroundTruth",
    mapped_cols=["RXNMapper", "GraphMapper", "LocalMapper"],
    check_method="ITS",
    ignore_aromaticity=False,
    n_jobs=4,
    verbose=0,
    ensemble=False,
    strategies=[["rxn_mapper", "graphormer", "local_mapper"]],
    ignore_tautomers=True,
)

In [None]:
# Tabulate the ITS (tautomers ignored) results for the new records.
pd.DataFrame(results_new_eqquaam)

## 1.3. Analyze difference from Ground Truth

In [None]:
# Table of the RC validation entries for the NEW table; its "results" column is
# indexed by entry position in the cells below (0, 1, 2).
data_check = pd.DataFrame(results_new[0])

In [None]:
# Positions at which the stored *_correct flag of the new table disagrees with
# the corresponding "results" entry in data_check (entry 0 for RXNMapper,
# entry 1 for GraphMapper).
rxn_reference = data_check["results"][0]
list_diff_rxn = [
    position
    for position, flag in enumerate(df_new["RXNMapper_correct"])
    if flag != rxn_reference[position]
]

graph_reference = data_check["results"][1]
list_diff_graph = [
    position
    for position, flag in enumerate(df_new["GraphMapper_correct"])
    if flag != graph_reference[position]
]
print("Differences in RXNMapper:", list_diff_rxn)
print("Differences in GraphMapper:", list_diff_graph)

In [None]:
from syntemp.SynVis.chemical_reaction_visualizer import ChemicalReactionVisualizer

# Draw the reference reaction and the RXNMapper output for the row labelled 192
# of the new table, with atom-map numbers shown, then print the stored
# RXNMapper_correct flag for that row.
# `display` is not imported here; presumably the notebook's built-in display.
vis = ChemicalReactionVisualizer()
i = 192
display(
    vis.visualize_reaction(
        df_new.loc[i, "GroundTruth"], img_size=(1000, 300), show_atom_map=True
    )
)
display(
    vis.visualize_reaction(
        df_new.loc[i, "RXNMapper"], img_size=(1000, 300), show_atom_map=True
    )
)
print(df_new.loc[i, "RXNMapper_correct"])

In [None]:
# Same side-by-side visualisation for the row labelled 2157 of the new table.
i = 2157
display(
    vis.visualize_reaction(
        df_new.loc[i, "GroundTruth"], img_size=(1000, 300), show_atom_map=True
    )
)
display(
    vis.visualize_reaction(
        df_new.loc[i, "RXNMapper"], img_size=(1000, 300), show_atom_map=True
    )
)
print(df_new.loc[i, "RXNMapper_correct"])

In [None]:
# Show the RXNMapper string for the row currently selected by `i`.
df_new.loc[i, "RXNMapper"]

In [None]:
# Reference and RXNMapper strings for rows 192 and 2157, with one panel label
# per reaction (presumably for a four-panel figure — no consumer of these
# names is visible in this notebook).
reaction_smiles = [
    df_new.loc[192, "GroundTruth"],
    df_new.loc[192, "RXNMapper"],
    df_new.loc[2157, "GroundTruth"],
    df_new.loc[2157, "RXNMapper"],
]
subtitles = ["A", "B", "C", "D"]

## 1.4. Analyze difference from CGRTool

In [None]:
# Select the rows where the CGRTool_* column differs from the corresponding
# "results" entry in data_check (entry 0: RXNMapper, 1: GraphMapper,
# 2: LocalMapper).
# NOTE(review): data_check is built from results_new, yet it is also compared
# against df_old here — confirm that the new-table results are the intended
# reference for the old table (results_old exists as well).
old_rxn = df_old[df_old["CGRTool_rxnmapper"] != data_check["results"][0]]
old_graph = df_old[df_old["CGRTool_graphmapper"] != data_check["results"][1]]

new_rxn = df_new[df_new["CGRTool_rxnmapper"] != data_check["results"][0]]
new_local = df_new[df_new["CGRTool_localmapper"] != data_check["results"][2]]

In [None]:
def op_results(bool):
    """Return the logical negation of the given flag.

    The parameter keeps its existing name (which shadows the builtin inside
    this function) so that current callers are unaffected.
    """
    return not bool

In [None]:
def _cgr_disagreement_table(frame, mapped_col, cgr_col):
    """Return a table with columns Mapped, CGRTool, GroundTruth and SynTemp.

    ``frame`` is one of the row selections in which the CGRTool column differs
    from the SynTemp result, so the SynTemp column is filled with the negation
    of the CGRTool value.
    """
    # A non-inplace rename yields an independent frame, so adding a column
    # below does not write into a slice of the source table (which triggers
    # SettingWithCopyWarning and may not take effect).
    table = frame[[mapped_col, cgr_col, "GroundTruth"]].rename(
        columns={mapped_col: "Mapped", cgr_col: "CGRTool"}
    )
    table["SynTemp"] = table["CGRTool"].apply(op_results)
    return table


data_1 = _cgr_disagreement_table(old_rxn, "RXNMapper", "CGRTool_rxnmapper")
data_2 = _cgr_disagreement_table(old_graph, "GraphMapper", "CGRTool_graphmapper")
data_3 = _cgr_disagreement_table(new_rxn, "RXNMapper", "CGRTool_rxnmapper")
data_4 = _cgr_disagreement_table(new_local, "LocalMapper", "CGRTool_localmapper")

In [None]:
# Stack the four disagreement tables row-wise and keep one row per distinct
# "Mapped" string, then show the resulting shape.
all_data = pd.concat([data_1, data_2, data_3, data_4], axis=0)
all_data = all_data.drop_duplicates(subset=["Mapped"])
all_data.shape

In [None]:
# Convert the table to a list of per-row dictionaries.
test = all_data.to_dict("records")

In [None]:
# Persist the records (a gzip-compressed JSON file, judging by the file name).
save_database(test, "../../Data/AAM/cgrtool_benchmark/cgr_diff.json.gz")

In [None]:
from synrbl.SynVis import save_reactions_to_pdf

# Write a PDF report for the disagreement records, pairing each "GroundTruth"
# reaction with its "Mapped" counterpart.
save_reactions_to_pdf(
    test,
    old_reaction_col="GroundTruth",
    new_reaction_col="Mapped",
    pdf_filename="../../Data/AAM/cgrtool_benchmark/cgr_diff.pdf",
    compare=True,
    show_atom_numbers=True,
    orientation="vertical",
)

In [None]:
# Show the LocalMapper string of the row at position 2157 in the old table.
df_old.iloc[2157, :]["LocalMapper"]