# 1. Train test split

In [13]:
import sys
sys.path.append('../../')
import pandas as pd
from SynTemp.SynUtils.utils import load_database, save_database
data = pd.DataFrame(load_database('../../Data/DPO/USPTO_50K/USPTO_50K_aam_reactions.json.gz'))
original_data = pd.read_csv('../../Data/USPTO_50K/USPTO_50K_original.csv')

data = pd.concat([data, original_data['class']], axis=1)

In [19]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, random_state=42, stratify=data['class'], test_size=0.2)

save_database(train.to_dict('records'), '../../Data/DPO/USPTO_50K/train.json.gz')
save_database(test.to_dict('records'), '../../Data/DPO/USPTO_50K/test.json.gz')

# 2. Hydrogen adjustments

In [22]:
from SynTemp.SynUtils.utils import load_from_pickle
data = load_from_pickle('../../Data/DPO/USPTO_50K/Hydrogen/USPTO_50K_its_graph_rules_cluster.pkl.gz')

In [31]:
from SynTemp.SynUtils.utils import stratified_random_sample
samples = stratified_random_sample(data, 'naive_cluster', 1, 42)

In [36]:
pd.DataFrame(samples)['Reaction Type'].value_counts()

Reaction Type
Single Cyclic     119
Acyclic            60
None               57
Complex Cyclic     31
Name: count, dtype: int64

In [41]:
samples[0]

{'R-id': 31123,
 'ITSGraph': (<networkx.classes.graph.Graph at 0x7fc5d070ee90>,
  <networkx.classes.graph.Graph at 0x7fc5d071d490>,
  <networkx.classes.graph.Graph at 0x7fc5d071fa10>),
 'GraphRules': (<networkx.classes.graph.Graph at 0x7fc5d0730a10>,
  <networkx.classes.graph.Graph at 0x7fc5d0730e50>,
  <networkx.classes.graph.Graph at 0x7fc5d0731290>),
 'naive_cluster': 0,
 'Reaction Type': 'Single Cyclic',
 'Rings': [4]}

In [39]:
from SynTemp.SynUtils.utils import stratified_random_sample

samples = stratified_random_sample(data, 'naive_cluster', 1, 42)
samples_good = [value for value in samples if value['Reaction Type'] in ['Single Cyclic', 'Complex Cyclic']]

In [43]:
from SynTemp.SynRule.rule_writing import RuleWriting
rules = RuleWriting.auto_extraction(data_dicts=samples, id_column='naive_cluster',
                                    save_path='../../Data/DPO/USPTO_50K/Hydrogen/Rules')

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 220 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 267 out of 267 | elapsed:    0.1s finished


In [44]:
from SynTemp.SynRule.rule_writing import RuleWriting
rules_good = RuleWriting.auto_extraction(data_dicts=samples, id_column='naive_cluster',
                                    save_path='../../Data/DPO/USPTO_50K/Hydrogen/Rules_good')

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 205 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 267 out of 267 | elapsed:    0.1s finished


In [45]:
from SynTemp.SynRule.rule_benchmark import RuleBenchmark
from SynTemp.SynChemistry.sf_similarity import SFSimilarity
from SynTemp.SynChemistry.sf_maxfrag import SFMaxFrag

In [46]:
import logging
import pandas as pd

logging.basicConfig(filename=f'../../Data/DPO/USPTO_50K/Hydrogen/topk_accuracy_good.log', level=logging.INFO, format='%(asctime)s - %(message)s')


In [47]:
# Set the parameters for the experiment
top_k_values = [1, 3, 5, 10]
scoring_functions = {
    'MaxFrag': SFMaxFrag(),
    'ECFP6': SFSimilarity(["ECFP6"]),
    'MACCS': SFSimilarity(["MACCS"]),
    'RDK7': SFSimilarity(["RDK7"])
}

# Prepare DataFrame to store results
    results_list = []

# Run benchmark for each scoring function and Top K
fw, bw = RuleBenchmark.reproduce_reactions(
    database=database,
    id_col="R-id",
    rule_file_path=f"{root_dir}/Data/DPO/uspto/Rule",
    original_rsmi_col="reactions",
    repeat_times=1,
    prior=False,

In [None]:
import sys
from pathlib import Path

root_dir = Path(__file__).parents[2]
sys.path.append(str(root_dir))
from SynTemp.SynUtils.utils import load_database
from SynTemp.SynRule.rule_benchmark import RuleBenchmark
from SynTemp.SynChemistry.sf_similarity import SFSimilarity
from SynTemp.SynChemistry.sf_maxfrag import SFMaxFrag

if __name__ == "__main__":
    import logging
    import pandas as pd

    # Set up logging
    logging.basicConfig(filename=f'{root_dir}/Docs/Notebook/topk_accuracy.log', level=logging.INFO, format='%(asctime)s - %(message)s')

    # Load the database
    database = load_database(f"{root_dir}/Data/DPO/uspto/demo_database.json.gz")

    # Set the parameters for the experiment
    top_k_values = [1, 3, 5, 10]
    scoring_functions = {
        'MaxFrag': SFMaxFrag(),
        'ECFP6': SFSimilarity(["ECFP6"]),
        'MACCS': SFSimilarity(["MACCS"]),
        'RDK7': SFSimilarity(["RDK7"])
    }

    # Prepare DataFrame to store results
    results_list = []

    # Run benchmark for each scoring function and Top K
    fw, bw = RuleBenchmark.reproduce_reactions(
        database=database,
        id_col="R-id",
        rule_file_path=f"{root_dir}/Data/DPO/uspto/Rule",
        original_rsmi_col="reactions",
        repeat_times=1,
        prior=False,
    )

    for name, func in scoring_functions.items():
        for k in top_k_values:
            accuracy = RuleBenchmark.TopKAccuracy(
                fw,
                "reactions",
                "ranked_reactions",
                k,
                ignore_stero=True,
                scoring_function=func,
            )
            log_message = f"Top {k} accuracy for {name}: {accuracy}"
            print(log_message)
            logging.info(log_message)
            
            # Append results to the list
            results_list.append({'Scoring Function': name, 'Top K': f'Top {k}', 'Accuracy': accuracy})

    # Convert list to DataFrame
    results_df = pd.DataFrame(results_list)

    # Pivot the DataFrame to get the desired layout
    pivot_df = results_df.pivot(index='Scoring Function', columns='Top K', values='Accuracy')

    # # Save results to CSV
    # pivot_df.to_csv('topk_accuracy_matrix.csv')
    # logging.info("Results matrix saved to topk_accuracy_matrix.csv")

    # Log the pivot table
    pivot_log = pivot_df.to_string()
    logging.info("Results Matrix:\n" + pivot_log)
