In [1]:
# Load the reaction records used throughout this notebook from a gzipped pickle.
# NOTE(review): presumably this unpickles the file, so only point it at trusted data.
from SynTemp.SynUtils.utils import load_from_pickle
data = load_from_pickle('./Data/DPO/USPTO_50K/Hydrogen/USPTO_50K_its_correct.pkl.gz')

In [2]:
# Slice records 1000..9999, then keep the third element of each record's 'GraphRules'.
demo_o = data[1000:10000]
demo = [value['GraphRules'][2] for value in demo_o]

In [3]:
# The first 1000 records, kept as a small working subset.
small = data[:1000]

In [None]:
import networkx as nx
from typing import List, Set, Dict, Any, Callable, Optional, Tuple
from networkx.algorithms.isomorphism import generic_node_match, generic_edge_match
from operator import eq
from SynTemp.SynUtils.utils import create_unique_value_dict


class RuleCluster:
    """
    Groups graphs into isomorphism classes and maintains a library of
    template graphs (one representative graph per cluster).
    """

    def __init__(
        self,
        node_label_names: Optional[List[str]] = None,
        node_label_default: Optional[List[Any]] = None,
        edge_attribute: str = "order",
    ):
        """
        Initializes the RuleCluster with customization options for node and edge matching functions.

        Parameters:
            node_label_names (Optional[List[str]]): Node attribute names to be considered for matching.
                Defaults to ["element", "aromatic", "charge"].
            node_label_default (Optional[List[Any]]): Default values for node attributes, aligned with
                `node_label_names`. Defaults to ["*", False, 0].
            edge_attribute (str): The name of the edge attribute to be considered for matching.
        """
        # None sentinels replace list literals in the signature so that the
        # default lists are not shared between instances.
        if node_label_names is None:
            node_label_names = ["element", "aromatic", "charge"]
        if node_label_default is None:
            node_label_default = ["*", False, 0]

        self.nodeLabelNames: List[str] = node_label_names
        self.edgeAttribute: str = edge_attribute
        self.nodeLabelDefault: List[Any] = node_label_default
        # Every node attribute is compared with plain equality.
        self.nodeLabelOperator: List[Callable[[Any, Any], bool]] = [
            eq for _ in node_label_names
        ]
        self.nodeMatch: Callable = generic_node_match(
            self.nodeLabelNames, self.nodeLabelDefault, self.nodeLabelOperator
        )
        # Edges lacking the attribute are treated as having the value 1.
        self.edgeMatch: Callable = generic_edge_match(self.edgeAttribute, 1, eq)

    @staticmethod
    def auto_cluster(
        graphs: List[nx.Graph], nodeMatch=None, edgeMatch=None
    ) -> Tuple[List[Set[int]], Dict[int, int]]:
        """
        Clusters the graphs based on isomorphism, using the supplied node and edge match functions.

        Each not-yet-assigned graph opens a new cluster and is compared against every
        later unassigned graph; isomorphic graphs join that cluster.

        Parameters:
            graphs (List[nx.Graph]): A list of NetworkX graph objects to be clustered.
            nodeMatch (callable, optional): Node comparison passed to `nx.is_isomorphic`. Defaults to None.
            edgeMatch (callable, optional): Edge comparison passed to `nx.is_isomorphic`. Defaults to None.

        Returns:
            Tuple[List[Set[int]], Dict[int, int]]:
                - A list of sets, where each set contains the indices of graphs in the same cluster.
                - A mapping from graph index to the position of its cluster in that list.
        """
        visited: Set[int] = set()
        clusters: List[Set[int]] = []
        graph_to_cluster: Dict[int, int] = {}
        n_graphs = len(graphs)

        for i, graph_i in enumerate(graphs):
            if i in visited:
                continue

            cluster_id = len(clusters)
            cluster: Set[int] = {i}
            visited.add(i)
            graph_to_cluster[i] = cluster_id

            # Index directly instead of slicing so no list copy is made per seed graph.
            for j in range(i + 1, n_graphs):
                if j in visited:
                    continue
                if nx.is_isomorphic(
                    graph_i,
                    graphs[j],
                    node_match=nodeMatch,
                    edge_match=edgeMatch,
                ):
                    cluster.add(j)
                    visited.add(j)
                    graph_to_cluster[j] = cluster_id

            clusters.append(cluster)

        return clusters, graph_to_cluster

    @staticmethod
    def library_check(
        graphs: List[nx.Graph],
        templates: List[Dict[str, Any]],
        nodeMatch: Optional[Callable] = None,
        edgeMatch: Optional[Callable] = None,
    ) -> List[Any]:
        """
        Matches each graph in the provided list against a set of template graphs to identify corresponding clusters,
        and avoids further processing for a graph once a match is found.

        Parameters:
            graphs (List[nx.Graph]): A list of NetworkX graph objects to be checked.
            templates (List[Dict[str, Any]]): A list of dictionaries, each containing a template graph ('RC')
                and associated cluster ID ('Cluster_id').
            nodeMatch (callable, optional): A function to match nodes between graphs. Defaults to None.
            edgeMatch (callable, optional): A function to match edges between graphs. Defaults to None.

        Returns:
            List[Any]: The 'Cluster_id' of the first matching template for each graph in 'graphs';
            the string 'Undefined' for graphs without a match.
        """
        results: List[Any] = []

        for graph in graphs:
            for template in templates:
                if nx.is_isomorphic(
                    graph, template["RC"], node_match=nodeMatch, edge_match=edgeMatch
                ):
                    results.append(template["Cluster_id"])
                    break
            else:
                # No template matched this graph.
                results.append("Undefined")

        return results

    @staticmethod
    def get_templates(
        graphs: List[nx.Graph],
        graph_to_cluster_dict: Dict[int, int],
        max_index_template: int = 0,
    ) -> List[Dict[str, Any]]:
        """
        Generates a list of templates from graphs based on cluster mappings, offsetting cluster indices
        by `max_index_template`.

        Parameters:
            graphs (List[nx.Graph]): A list of graph objects from which templates are derived.
            graph_to_cluster_dict (Dict[int, int]): A dictionary mapping graph indices to cluster IDs.
            max_index_template (int): The offset added to every cluster ID.

        Returns:
            List[Dict[str, Any]]: A list of dictionaries each representing a template with a 'Cluster_id'
            and the associated graph ('RC').
        """
        # Adjust cluster IDs based on the offset
        temp_graph_to_cluster = {
            key: item + max_index_template
            for key, item in graph_to_cluster_dict.items()
        }

        # Create a dictionary with unique values and their first corresponding keys
        unique_temp = create_unique_value_dict(temp_graph_to_cluster)

        # Construct templates using unique cluster IDs and corresponding graphs
        return [
            {"Cluster_id": key, "RC": graphs[value]}
            for key, value in unique_temp.items()
        ]

    def fit(
        self,
        graphs: List[nx.Graph],
        templates: Optional[List[Dict[str, Any]]] = None,
        update_template=True,
    ) -> Tuple[List[Any], List[Dict[str, Any]]]:
        """
        Automatically clusters the graphs and determines their cluster indices,
        potentially using provided templates for clustering, or generating new templates.

        Parameters:
            graphs (List[nx.Graph]): A list of NetworkX graph objects to determine cluster indices for.
            templates (Optional[List[Dict[str, Any]]]): Optional list of templates used for clustering.
                When given and `update_template` is True, this list is extended IN PLACE with
                templates for any newly discovered clusters.
            update_template (bool): Whether unmatched graphs are clustered into new templates.
                When False, unmatched graphs keep the label 'Undefined'.

        Returns:
            tuple:
                List[Any]: The cluster index for each graph, aligned with the order of the input list
                    ('Undefined' for unmatched graphs when templates are not updated).
                List[Dict[str, Any]]: Updated or newly created list of templates.
        """
        if templates is None:
            # Perform clustering without predefined templates
            _, graph_to_cluster_dict = self.auto_cluster(
                graphs, self.nodeMatch, self.edgeMatch
            )
            templates = self.get_templates(graphs, graph_to_cluster_dict, 0)
            cluster_indices = [graph_to_cluster_dict.get(i) for i in range(len(graphs))]
            return cluster_indices, templates

        # Use existing templates to check graph clusters. The configured matchers
        # must be forwarded, otherwise the lookup is purely structural and ignores
        # the node/edge labels used when the templates were built.
        cluster_indices = self.library_check(
            graphs, templates, self.nodeMatch, self.edgeMatch
        )
        if not update_template:
            return cluster_indices, templates

        undefined_keys = [
            i for i, cid in enumerate(cluster_indices) if cid == "Undefined"
        ]
        if not undefined_keys:
            return cluster_indices, templates

        # Handle undefined clusters by re-clustering them among themselves
        undefined_graphs = [graphs[i] for i in undefined_keys]
        _, new_graph_to_cluster_dict = self.auto_cluster(
            undefined_graphs, self.nodeMatch, self.edgeMatch
        )

        # New IDs start one past the highest existing ID; starting at the
        # maximum itself would reuse the ID of an existing template.
        next_cluster_id = (
            max(t["Cluster_id"] for t in templates) + 1 if templates else 0
        )

        new_templates = self.get_templates(
            undefined_graphs, new_graph_to_cluster_dict, next_cluster_id
        )
        # In-place extension: callers holding a reference to `templates` see the update.
        templates += new_templates

        # Update cluster indices for re-clustered graphs
        for local_index, key in enumerate(undefined_keys):
            cluster_indices[key] = (
                new_graph_to_cluster_dict[local_index] + next_cluster_id
            )

        return cluster_indices, templates


In [5]:
from typing import List, Any, Dict, Tuple
import pandas as pd
import copy
from joblib import Parallel, delayed
from SynTemp.SynRule.rules_extraction import RuleExtraction
from SynTemp.SynRule.rule_cluster import RuleCluster
from SynTemp.SynUtils.graph_utils import check_graph_type, get_cycle_member_rings
import logging

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


class HierarchicalClustering(RuleCluster):
    """
    Clusters reactions at several levels (radii 0..max_radius), running one
    RuleCluster fit per level on rules extracted from the ITS graphs.
    """

    def __init__(
        self,
        node_label_names: Optional[List[str]] = None,
        node_label_default: Optional[List[Any]] = None,
        edge_attribute: str = "order",
        max_radius: int = 3,
    ):
        """
        Initializes the HierarchicalClustering with customization options for node and edge matching functions.

        Parameters:
            node_label_names (Optional[List[str]]): Node attribute names for matching.
                Defaults to ["element", "aromatic", "hcount", "charge", "typesGH"].
            node_label_default (Optional[List[Any]]): Default values for node attributes.
                Defaults to ["*", False, 0, 0, ()].
            edge_attribute (str): Edge attribute name for matching.
            max_radius (int): Maximum number of hierarchical levels.
        """
        # None sentinels replace list literals in the signature so that the
        # default lists are not shared between instances.
        if node_label_names is None:
            node_label_names = ["element", "aromatic", "hcount", "charge", "typesGH"]
        if node_label_default is None:
            node_label_default = ["*", False, 0, 0, ()]

        # Forward the configuration so the inherited nodeMatch/edgeMatch are
        # built from these labels rather than from the parent's defaults.
        super().__init__(
            node_label_names=node_label_names,
            node_label_default=node_label_default,
            edge_attribute=edge_attribute,
        )
        # Levels processed by `fit`: 0, 1, ..., max_radius.
        self.radius = list(range(max_radius + 1))
        self.nodeLabelDefault = node_label_default
        self.nodeLabelNames = node_label_names
        self.edgeAttribute = edge_attribute

    @staticmethod
    def _templates_per_level(templates: List[Any], n_levels: int) -> List[Any]:
        """
        Returns one template list per level.

        A flat list of template dicts is reused (same object) for every level,
        which is what `fit` previously did with any input. A nested list, i.e.
        the structure `fit` itself returns, is used level by level.
        """
        if all(isinstance(entry, dict) for entry in templates):
            return [templates] * n_levels
        if len(templates) != n_levels:
            raise ValueError(
                f"Expected {n_levels} per-level template lists, got {len(templates)}."
            )
        return list(templates)

    @staticmethod
    def process_level(
        its_graphs: List[Any],
        k: int,
        nodeLabelNames: List[str],
        nodeLabelDefault: Any,
        edgeAttribute: str,
        templates: Optional[List[Dict[str, Any]]],
        update_template: bool,
    ) -> Tuple[List[Any], List[Dict[str, Any]]]:
        """
        Processes one level by extracting rules from the input graphs and clustering them.

        Args:
            its_graphs (List[Any]): Input graphs to process. Each entry is unpacked as the
                positional arguments of `RuleExtraction.extract_reaction_rules`.
            k (int): The level; forwarded as `n_knn` to the rule extraction.
            nodeLabelNames (List[str]): Node attribute names used for matching.
            nodeLabelDefault (Any): Default values for those node attributes.
            edgeAttribute (str): The edge attribute name used for matching.
            templates (Optional[List[Dict[str, Any]]]): Templates used for clustering; None to
                cluster from scratch.
            update_template (bool): Whether to extend the templates with newly found clusters.

        Returns:
            Tuple[List[Any], List[Dict[str, Any]]]: The cluster index of every input graph and the
            potentially updated templates, as returned by `RuleCluster.fit`.
        """
        logging.info(f"Processing templates with {k}:")
        # Extract reaction rules from graphs; only the third element of each result is clustered
        rc_graphs = [
            RuleExtraction.extract_reaction_rules(*value, extend=True, n_knn=k)[2]
            for value in its_graphs
        ]

        # Fit the rule clusters with the extracted graphs and templates
        clusterer = RuleCluster(
            node_label_names=nodeLabelNames,
            node_label_default=nodeLabelDefault,
            edge_attribute=edgeAttribute,
        )
        cluster_indices, templates = clusterer.fit(
            rc_graphs, templates, update_template
        )

        return cluster_indices, templates

    def fit(
        self,
        original_reaction_dicts: List[Dict[str, Any]],
        its_column: str = "ITSGraph",
        templates: Optional[List[Any]] = None,
        update_template: bool = True,
        root_sample: int = 100,
    ) -> Optional[Tuple[List[Dict[str, Any]], List[List[Dict[str, Any]]]]]:
        """
        Fit the hierarchical clustering model to the data.

        Parameters:
            original_reaction_dicts (List[Dict[str, Any]]): List of reaction dictionaries. They are
                deep-copied; the input is not modified. Each must also contain 'GraphRules'.
            its_column (str): Column name for the ITS graph.
            templates (Optional[List[Any]]): Existing templates. Either the per-level list of
                template lists returned by a previous `fit`, or a flat list of template dicts
                (then shared by every level). None or empty to build templates from the data.
            update_template (bool): Whether unmatched graphs create new templates.
            root_sample (int): Without templates, the number of leading records clustered from
                scratch to seed the templates before the remaining records are matched.

        Returns:
            Optional[Tuple[List[Dict[str, Any]], List[List[Dict[str, Any]]]]]:
                The copied reaction dictionaries extended with 'Cluster_R{k}', 'Reaction Type'
                and 'Rings', plus one template list per level. None if an error occurred
                (the error is logged).
        """
        try:
            reaction_dicts = copy.deepcopy(original_reaction_dicts)
            its_graphs = [value[its_column] for value in reaction_dicts]
            if templates:
                logging.info("Processing with templates")
                level_templates = self._templates_per_level(
                    templates, len(self.radius)
                )
                results = [
                    self.process_level(
                        its_graphs,
                        k,
                        self.nodeLabelNames,
                        self.nodeLabelDefault,
                        self.edgeAttribute,
                        level_templates[k],
                        update_template,
                    )
                    for k in self.radius
                ]

                cluster_indices = [value[0] for value in results]
                templates = [value[1] for value in results]
            else:
                logging.info("Processing without templates")
                root_length = min(root_sample, len(its_graphs))

                its_root = its_graphs[:root_length]
                its_left = its_graphs[root_length:]

                logging.info(f"Processing {root_length} data to get templates")
                results = [
                    self.process_level(
                        its_root,
                        k,
                        self.nodeLabelNames,
                        self.nodeLabelDefault,
                        self.edgeAttribute,
                        None,
                        update_template,
                    )
                    for k in self.radius
                ]

                cluster_indices_root = [value[0] for value in results]
                templates_root = [value[1] for value in results]

                logging.info("Processing other data with new templates")
                results_left = [
                    self.process_level(
                        its_left,
                        k,
                        self.nodeLabelNames,
                        self.nodeLabelDefault,
                        self.edgeAttribute,
                        templates_root[k],
                        update_template,
                    )
                    for k in self.radius
                ]
                cluster_indices_left = [value[0] for value in results_left]
                templates = [value[1] for value in results_left]

                # Root labels come first, matching the record order of the split above.
                cluster_indices = [
                    root_labels + left_labels
                    for root_labels, left_labels in zip(
                        cluster_indices_root, cluster_indices_left
                    )
                ]

            # One 'Cluster_R{k}' column per level, turned into one dict per record.
            cluster_df = pd.DataFrame(
                {f"Cluster_R{k}": idx for k, idx in zip(self.radius, cluster_indices)}
            ).to_dict("records")

            for key, value in enumerate(reaction_dicts):
                value.update(cluster_df[key])
                value["Reaction Type"] = check_graph_type(value["GraphRules"][2])
                value["Rings"] = get_cycle_member_rings(value["GraphRules"][2])

            return reaction_dicts, templates

        except Exception as e:
            # Best-effort contract kept: report the failure (with traceback) and return None.
            logging.exception(f"An error occurred: {e}")
            return None


In [10]:
# Collect the 'ITSGraph' entry of every loaded record, in record order.
its_graphs = [value['ITSGraph'] for value in data]

In [None]:
# Configure the clustering to compare nodes on element and charge only and
# edges on 'order'; max_radius=1 gives the two levels 0 and 1.
node_label_names = ["element", "charge"]
hcl = HierarchicalClustering(node_label_names=node_label_names,
                            node_label_default=["*", 0],
                            edge_attribute="order",
                            max_radius=1)

In [82]:
# Cluster the first 1000 ITS graphs at level 0 from scratch (templates=None)
# and report how many templates were produced.
cluster_indices_0, temp_0 = hcl.process_level(its_graphs[:1000],0,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=None, update_template=True)
print(len(temp_0))

2024-05-31 15:42:30,799 - INFO - Processing templates with 0:


105


In [83]:
# Same as the level-0 run, but at level 1: cluster the first 1000 ITS graphs
# from scratch and report how many templates were produced.
cluster_indices_1, temp_1 = hcl.process_level(its_graphs[:1000],1,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=None, update_template=True)

print(len(temp_1))

2024-05-31 15:42:48,425 - INFO - Processing templates with 1:


228


In [108]:
from typing import List, Dict, Any

def split_graphs_by_class(graphs: List[Any], class_labels: List[int]) -> Dict[int, List[Any]]:
    """
    Groups graphs by their class label.

    Args:
        graphs (List[Any]): The graphs to group.
        class_labels (List[int]): The class label of each graph, aligned with `graphs`.

    Returns:
        Dict[int, List[Any]]: Maps each class label to the graphs carrying it. Labels appear
              in order of first occurrence; graphs keep their input order within a label.

    Raises:
        ValueError: If `graphs` and `class_labels` differ in length.
    """
    if len(class_labels) != len(graphs):
        raise ValueError("The length of 'graphs' and 'class_labels' must be the same.")

    grouped: Dict[int, List[Any]] = {}
    for label, graph in zip(class_labels, graphs):
        # setdefault creates the bucket the first time a label is seen.
        grouped.setdefault(label, []).append(graph)

    return grouped


# Number of distinct level-0 cluster labels among the first 1000 graphs.
len(split_graphs_by_class(its_graphs[:1000], cluster_indices_0))

105

In [109]:
from typing import List, Dict, Any, Tuple

def split_graphs_by_class_and_indices(graphs: List[Any], class_labels: List[int]) -> Tuple[Dict[int, List[Any]], Dict[int, List[int]]]:
    """
    Groups graphs, and their positions in the input, by class label.

    Args:
        graphs (List[Any]): The graphs to group.
        class_labels (List[int]): The class label of each graph, aligned with `graphs`.

    Returns:
        Tuple[Dict[int, List[Any]], Dict[int, List[int]]]: Two dictionaries keyed by class label:
            1. The graphs carrying that label, in input order.
            2. The input positions of those graphs, in the same order.

    Raises:
        ValueError: If `graphs` and `class_labels` differ in length.
    """
    if len(class_labels) != len(graphs):
        raise ValueError("The length of 'graphs' and 'class_labels' must be the same.")

    graphs_by_label: Dict[int, List[Any]] = {}
    positions_by_label: Dict[int, List[int]] = {}
    position = 0
    for label, graph in zip(class_labels, graphs):
        # Both dictionaries gain a label at the same moment, so they stay in step.
        graphs_by_label.setdefault(label, []).append(graph)
        positions_by_label.setdefault(label, []).append(position)
        position += 1

    return graphs_by_label, positions_by_label

# Group the first 1000 ITS graphs by level-0 label, keeping each graph's original position.
graph_dict, index_dict = split_graphs_by_class_and_indices(its_graphs[:1000], cluster_indices_0)

In [111]:
# Number of level-0 groups.
len(graph_dict)

105

In [115]:
# Number of level-0 groups (re-run of the same check).
len(graph_dict)

105

In [135]:
# Level-1 clustering performed separately inside each level-0 group, with one
# template list shared across all groups.
templates = []
# 'a' is a placeholder that every position should lose once its group is processed.
cluster_indice_all = ['a' for i in range(len(its_graphs[:1000]))]
for key, value in graph_dict.items():
    # Template count before this group is processed.
    print(len(templates))
    # NOTE(review): `new_templates` is never read back; the loop appears to rely on
    # `templates` being extended in place by the call — confirm against RuleCluster.fit.
    cluster_indices_batch, new_templates = hcl.process_level(value,1,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=templates, update_template=True)
    key_index = index_dict[key]

    # Write each group-local label back to the graph's original position.
    for i, j in enumerate(key_index):
        cluster_indice_all[j] = cluster_indices_batch[i]
    

2024-05-31 16:09:41,834 - INFO - Processing templates with 1:


2024-05-31 16:09:41,874 - INFO - Processing templates with 1:
2024-05-31 16:09:41,899 - INFO - Processing templates with 1:
2024-05-31 16:09:41,988 - INFO - Processing templates with 1:
2024-05-31 16:09:42,013 - INFO - Processing templates with 1:
2024-05-31 16:09:42,024 - INFO - Processing templates with 1:
2024-05-31 16:09:42,026 - INFO - Processing templates with 1:
2024-05-31 16:09:42,031 - INFO - Processing templates with 1:


0
7
11
11
11
13
15
15


2024-05-31 16:09:42,077 - INFO - Processing templates with 1:
2024-05-31 16:09:42,090 - INFO - Processing templates with 1:
2024-05-31 16:09:42,123 - INFO - Processing templates with 1:
2024-05-31 16:09:42,134 - INFO - Processing templates with 1:
2024-05-31 16:09:42,137 - INFO - Processing templates with 1:
2024-05-31 16:09:42,164 - INFO - Processing templates with 1:
2024-05-31 16:09:42,168 - INFO - Processing templates with 1:
2024-05-31 16:09:42,178 - INFO - Processing templates with 1:
2024-05-31 16:09:42,215 - INFO - Processing templates with 1:
2024-05-31 16:09:42,231 - INFO - Processing templates with 1:
2024-05-31 16:09:42,232 - INFO - Processing templates with 1:
2024-05-31 16:09:42,268 - INFO - Processing templates with 1:
2024-05-31 16:09:42,273 - INFO - Processing templates with 1:
2024-05-31 16:09:42,274 - INFO - Processing templates with 1:


15
17
17
17
19
24
24
24
24
24
24
24
24
24


2024-05-31 16:09:42,281 - INFO - Processing templates with 1:
2024-05-31 16:09:42,285 - INFO - Processing templates with 1:
2024-05-31 16:09:42,307 - INFO - Processing templates with 1:
2024-05-31 16:09:42,313 - INFO - Processing templates with 1:
2024-05-31 16:09:42,317 - INFO - Processing templates with 1:
2024-05-31 16:09:42,322 - INFO - Processing templates with 1:
2024-05-31 16:09:42,336 - INFO - Processing templates with 1:
2024-05-31 16:09:42,337 - INFO - Processing templates with 1:
2024-05-31 16:09:42,338 - INFO - Processing templates with 1:
2024-05-31 16:09:42,342 - INFO - Processing templates with 1:
2024-05-31 16:09:42,351 - INFO - Processing templates with 1:
2024-05-31 16:09:42,357 - INFO - Processing templates with 1:
2024-05-31 16:09:42,359 - INFO - Processing templates with 1:
2024-05-31 16:09:42,361 - INFO - Processing templates with 1:
2024-05-31 16:09:42,364 - INFO - Processing templates with 1:
2024-05-31 16:09:42,379 - INFO - Processing templates with 1:
2024-05-

27
27
27
28
28
33
34
35
35
35
35
35
35
35
36
36
36
36
37
38
38
38
39
39
40
40
41
41
41
41
41
41
43
43
43
44
44
44
44
46
46
47
47
47
47


2024-05-31 16:09:42,482 - INFO - Processing templates with 1:
2024-05-31 16:09:42,487 - INFO - Processing templates with 1:
2024-05-31 16:09:42,492 - INFO - Processing templates with 1:
2024-05-31 16:09:42,493 - INFO - Processing templates with 1:
2024-05-31 16:09:42,496 - INFO - Processing templates with 1:
2024-05-31 16:09:42,499 - INFO - Processing templates with 1:
2024-05-31 16:09:42,502 - INFO - Processing templates with 1:
2024-05-31 16:09:42,503 - INFO - Processing templates with 1:
2024-05-31 16:09:42,504 - INFO - Processing templates with 1:
2024-05-31 16:09:42,506 - INFO - Processing templates with 1:
2024-05-31 16:09:42,507 - INFO - Processing templates with 1:
2024-05-31 16:09:42,508 - INFO - Processing templates with 1:
2024-05-31 16:09:42,510 - INFO - Processing templates with 1:
2024-05-31 16:09:42,512 - INFO - Processing templates with 1:
2024-05-31 16:09:42,514 - INFO - Processing templates with 1:
2024-05-31 16:09:42,516 - INFO - Processing templates with 1:
2024-05-

49
49
49
50
50
52
52
52
52
52
52
52
54
54
54
55
55
56
56
56
57
57
58
59
59
59
61
62
62
63
64
64
64
64
64
65
65
66


In [132]:
# Side-by-side table of the level-0 labels and the per-group level-1 labels.
pd.concat([pd.DataFrame(cluster_indices_0), pd.DataFrame(cluster_indice_all)], axis=1)

Unnamed: 0,0,0.1
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
995,2,0
996,2,0
997,66,1
998,30,1


In [127]:
# Display the per-graph labels collected by the per-group loop.
cluster_indice_all

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 3,
 4,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 5,
 1,
 0,
 6,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 2,
 7,
 0,
 0,
 0,
 2,
 3,
 0,
 1,
 2,
 1,
 0,
 3,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 1,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 2,
 0,
 1,
 1,
 4,
 0,
 1,
 0,
 0,
 0,
 6,
 2,
 0,
 0,
 0,
 5,
 0,
 8,
 3,
 2,
 3,
 0,
 0,
 3,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 1,
 3,
 0,
 0,
 0,
 0,
 0,
 9,
 0,
 0,
 4,
 0,
 0,
 2,
 0,
 0,
 9,
 0,
 6,
 0,
 0,
 2,
 7,
 3,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 1,
 1,
 0,
 0,
 2,
 3,
 1,
 8,
 0,
 0,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 3,
 0,
 0,
 9,
 1,
 1,
 2,
 0,
 1,
 3,
 2,
 0,
 10,
 1,
 0,
 0,
 0,
 2,
 4,
 5,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 8,
 0,
 7,
 0,
 0,
 3,
 2,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 10,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 11,
 6,
 1,
 1,
 0,
 0,
 1,
 

In [123]:
# Display the current template list.
templates

[]

In [114]:
# Number of templates currently held.
len(templates)

67

In [71]:
# First ten labels from the direct level-1 clustering.
cluster_indices_1[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 0, 8]

In [73]:
# First ten labels from the per-group level-1 clustering, for comparison.
cluster_indice_all[:10]

[0, 6, 0, 2, 9, 10, 6, 0, 0, 11]

In [76]:
# How many graphs fall under each label in the direct level-1 clustering.
pd.DataFrame(cluster_indices_1).value_counts()

0  
2      83
10     47
0      34
40     34
21     27
       ..
126     1
125     1
13      1
122     1
227     1
Name: count, Length: 228, dtype: int64

In [75]:
# How many graphs fall under each label in the per-group level-1 clustering.
pd.DataFrame(cluster_indice_all).value_counts()

0 
0     375
2     201
4      80
7      52
23     37
13     33
6      31
9      30
17     26
15     18
11     13
19     13
8      12
10     11
18     10
20      8
28      7
29      7
1       5
3       5
24      5
25      5
12      4
26      4
5       2
27      2
16      1
14      1
21      1
22      1
Name: count, dtype: int64

In [4]:
#from SynTemp.SynRule.hierarchical_clustering import HierarchicalClustering
# Full hierarchical fit over every record, using the default root_sample (100)
# to seed the templates before the remaining records are matched.
node_label_names = ["element", "charge"]
hcl = HierarchicalClustering(node_label_names=node_label_names,
                            node_label_default=["*", 0],
                            edge_attribute="order",
                            max_radius=1)

# data[:] passes a shallow copy of the record list.
reaction_dicts, templates = hcl.fit(data[:])

2024-05-31 15:06:54,193 - INFO - Processing without templates
2024-05-31 15:06:54,194 - INFO - Processing 100 data to get templates
2024-05-31 15:06:54,195 - INFO - Processing templates with 0:
2024-05-31 15:06:54,265 - INFO - Processing templates with 1:
2024-05-31 15:06:54,412 - INFO - Processing other data with new templates
2024-05-31 15:06:54,413 - INFO - Processing templates with 0:


Processing templates with 0:
Processing templates with 1:
Processing templates with 0:


2024-05-31 15:07:19,944 - INFO - Processing templates with 1:


Processing templates with 1:


In [9]:
# Same fit as before, but seeding the templates from the first 1000 records.
node_label_names = ["element", "charge"]
hcl = HierarchicalClustering(node_label_names=node_label_names,
                            node_label_default=["*", 0],
                            edge_attribute="order",
                            max_radius=1)

reaction_dicts, templates = hcl.fit(data[:], root_sample=1000)

2024-05-31 15:16:17,696 - INFO - Processing without templates
2024-05-31 15:16:17,698 - INFO - Processing 1000 data to get templates
2024-05-31 15:16:17,698 - INFO - Processing templates with 0:
2024-05-31 15:16:18,496 - INFO - Processing templates with 1:
2024-05-31 15:16:20,842 - INFO - Processing other data with new templates
2024-05-31 15:16:20,843 - INFO - Processing templates with 0:
2024-05-31 15:16:31,918 - INFO - Processing templates with 1:


In [7]:
# Tabulate the reaction records returned by the fit.
pd.DataFrame(reaction_dicts)

Unnamed: 0,R-id,ITSGraph,GraphRules,Cluster_R0,Cluster_R1,Reaction Type,Rings
0,48358,"((1, 2, 3, 20, 18, 17, 19, 4, 5, 6, 7, 8, 9, 1...","((4, 3, 20, 21), (4, 3, 20, 21), (4, 3, 20, 21))",0,0,Single Cyclic,[4]
1,27953,"((1, 2, 29, 28, 26, 25, 27, 32, 33, 34, 31, 24...","((2, 3, 29, 30), (2, 3, 29, 30), (2, 3, 29, 30))",1,1,Single Cyclic,[4]
2,46325,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 40, 41, 24, 2...","((43, 10, 11, 42), (43, 10, 11, 42), (43, 10, ...",2,2,Single Cyclic,[4]
3,39695,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((40, 18, 19, 39), (40, 18, 19, 39), (40, 18, ...",3,3,Single Cyclic,[4]
4,49705,"((13, 14, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","((1, 2, 14), (1, 2, 14), (1, 2, 14))",4,4,Acyclic,[]
...,...,...,...,...,...,...,...
34580,500,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((24, 29, 30, 23), (24, 29, 30, 23), (24, 29, ...",0,0,Single Cyclic,[4]
34581,25189,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 21, 22, 23, 1...","((24, 10, 11, 23), (24, 10, 11, 23), (24, 10, ...",0,16,Single Cyclic,[4]
34582,36852,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((26, 43, 42, 28), (26, 43, 42, 28), (26, 43, ...",0,0,Single Cyclic,[4]
34583,21990,"((17, 16, 18, 19, 15, 13, 14, 12, 11, 9, 10, 3...","((8, 9, 37, 38), (8, 9, 37, 38), (8, 9, 37, 38))",0,0,Single Cyclic,[4]
