In [1]:
from SynTemp.SynUtils.utils import load_from_pickle
data = load_from_pickle('./Data/test..pkl.gz')

In [34]:
from typing import List, Any, Dict, Tuple
import pandas as pd
import copy
from joblib import Parallel, delayed
from SynTemp.SynRule.rules_extraction import RuleExtraction
from SynTemp.SynRule.rule_cluster import RuleCluster
from SynTemp.SynUtils.graph_utils import check_graph_type, get_cycle_member_rings
import logging

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


class HierarchicalClustering(RuleCluster):
    def __init__(
        self,
        node_label_names: List[str] = [
            "element",
            "aromatic",
            "hcount",
            "charge",
            "typesGH",
        ],
        node_label_default: List[Any] = ["*", False, 0, 0, ()],
        edge_attribute: str = "order",
        max_radius: int = 3,
    ):
        """
        Initializes the HierarchicalClustering with customization options for node and edge matching functions.

        Parameters:
            node_label_names (List[str]): Node attribute names for matching.
            node_label_default (List[Any]): Default values for node attributes.
            edge_attribute (str): Edge attribute name for matching.
            max_radius (int): Maximum number of hierarchical levels.
        """
        super().__init__()
        self.radius = list(range(max_radius + 1))
        self.nodeLabelDefault = node_label_default
        self.nodeLabelNames = node_label_names
        self.edgeAttribute = edge_attribute
    @staticmethod
    def split_graphs_by_class_and_indices(graphs: List[Any], class_labels: List[int]) -> Tuple[Dict[int, List[Any]], Dict[int, List[int]]]:
        """
        Splits a list of graphs and their indices into separate lists based on the provided class labels.

        Args:
            graphs (List[Any]): The list of graphs to be split.
            class_labels (List[int]): The list containing class labels corresponding to each graph.

        Returns:
            Tuple[Dict[int, List[Any]], Dict[int, List[int]]]: A tuple containing two dictionaries:
                1. A dictionary where keys are class labels and values are lists of graphs corresponding to each class.
                2. A dictionary where keys are class labels and values are lists of indices corresponding to each class.
        """
        if len(graphs) != len(class_labels):
            raise ValueError("The length of 'graphs' and 'class_labels' must be the same.")

        class_dict = {}
        index_dict = {}
        for index, (label, graph) in enumerate(zip(class_labels, graphs)):
            if label not in class_dict:
                class_dict[label] = []
                index_dict[label] = []
            class_dict[label].append(graph)
            index_dict[label].append(index)

        return class_dict, index_dict

    @staticmethod
    def process_level(
            its_graphs: List[Any],
            k: int,
            nodeLabelNames: List[str],
            nodeLabelDefault: Any,
            edgeAttribute: str,
            templates: Dict,
            update_template: bool,
        ) -> Tuple[Dict, Dict]:
            """
            Processes a level in a graph by extracting rules from the input graphs and clustering them.

            Args:
                its_graphs (List[Any]): A list of input graphs to process.
                k (int): The number of nearest neighbors to consider in the k-NN algorithm for rule extraction.
                nodeLabelNames (List[str]): A list of labels for the nodes in the graph.
                nodeCountDefault (Any): The default value for node labels if no label is specified.
                edgeAttribute (str): The attribute name of the edges used in rule extraction.
                templates (Dict): A dictionary of templates used for clustering rules.
                update_template (bool): A flag to determine whether to update the templates after clustering.

            Returns:
                Tuple[Dict, Dict]: A tuple containing the mapping of graphs to clusters and the potentially updated templates.
            """
            logging.info(f"Processing templates with {k}:")
            rc_graphs = [
                RuleExtraction.extract_reaction_rules(*value, extend=True, n_knn=k)[2]
                for value in its_graphs
            ]

            # Fit the rule clusters with the extracted graphs and templates
            cluster_indices, templates = RuleCluster(
                node_label_names=nodeLabelNames,
                node_label_default=nodeLabelDefault,
                edge_attribute=edgeAttribute,
            ).fit(rc_graphs, templates, update_template)

            return cluster_indices, templates
        
    
    def process_child_level(self, parent_graphs, parent_cluster_indices, node_label_names, 
                            radius=1, nodeLabelDefault=["*", 0], edgeAttribute="order",
                            templates = None, update_template = False):
        """
        Process graphs by clusters, updating templates and indices based on the specified node label names.

        Args:
        - graphs (list): A list of graph structures to be processed.
        - indices (list): A list of indices representing cluster identifications.
        - node_label_names (list): A list of node label names used in the graph processing.

        Returns:
        - tuple:
            - templates (list): A list of template dictionaries generated during processing.
            - cluster_indices_all (list): Updated list of all cluster indices after processing.
        """
        graph_dict, index_dict = self.split_graphs_by_class_and_indices(parent_graphs, parent_cluster_indices)
        templates = []
        cluster_indices_all = ['a'] * len(parent_graphs)
        max_index_template = 0

        for key, value in graph_dict.items():
            cluster_indices_batch, new_templates = self.process_level(
                value, radius, nodeLabelNames=node_label_names,
                nodeLabelDefault=nodeLabelDefault, edgeAttribute=edgeAttribute, 
                templates=None, update_template=update_template
            )
            
            cluster_indices_batch = [i + max_index_template for i in cluster_indices_batch]
            new_templates = [{'Cluster_id': template['Cluster_id'] + max_index_template, 'RC': template['RC'], 'Parent': key} for template in new_templates]
            max_index_template += len(new_templates)
            templates.extend(new_templates)
            
            for i, j in enumerate(index_dict[key]):
                cluster_indices_all[j] = cluster_indices_batch[i]

        return cluster_indices_all, templates

    def fit(
        self,
        original_reaction_dicts: List[Dict[str, Any]],
        its_column: str = "ITSGraph",
        templates: List[Dict] = None,
        update_template: bool = True,
        root_sample: int = 100,
    ) -> List[Dict[str, Any]]:
        """
        Fit the hierarchical clustering model to the data.

        Parameters:
            original_reaction_dicts (List[Dict[str, Any]]): List of reaction dictionaries.
            its_column (str): Column name for the ITS graph.

        Returns:
            List[Dict[str, Any]]: Updated reaction dictionaries with clustering information.
        """
        try:
            reaction_dicts = copy.deepcopy(original_reaction_dicts)
            its_graphs = [value[its_column] for value in reaction_dicts]
            # if templates:
            
            logging.info("Processing with templates")
            logging.info("Parent level")
            cluster_indices_0, templates_0 = self.process_level(
                its_graphs, 0, self.nodeLabelNames, self.nodeLabelDefault, self.edgeAttribute, templates, update_template
            )
            logging.info(len(templates_0))
            print(templates_0)
            cluster_indices = [cluster_indices_0]
            templates = [templates_0]
            print(len(templates))
            parent_cluster_indices = cluster_indices_0
            for k in self.radius: 
                if k > 0:
                    logging.info(f"Child level with radius {k}")
                    cluster_indices_k, templates_k = self.process_child_level(
                        its_graphs,
                        parent_cluster_indices,
                        self.nodeLabelNames,
                        k,
                        self.nodeLabelDefault,
                        self.edgeAttribute,
                        None,
                        update_template
                    )
                    print(len(templates_k))
                    cluster_indices.append(cluster_indices_k)
                    templates.append(templates_k)
                    print(len(templates))
                    parent_cluster_indices = cluster_indices_k
            

            #cluster_indices = cluster_indices.append([value[0] for value in results])
            #templates = cluster_indices.append([value[1] for value in results])
            # else:
            #     logging.info("Processing without templates")
            #     root_length = min(root_sample, len(its_graphs))

            #     its_root = its_graphs[:root_length]
            #     its_left = its_graphs[root_length:]

            #     logging.info(f"Processing {root_length} data to get templates")
            #     results = [
            #         self.process_level(
            #             its_root,
            #             k,
            #             self.nodeLabelNames,
            #             self.nodeLabelDefault,
            #             self.edgeAttribute,
            #             None,
            #             update_template,
            #         )
            #         for k in self.radius
            #     ]

            #     cluster_indices_root = [value[0] for value in results]
            #     templates_root = [value[1] for value in results]

            #     logging.info("Processing other data with new templates")
            #     results_left = [
            #         self.process_level(
            #             its_left,
            #             k,
            #             self.nodeLabelNames,
            #             self.nodeLabelDefault,
            #             self.edgeAttribute,
            #             templates_root[k],
            #             update_template,
            #         )
            #         for k in self.radius
            #     ]
            #     cluster_indices_left = [value[0] for value in results_left]
            #     templates = [value[1] for value in results_left]

            #     cluster_indices = [
            #         cluster_indices_root[key] + cluster_indices_left[key]
            #         for key, _ in enumerate(cluster_indices_root)
            #     ]

            cluster_df = pd.DataFrame(
                {f"Cluster_R{k}": idx for k, idx in zip(self.radius, cluster_indices)}
            ).to_dict("records")

            for key, value in enumerate(reaction_dicts):
                value.update(cluster_df[key])
                value["Reaction Type"] = check_graph_type(value["GraphRules"][2])
                value["Rings"] = get_cycle_member_rings(value["GraphRules"][2])

            return reaction_dicts, templates

        except Exception as e:
            print(f"An error occurred: {e}")


In [4]:
from SynTemp.SynRule.hierarchical_clustering import HierarchicalClustering
node_label_names = ["element", "charge"]
hcl = HierarchicalClustering(node_label_names=node_label_names,
                            node_label_default=["*", 0],
                            edge_attribute="order",
                            max_radius=3)

In [5]:
reaction_dicts, templates = hcl.fit(data, 'ITSGraph', templates=None, update_template=True, root_sample=100)

2024-06-01 09:15:12,461 - INFO - Processing with templates
2024-06-01 09:15:12,461 - INFO - Parent level
2024-06-01 09:15:12,461 - INFO - Processing templates with 0:
2024-06-01 09:15:12,843 - INFO - 105
2024-06-01 09:15:12,844 - INFO - Child level with radius 1
2024-06-01 09:15:12,844 - INFO - Processing templates with 1:
2024-06-01 09:15:12,866 - INFO - Processing templates with 1:
2024-06-01 09:15:12,875 - INFO - Processing templates with 1:
2024-06-01 09:15:12,923 - INFO - Processing templates with 1:
2024-06-01 09:15:12,931 - INFO - Processing templates with 1:
2024-06-01 09:15:12,935 - INFO - Processing templates with 1:
2024-06-01 09:15:12,936 - INFO - Processing templates with 1:
2024-06-01 09:15:12,938 - INFO - Processing templates with 1:
2024-06-01 09:15:12,984 - INFO - Processing templates with 1:
2024-06-01 09:15:12,987 - INFO - Processing templates with 1:
2024-06-01 09:15:12,999 - INFO - Processing templates with 1:
2024-06-01 09:15:13,005 - INFO - Processing templates w

[{'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph object at 0x13242f7d0>, 'Parent': []}, {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph object at 0x1775ba890>, 'Parent': []}, {'Cluster_id': 2, 'RC': <networkx.classes.graph.Graph object at 0x177619bd0>, 'Parent': []}, {'Cluster_id': 3, 'RC': <networkx.classes.graph.Graph object at 0x177669f90>, 'Parent': []}, {'Cluster_id': 4, 'RC': <networkx.classes.graph.Graph object at 0x1775fdad0>, 'Parent': []}, {'Cluster_id': 5, 'RC': <networkx.classes.graph.Graph object at 0x17756fe90>, 'Parent': []}, {'Cluster_id': 6, 'RC': <networkx.classes.graph.Graph object at 0x168fa0090>, 'Parent': []}, {'Cluster_id': 7, 'RC': <networkx.classes.graph.Graph object at 0x1775cde10>, 'Parent': []}, {'Cluster_id': 8, 'RC': <networkx.classes.graph.Graph object at 0x137644c90>, 'Parent': []}, {'Cluster_id': 9, 'RC': <networkx.classes.graph.Graph object at 0x1376353d0>, 'Parent': []}, {'Cluster_id': 10, 'RC': <networkx.classes.graph.Graph object at 0x13

2024-06-01 09:15:13,050 - INFO - Processing templates with 1:
2024-06-01 09:15:13,066 - INFO - Processing templates with 1:
2024-06-01 09:15:13,071 - INFO - Processing templates with 1:
2024-06-01 09:15:13,071 - INFO - Processing templates with 1:
2024-06-01 09:15:13,073 - INFO - Processing templates with 1:
2024-06-01 09:15:13,074 - INFO - Processing templates with 1:
2024-06-01 09:15:13,083 - INFO - Processing templates with 1:
2024-06-01 09:15:13,085 - INFO - Processing templates with 1:
2024-06-01 09:15:13,087 - INFO - Processing templates with 1:
2024-06-01 09:15:13,089 - INFO - Processing templates with 1:
2024-06-01 09:15:13,093 - INFO - Processing templates with 1:
2024-06-01 09:15:13,093 - INFO - Processing templates with 1:
2024-06-01 09:15:13,094 - INFO - Processing templates with 1:
2024-06-01 09:15:13,096 - INFO - Processing templates with 1:
2024-06-01 09:15:13,101 - INFO - Processing templates with 1:
2024-06-01 09:15:13,104 - INFO - Processing templates with 1:
2024-06-

278
2


2024-06-01 09:15:13,394 - INFO - Processing templates with 2:
2024-06-01 09:15:13,403 - INFO - Processing templates with 2:
2024-06-01 09:15:13,408 - INFO - Processing templates with 2:
2024-06-01 09:15:13,411 - INFO - Processing templates with 2:
2024-06-01 09:15:13,416 - INFO - Processing templates with 2:
2024-06-01 09:15:13,420 - INFO - Processing templates with 2:
2024-06-01 09:15:13,426 - INFO - Processing templates with 2:
2024-06-01 09:15:13,428 - INFO - Processing templates with 2:
2024-06-01 09:15:13,433 - INFO - Processing templates with 2:
2024-06-01 09:15:13,435 - INFO - Processing templates with 2:
2024-06-01 09:15:13,435 - INFO - Processing templates with 2:
2024-06-01 09:15:13,436 - INFO - Processing templates with 2:
2024-06-01 09:15:13,436 - INFO - Processing templates with 2:
2024-06-01 09:15:13,439 - INFO - Processing templates with 2:
2024-06-01 09:15:13,440 - INFO - Processing templates with 2:
2024-06-01 09:15:13,441 - INFO - Processing templates with 2:
2024-06-

723
3


2024-06-01 09:15:13,926 - INFO - Processing templates with 3:
2024-06-01 09:15:13,927 - INFO - Processing templates with 3:
2024-06-01 09:15:13,927 - INFO - Processing templates with 3:
2024-06-01 09:15:13,928 - INFO - Processing templates with 3:
2024-06-01 09:15:13,928 - INFO - Processing templates with 3:
2024-06-01 09:15:13,929 - INFO - Processing templates with 3:
2024-06-01 09:15:13,930 - INFO - Processing templates with 3:
2024-06-01 09:15:13,931 - INFO - Processing templates with 3:
2024-06-01 09:15:13,932 - INFO - Processing templates with 3:
2024-06-01 09:15:13,932 - INFO - Processing templates with 3:
2024-06-01 09:15:13,936 - INFO - Processing templates with 3:
2024-06-01 09:15:13,937 - INFO - Processing templates with 3:
2024-06-01 09:15:13,938 - INFO - Processing templates with 3:
2024-06-01 09:15:13,942 - INFO - Processing templates with 3:
2024-06-01 09:15:13,943 - INFO - Processing templates with 3:
2024-06-01 09:15:13,944 - INFO - Processing templates with 3:
2024-06-

942
4


In [8]:
import pandas as pd
pd.DataFrame(reaction_dicts)

Unnamed: 0,R-id,ITSGraph,GraphRules,Cluster_R0,Cluster_R1,Cluster_R2,Cluster_R3,Reaction Type,Rings
0,48358,"((1, 2, 3, 20, 18, 17, 19, 4, 5, 6, 7, 8, 9, 1...","((4, 3, 20, 21), (4, 3, 20, 21), (4, 3, 20, 21))",0,0,0,0,Single Cyclic,[4]
1,27953,"((1, 2, 29, 28, 26, 25, 27, 32, 33, 34, 31, 24...","((2, 3, 29, 30), (2, 3, 29, 30), (2, 3, 29, 30))",1,16,14,1,Single Cyclic,[4]
2,46325,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 40, 41, 24, 2...","((43, 10, 11, 42), (43, 10, 11, 42), (43, 10, ...",2,23,15,2,Single Cyclic,[4]
3,39695,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((40, 18, 19, 39), (40, 18, 19, 39), (40, 18, ...",3,48,54,4,Single Cyclic,[4]
4,49705,"((13, 14, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","((1, 2, 14), (1, 2, 14), (1, 2, 14))",4,52,64,5,Acyclic,[]
...,...,...,...,...,...,...,...,...,...
995,17415,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 32, 2...","((32, 33, 11, 13), (32, 33, 11, 13), (32, 33, ...",2,23,36,486,Single Cyclic,[4]
996,34847,"((1, 2, 3, 4, 5, 6, 35, 8, 7, 36, 9, 10, 11, 1...","((36, 37, 6, 7), (36, 37, 6, 7), (36, 37, 6, 7))",2,23,53,939,Single Cyclic,[4]
997,39409,"((6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, ...","((2, 5, 6, 22, 23, 24, 25), (2, 5, 6, 22, 23, ...",66,232,722,940,Complex Cyclic,"[4, 4]"
998,2387,"((20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 1, 2...","((32, 33, 19, 20), (32, 33, 19, 20), (32, 33, ...",30,165,411,221,Single Cyclic,[4]


In [21]:
a= []
a.append([1,2,3])
a

[[1, 2, 3]]

In [27]:
len(templates[1])

278

In [51]:
templates

[{'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph at 0x3ffd79110>},
 {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph at 0x3ffd79310>},
 {'Cluster_id': 2, 'RC': <networkx.classes.graph.Graph at 0x3ffd44e10>},
 {'Cluster_id': 3, 'RC': <networkx.classes.graph.Graph at 0x3f96c0050>},
 {'Cluster_id': 4, 'RC': <networkx.classes.graph.Graph at 0x3ffd47210>},
 {'Cluster_id': 5, 'RC': <networkx.classes.graph.Graph at 0x3f5644150>},
 {'Cluster_id': 6, 'RC': <networkx.classes.graph.Graph at 0x3ffd44d10>},
 {'Cluster_id': 7, 'RC': <networkx.classes.graph.Graph at 0x3ffd25e90>},
 {'Cluster_id': 8, 'RC': <networkx.classes.graph.Graph at 0x16b0e8250>},
 {'Cluster_id': 9, 'RC': <networkx.classes.graph.Graph at 0x3ffce68d0>},
 {'Cluster_id': 10, 'RC': <networkx.classes.graph.Graph at 0x15d4480d0>},
 {'Cluster_id': 11, 'RC': <networkx.classes.graph.Graph at 0x15d61a110>},
 {'Cluster_id': 12, 'RC': <networkx.classes.graph.Graph at 0x3ffcfa950>},
 {'Cluster_id': 13, 'RC': <networkx.classes.grap

In [43]:
templates

[{'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph at 0x3f0613990>},
 {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph at 0x3f8bcc6d0>},
 {'Cluster_id': 2, 'RC': <networkx.classes.graph.Graph at 0x3f8bf1a50>},
 {'Cluster_id': 3, 'RC': <networkx.classes.graph.Graph at 0x3f8bfe790>},
 {'Cluster_id': 4, 'RC': <networkx.classes.graph.Graph at 0x3f8b8e850>},
 {'Cluster_id': 5, 'RC': <networkx.classes.graph.Graph at 0x3f8c2c910>},
 {'Cluster_id': 6, 'RC': <networkx.classes.graph.Graph at 0x3f06eb0d0>},
 {'Cluster_id': 7, 'RC': <networkx.classes.graph.Graph at 0x121bcbb10>},
 {'Cluster_id': 8, 'RC': <networkx.classes.graph.Graph at 0x3f8bfd950>},
 {'Cluster_id': 9, 'RC': <networkx.classes.graph.Graph at 0x3f05f5850>},
 {'Cluster_id': 10, 'RC': <networkx.classes.graph.Graph at 0x3f0681c50>},
 {'Cluster_id': 11, 'RC': <networkx.classes.graph.Graph at 0x3f0cc7cd0>},
 {'Cluster_id': 12, 'RC': <networkx.classes.graph.Graph at 0x3f0ca8790>},
 {'Cluster_id': 13, 'RC': <networkx.classes.grap

In [21]:
len(data)

1000

In [5]:
templates

[[{'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph at 0x132490fd0>},
  {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph at 0x110ff8a90>},
  {'Cluster_id': 2, 'RC': <networkx.classes.graph.Graph at 0x175cec950>},
  {'Cluster_id': 3, 'RC': <networkx.classes.graph.Graph at 0x175ceee10>},
  {'Cluster_id': 4, 'RC': <networkx.classes.graph.Graph at 0x175d35310>},
  {'Cluster_id': 5, 'RC': <networkx.classes.graph.Graph at 0x174d80110>},
  {'Cluster_id': 6, 'RC': <networkx.classes.graph.Graph at 0x175d82590>},
  {'Cluster_id': 7, 'RC': <networkx.classes.graph.Graph at 0x175d826d0>},
  {'Cluster_id': 8, 'RC': <networkx.classes.graph.Graph at 0x175caa450>},
  {'Cluster_id': 9, 'RC': <networkx.classes.graph.Graph at 0x15d9180d0>},
  {'Cluster_id': 10, 'RC': <networkx.classes.graph.Graph at 0x15c186a90>},
  {'Cluster_id': 11, 'RC': <networkx.classes.graph.Graph at 0x15c0e3050>},
  {'Cluster_id': 12, 'RC': <networkx.classes.graph.Graph at 0x15c134690>},
  {'Cluster_id': 13, 'RC': <network

In [8]:
import pandas as pd
data = pd.DataFrame(reaction_dicts)

In [14]:
len(templates[1])

134

In [11]:
data['Cluster_R0'].value_counts()

Cluster_R0
0     716
12     52
4      48
21     39
8      16
2      14
7      10
24      8
15      5
14      5
28      5
9       5
3       5
40      4
42      4
18      4
35      4
16      3
23      3
1       2
39      2
43      2
36      2
45      2
46      2
11      2
25      2
51      2
5       2
19      2
13      2
26      2
41      1
50      1
49      1
48      1
47      1
44      1
52      1
53      1
54      1
33      1
38      1
37      1
34      1
10      1
32      1
31      1
30      1
29      1
27      1
22      1
20      1
17      1
6       1
55      1
Name: count, dtype: int64

In [25]:
its_graphs = [value['ITSGraph'] for value in data]

In [27]:
cluster_indices_0, temp_0 = hcl.process_level(its_graphs[:1000],0,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=None, update_template=True)
print(len(temp_0))

2024-06-01 09:00:28,949 - INFO - Processing templates with 0:


105


In [28]:
cluster_indices_1, temp_1 = hcl.process_level(its_graphs[:1000],1,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=None, update_template=True)

print(len(temp_1))

2024-06-01 09:00:37,518 - INFO - Processing templates with 1:


278


In [29]:
cluster_indices_2, temp_2 = hcl.process_level(its_graphs[:1000],2,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=None, update_template=True)

print(len(temp_1))

2024-06-01 09:00:46,967 - INFO - Processing templates with 2:


278


In [30]:
from typing import List, Dict, Any, Tuple

def split_graphs_by_class_and_indices(graphs: List[Any], class_labels: List[int]) -> Tuple[Dict[int, List[Any]], Dict[int, List[int]]]:
    """
    Splits a list of graphs and their indices into separate lists based on the provided class labels.

    Args:
        graphs (List[Any]): The list of graphs to be split.
        class_labels (List[int]): The list containing class labels corresponding to each graph.

    Returns:
        Tuple[Dict[int, List[Any]], Dict[int, List[int]]]: A tuple containing two dictionaries:
            1. A dictionary where keys are class labels and values are lists of graphs corresponding to each class.
            2. A dictionary where keys are class labels and values are lists of indices corresponding to each class.
    """
    if len(graphs) != len(class_labels):
        raise ValueError("The length of 'graphs' and 'class_labels' must be the same.")

    class_dict = {}
    index_dict = {}
    for index, (label, graph) in enumerate(zip(class_labels, graphs)):
        if label not in class_dict:
            class_dict[label] = []
            index_dict[label] = []
        class_dict[label].append(graph)
        index_dict[label].append(index)

    return class_dict, index_dict

def process_graph_clusters(parent_graphs, parent_cluster_indices, node_label_names, radius=1, nodeLabelDefault=["*", 0], edgeAttribute="order",):
    """
    Process graphs by clusters, updating templates and indices based on the specified node label names.

    Args:
    - graphs (list): A list of graph structures to be processed.
    - indices (list): A list of indices representing cluster identifications.
    - node_label_names (list): A list of node label names used in the graph processing.

    Returns:
    - tuple:
        - templates (list): A list of template dictionaries generated during processing.
        - cluster_indices_all (list): Updated list of all cluster indices after processing.
    """
    graph_dict, index_dict = split_graphs_by_class_and_indices(parent_graphs, parent_cluster_indices)
    templates = []
    cluster_indices_all = ['a'] * len(parent_graphs)
    max_index_template = 0

    for key, value in graph_dict.items():
        #print(key, len(value))
        cluster_indices_batch, new_templates = hcl.process_level(
            value, radius, nodeLabelNames=node_label_names,
            nodeLabelDefault=nodeLabelDefault, edgeAttribute=edgeAttribute, templates=None, update_template=True
        )
        
        cluster_indices_batch = [i + max_index_template for i in cluster_indices_batch]
        new_templates = [{'Cluster_id': template['Cluster_id'] + max_index_template, 'RC': template['RC'], 'Parent': key} for template in new_templates]
        max_index_template += len(new_templates)
        templates.extend(new_templates)
        
        for i, j in enumerate(index_dict[key]):
            cluster_indices_all[j] = cluster_indices_batch[i]

    return cluster_indices_all, templates

In [31]:
cluster_indices_1, templates_1 = process_graph_clusters(its_graphs, cluster_indices_0, node_label_names, radius=1)

2024-06-01 09:00:58,256 - INFO - Processing templates with 1:
2024-06-01 09:00:58,291 - INFO - Processing templates with 1:
2024-06-01 09:00:58,301 - INFO - Processing templates with 1:
2024-06-01 09:00:58,350 - INFO - Processing templates with 1:
2024-06-01 09:00:58,359 - INFO - Processing templates with 1:
2024-06-01 09:00:58,363 - INFO - Processing templates with 1:
2024-06-01 09:00:58,364 - INFO - Processing templates with 1:
2024-06-01 09:00:58,366 - INFO - Processing templates with 1:
2024-06-01 09:00:58,411 - INFO - Processing templates with 1:
2024-06-01 09:00:58,414 - INFO - Processing templates with 1:
2024-06-01 09:00:58,425 - INFO - Processing templates with 1:
2024-06-01 09:00:58,429 - INFO - Processing templates with 1:
2024-06-01 09:00:58,430 - INFO - Processing templates with 1:
2024-06-01 09:00:58,441 - INFO - Processing templates with 1:
2024-06-01 09:00:58,442 - INFO - Processing templates with 1:
2024-06-01 09:00:58,448 - INFO - Processing templates with 1:
2024-06-

In [37]:
def process_child_level(parent_graphs, parent_cluster_indices, node_label_names, 
                            radius=1, nodeLabelDefault=["*", 0], edgeAttribute="order",
                            templates = None, update_template = False):
        """
        Process graphs by clusters, updating templates and indices based on the specified node label names.

        Args:
        - graphs (list): A list of graph structures to be processed.
        - indices (list): A list of indices representing cluster identifications.
        - node_label_names (list): A list of node label names used in the graph processing.

        Returns:
        - tuple:
            - templates (list): A list of template dictionaries generated during processing.
            - cluster_indices_all (list): Updated list of all cluster indices after processing.
        """
        graph_dict, index_dict = HierarchicalClustering.split_graphs_by_class_and_indices(parent_graphs, parent_cluster_indices)
        templates = []
        cluster_indices_all = ['a'] * len(parent_graphs)
        max_index_template = 0

        for key, value in graph_dict.items():
            cluster_indices_batch, new_templates = HierarchicalClustering.process_level(
                value, radius, nodeLabelNames=node_label_names,
                nodeLabelDefault=nodeLabelDefault, edgeAttribute=edgeAttribute, 
                templates=None, update_template=update_template
            )
            print(cluster_indices_batch)
            
            cluster_indices_batch = [i + max_index_template for i in cluster_indices_batch]
            new_templates = [{'Cluster_id': template['Cluster_id'] + max_index_template, 'RC': template['RC'], 'Parent': key} for template in new_templates]
            max_index_template += len(new_templates)
            templates.extend(new_templates)
            
            for i, j in enumerate(index_dict[key]):
                cluster_indices_all[j] = cluster_indices_batch[i]

        return cluster_indices_all, templates

In [38]:
cluster_indices_1, templates_1 = process_child_level(its_graphs, cluster_indices_0, node_label_names, radius=1)

2024-06-01 09:02:59,751 - INFO - Processing templates with 1:
2024-06-01 09:02:59,785 - INFO - Processing templates with 1:
2024-06-01 09:02:59,797 - INFO - Processing templates with 1:
2024-06-01 09:02:59,846 - INFO - Processing templates with 1:
2024-06-01 09:02:59,855 - INFO - Processing templates with 1:
2024-06-01 09:02:59,859 - INFO - Processing templates with 1:
2024-06-01 09:02:59,860 - INFO - Processing templates with 1:
2024-06-01 09:02:59,861 - INFO - Processing templates with 1:
2024-06-01 09:02:59,906 - INFO - Processing templates with 1:
2024-06-01 09:02:59,909 - INFO - Processing templates with 1:
2024-06-01 09:02:59,921 - INFO - Processing templates with 1:
2024-06-01 09:02:59,925 - INFO - Processing templates with 1:
2024-06-01 09:02:59,926 - INFO - Processing templates with 1:
2024-06-01 09:02:59,936 - INFO - Processing templates with 1:
2024-06-01 09:02:59,938 - INFO - Processing templates with 1:
2024-06-01 09:02:59,944 - INFO - Processing templates with 1:
2024-06-

[0, 0, 0, 1, 2, 3, 4, 4, 0, 1, 2, 4, 5, 0, 0, 6, 3, 0, 7, 0, 0, 8, 4, 4, 4, 1, 0, 7, 3, 0, 0, 3, 9, 4, 1, 9, 5, 0, 10, 0, 0, 4, 3, 2, 4, 3, 1, 4, 11, 12, 11, 13, 9, 0, 0, 0, 0, 3, 0, 14, 12, 15, 15, 10, 12, 12, 15, 0]
[0, 1, 2, 3, 2, 4, 2, 2, 2, 2, 2, 2, 1, 2, 5, 6, 2]
[0, 0, 1, 2, 1, 1, 3, 1, 4, 0, 1, 5, 0, 0, 6, 0, 4, 0, 0, 7, 0, 0, 0, 0, 8, 0, 9, 0, 4, 10, 1, 0, 0, 11, 5, 1, 0, 0, 4, 4, 12, 9, 0, 13, 0, 4, 0, 0, 14, 0, 3, 1, 0, 0, 0, 1, 15, 3, 0, 1, 9, 16, 0, 0, 1, 0, 0, 12, 0, 0, 0, 0, 0, 0, 17, 1, 18, 0, 12, 1, 0, 0, 17, 19, 7, 0, 0, 4, 0, 4, 0, 1, 0, 17, 1, 1, 0, 1, 1, 0, 0, 7, 1, 0, 20, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 21, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 22, 1, 0, 21, 23, 1, 0, 0, 4, 7, 4, 1, 1, 21, 1, 18, 0, 0, 0, 0, 1, 0, 12, 0, 3, 1, 18, 0, 0, 0, 24, 0, 0, 0, 4, 17, 1, 0, 0]
[0, 1, 2, 2, 0, 0, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 1, 0, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
[0, 1]
[0, 1, 1, 0]
[0, 1, 2, 3, 

2024-06-01 09:02:59,987 - INFO - Processing templates with 1:
2024-06-01 09:02:59,988 - INFO - Processing templates with 1:
2024-06-01 09:02:59,989 - INFO - Processing templates with 1:
2024-06-01 09:02:59,991 - INFO - Processing templates with 1:
2024-06-01 09:03:00,000 - INFO - Processing templates with 1:
2024-06-01 09:03:00,002 - INFO - Processing templates with 1:
2024-06-01 09:03:00,005 - INFO - Processing templates with 1:
2024-06-01 09:03:00,007 - INFO - Processing templates with 1:
2024-06-01 09:03:00,011 - INFO - Processing templates with 1:
2024-06-01 09:03:00,012 - INFO - Processing templates with 1:
2024-06-01 09:03:00,012 - INFO - Processing templates with 1:
2024-06-01 09:03:00,015 - INFO - Processing templates with 1:
2024-06-01 09:03:00,020 - INFO - Processing templates with 1:
2024-06-01 09:03:00,023 - INFO - Processing templates with 1:
2024-06-01 09:03:00,024 - INFO - Processing templates with 1:
2024-06-01 09:03:00,025 - INFO - Processing templates with 1:
2024-06-

[0]
[0, 1, 2, 0, 3, 3, 0, 3, 2, 3]
[0, 1, 1, 1, 1]
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 2, 3, 0, 3, 4, 4, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0]
[0, 1, 2, 2, 2, 2]
[0, 1, 2, 1, 0, 3, 4]
[0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0]
[0]
[0]
[0, 1, 2, 1, 1, 3, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0]
[0, 0, 0]
[0, 0, 0]
[0, 0]
[0, 1, 2, 1, 2, 2, 1, 3, 3, 1, 3, 4, 4, 0]
[0, 1]
[0, 0, 1, 0]
[0]
[0]
[0, 1, 0, 2, 2, 3, 4, 2]
[0, 1, 2]
[0]
[0, 1, 2]
[0]
[0, 0]
[0, 1]
[0]
[0, 0, 1, 2, 1, 1]
[0]
[0, 0, 0, 0, 0]
[0]
[0, 1, 2, 2]
[0, 0, 0, 0, 0]
[0, 0, 0]
[0]
[0]
[0, 0, 1]
[0]
[0, 1, 1, 2]
[0, 1]
[0, 0]
[0, 0, 0]
[0, 0, 0, 0, 0]
[0, 1, 0, 2, 3, 0, 2]
[0, 1]
[0, 0, 0, 0, 0, 0, 0]
[0]
[0]
[0, 1]
[0, 1]
[0]
[0]
[0]
[0]
[0, 0]
[0]
[0, 1]
[0]
[0, 0]
[0]
[0, 1, 2]
[0]
[0]
[0, 1]
[0]
[0]
[0]
[0]
[0]
[0, 0]
[0, 1]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]


In [32]:
len(templates_1)

278

In [10]:
len(templates_1)

278

In [11]:
cluster_indices_2, templates_2 = process_graph_clusters(its_graphs, cluster_indices_1, node_label_names, radius=2)

2024-06-01 08:18:32,039 - INFO - Processing templates with 2:
2024-06-01 08:18:32,052 - INFO - Processing templates with 2:


2024-06-01 08:18:32,053 - INFO - Processing templates with 2:
2024-06-01 08:18:32,124 - INFO - Processing templates with 2:
2024-06-01 08:18:32,137 - INFO - Processing templates with 2:
2024-06-01 08:18:32,143 - INFO - Processing templates with 2:
2024-06-01 08:18:32,143 - INFO - Processing templates with 2:
2024-06-01 08:18:32,144 - INFO - Processing templates with 2:
2024-06-01 08:18:32,153 - INFO - Processing templates with 2:
2024-06-01 08:18:32,157 - INFO - Processing templates with 2:
2024-06-01 08:18:32,162 - INFO - Processing templates with 2:
2024-06-01 08:18:32,164 - INFO - Processing templates with 2:
2024-06-01 08:18:32,186 - INFO - Processing templates with 2:
2024-06-01 08:18:32,189 - INFO - Processing templates with 2:
2024-06-01 08:18:32,192 - INFO - Processing templates with 2:
2024-06-01 08:18:32,193 - INFO - Processing templates with 2:
2024-06-01 08:18:32,195 - INFO - Processing templates with 2:
2024-06-01 08:18:32,197 - INFO - Processing templates with 2:
2024-06-

[21, 3, 20, 4]
{1, 2, 3, 4, 5, 6, 7, 15, 17, 18, 19, 20, 21}
{1, 2, 3, 4, 5, 6, 7, 15, 17, 18, 19, 20, 21}
[8, 20, 21, 7]
{5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 20, 21}
{5, 6, 7, 8, 9, 10, 14, 15, 16, 17, 20, 21}
[30, 29, 6, 7]
{3, 5, 6, 7, 8, 9, 27, 28, 29, 30}
{3, 5, 6, 7, 8, 9, 27, 28, 29, 30}
[3, 36, 37, 6]
{1, 2, 3, 36, 37, 6, 4, 7, 5, 8}
{1, 2, 3, 36, 37, 6, 4, 7, 5, 8}
[8, 17, 18, 9]
{6, 7, 8, 9, 10, 11, 13, 14, 17, 18}
{6, 7, 8, 9, 10, 11, 13, 14, 17, 18}
[43, 2, 3, 44]
{1, 2, 3, 4, 5, 43, 44, 29, 30}
{1, 2, 3, 4, 5, 43, 44, 29, 30}
[40, 41, 5, 6]
{3, 4, 5, 6, 7, 40, 41, 8, 24, 25}
{3, 4, 5, 6, 7, 40, 41, 8, 24, 25}
[39, 13, 38, 14]
{38, 39, 11, 12, 13, 14, 15, 16, 24, 25}
{38, 39, 11, 12, 13, 14, 15, 16, 24, 25}
[21, 20, 5, 6]
{2, 3, 4, 5, 6, 7, 8, 18, 19, 20, 21}
{2, 3, 4, 5, 6, 7, 8, 18, 19, 20, 21}
[40, 17, 18, 39]
{32, 33, 39, 40, 14, 15, 16, 17, 18, 19, 20}
{32, 33, 39, 40, 14, 15, 16, 17, 18, 19, 20}
[24, 10, 11, 23]
{8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 23, 24}
{8, 9,

2024-06-01 08:18:32,254 - INFO - Processing templates with 2:
2024-06-01 08:18:32,256 - INFO - Processing templates with 2:
2024-06-01 08:18:32,257 - INFO - Processing templates with 2:
2024-06-01 08:18:32,259 - INFO - Processing templates with 2:
2024-06-01 08:18:32,260 - INFO - Processing templates with 2:
2024-06-01 08:18:32,260 - INFO - Processing templates with 2:
2024-06-01 08:18:32,284 - INFO - Processing templates with 2:
2024-06-01 08:18:32,297 - INFO - Processing templates with 2:
2024-06-01 08:18:32,301 - INFO - Processing templates with 2:
2024-06-01 08:18:32,303 - INFO - Processing templates with 2:
2024-06-01 08:18:32,304 - INFO - Processing templates with 2:
2024-06-01 08:18:32,307 - INFO - Processing templates with 2:
2024-06-01 08:18:32,308 - INFO - Processing templates with 2:
2024-06-01 08:18:32,309 - INFO - Processing templates with 2:
2024-06-01 08:18:32,310 - INFO - Processing templates with 2:
2024-06-01 08:18:32,314 - INFO - Processing templates with 2:
2024-06-

[33, 18, 19, 34]
{33, 34, 16, 17, 18, 19, 20, 21, 22, 23}
{33, 34, 16, 17, 18, 19, 20, 21, 22, 23}
[33, 26, 25, 23]
{32, 33, 19, 21, 22, 23, 24, 25, 26, 27, 28}
{32, 33, 19, 21, 22, 23, 24, 25, 26, 27, 28}
[42, 20, 38, 39]
{17, 18, 19, 20, 21, 22, 36, 37, 38, 39, 40, 42}
{17, 18, 19, 20, 21, 22, 36, 37, 38, 39, 40, 42}
[16, 19, 27, 20]
{14, 15, 16, 17, 18, 19, 20, 21, 22, 26, 27}
{14, 15, 16, 17, 18, 19, 20, 21, 22, 26, 27}
[17, 10, 5, 9]
{2, 4, 5, 6, 7, 9, 10, 11, 12, 16, 17}
{2, 4, 5, 6, 7, 9, 10, 11, 12, 16, 17}
[32, 25, 18, 24]
{32, 16, 17, 18, 19, 20, 24, 25, 26, 27, 31}
{32, 16, 17, 18, 19, 20, 24, 25, 26, 27, 31}
[30, 31]
{32, 33, 28, 29, 30, 31}
{32, 33, 28, 29, 30, 31}
[5, 6]
{2, 3, 4, 5, 6, 7, 8, 16}
{2, 3, 4, 5, 6, 7, 8, 16}
[5, 6]
{2, 3, 4, 5, 6, 7, 8, 17}
{2, 3, 4, 5, 6, 7, 8, 17}
[2, 35, 34, 3]
{1, 2, 3, 35, 34, 4, 5}
{1, 2, 3, 35, 34, 4, 5}
[2, 3, 13, 14]
{1, 2, 3, 4, 5, 12, 13, 14}
{1, 2, 3, 4, 5, 12, 13, 14}
[16, 17, 2, 3]
{1, 2, 3, 4, 5, 16, 17}
{1, 2, 3, 4, 5, 16, 17

2024-06-01 08:18:32,456 - INFO - Processing templates with 2:
2024-06-01 08:18:32,456 - INFO - Processing templates with 2:
2024-06-01 08:18:32,458 - INFO - Processing templates with 2:
2024-06-01 08:18:32,458 - INFO - Processing templates with 2:
2024-06-01 08:18:32,459 - INFO - Processing templates with 2:
2024-06-01 08:18:32,459 - INFO - Processing templates with 2:
2024-06-01 08:18:32,459 - INFO - Processing templates with 2:
2024-06-01 08:18:32,460 - INFO - Processing templates with 2:
2024-06-01 08:18:32,460 - INFO - Processing templates with 2:
2024-06-01 08:18:32,461 - INFO - Processing templates with 2:
2024-06-01 08:18:32,462 - INFO - Processing templates with 2:
2024-06-01 08:18:32,463 - INFO - Processing templates with 2:
2024-06-01 08:18:32,464 - INFO - Processing templates with 2:
2024-06-01 08:18:32,464 - INFO - Processing templates with 2:
2024-06-01 08:18:32,465 - INFO - Processing templates with 2:
2024-06-01 08:18:32,466 - INFO - Processing templates with 2:
2024-06-

[19, 20, 6, 7]
{4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 25, 26}
{4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, 25, 26}
[29, 27, 60, 28]
{34, 60, 23, 25, 26, 27, 28, 29, 30, 31}
{34, 60, 23, 25, 26, 27, 28, 29, 30, 31}
[16, 17, 6, 7]
{4, 5, 6, 7, 8, 9, 13, 14, 16, 17}
{4, 5, 6, 7, 8, 9, 13, 14, 16, 17}
[41, 42, 6, 7]
{4, 5, 6, 7, 8, 41, 42, 9, 24, 25}
{4, 5, 6, 7, 8, 41, 42, 9, 24, 25}
[2, 3, 4, 27, 28, 29]
{1, 2, 3, 4, 5, 6, 8, 9, 21, 22, 25, 26, 27, 28, 29}
{1, 2, 3, 4, 5, 6, 8, 9, 21, 22, 25, 26, 27, 28, 29}
[10, 11, 14, 15, 16, 17, 18]
{2, 3, 4, 10, 11, 12, 13, 14, 15, 16, 17, 18}
{2, 3, 4, 10, 11, 12, 13, 14, 15, 16, 17, 18}
[3, 14]
{1, 2, 3, 4, 5, 13, 14}
{1, 2, 3, 4, 5, 13, 14}
[6, 7, 8, 13, 14, 16, 17, 18]
{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
{3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}
[21, 20, 5, 14]
{3, 4, 5, 6, 7, 10, 11, 14, 15, 16, 17, 19, 20, 21}
{3, 4, 5, 6, 7, 10, 11, 14, 15, 16, 17, 19, 20, 21}
[32, 33, 13

In [12]:
cluster_indices_3, templates_3 = process_graph_clusters(its_graphs, cluster_indices_2, node_label_names, radius=3)

2024-06-01 08:18:45,943 - INFO - Processing templates with 3:
2024-06-01 08:18:45,945 - INFO - Processing templates with 3:
2024-06-01 08:18:45,946 - INFO - Processing templates with 3:
2024-06-01 08:18:45,947 - INFO - Processing templates with 3:
2024-06-01 08:18:45,948 - INFO - Processing templates with 3:
2024-06-01 08:18:45,949 - INFO - Processing templates with 3:
2024-06-01 08:18:45,950 - INFO - Processing templates with 3:
2024-06-01 08:18:45,950 - INFO - Processing templates with 3:
2024-06-01 08:18:45,951 - INFO - Processing templates with 3:
2024-06-01 08:18:45,952 - INFO - Processing templates with 3:
2024-06-01 08:18:45,953 - INFO - Processing templates with 3:
2024-06-01 08:18:45,958 - INFO - Processing templates with 3:
2024-06-01 08:18:45,959 - INFO - Processing templates with 3:
2024-06-01 08:18:45,959 - INFO - Processing templates with 3:
2024-06-01 08:18:45,960 - INFO - Processing templates with 3:
2024-06-01 08:18:45,961 - INFO - Processing templates with 3:
2024-06-

[21, 3, 20, 4]
{1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 17, 18, 19, 20, 21}
{1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 17, 18, 19, 20, 21}
[2, 3, 29, 30]
{1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}
{1, 2, 3, 4, 5, 6, 8, 9, 10, 12, 13, 14, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34}
[11, 10, 43, 42]
{7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 21, 39, 40, 41, 42, 43}
{7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 21, 39, 40, 41, 42, 43}
[8, 24, 9, 23]
{3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 22, 23, 24}
{3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 22, 23, 24}
[40, 18, 19, 39]
{15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 39, 40}
{15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 39, 40}
[1, 2, 14]
{1, 2, 3, 4, 5, 12, 13, 14}
{1, 2, 3, 4, 5, 12, 13, 14}
[12, 6, 7]
{3, 4, 5, 6, 7, 11, 12}
{3, 4, 5, 6, 7, 11, 12}
[1, 2, 4, 5, 11]
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
[16, 2, 3, 14]
{1, 2, 3, 4, 5, 6, 7, 9, 11, 12, 13, 14, 15, 16}
{1

2024-06-01 08:18:46,147 - INFO - Processing templates with 3:
2024-06-01 08:18:46,148 - INFO - Processing templates with 3:
2024-06-01 08:18:46,149 - INFO - Processing templates with 3:
2024-06-01 08:18:46,149 - INFO - Processing templates with 3:
2024-06-01 08:18:46,150 - INFO - Processing templates with 3:
2024-06-01 08:18:46,150 - INFO - Processing templates with 3:
2024-06-01 08:18:46,151 - INFO - Processing templates with 3:
2024-06-01 08:18:46,152 - INFO - Processing templates with 3:
2024-06-01 08:18:46,153 - INFO - Processing templates with 3:
2024-06-01 08:18:46,153 - INFO - Processing templates with 3:
2024-06-01 08:18:46,154 - INFO - Processing templates with 3:
2024-06-01 08:18:46,155 - INFO - Processing templates with 3:
2024-06-01 08:18:46,155 - INFO - Processing templates with 3:
2024-06-01 08:18:46,156 - INFO - Processing templates with 3:
2024-06-01 08:18:46,157 - INFO - Processing templates with 3:
2024-06-01 08:18:46,158 - INFO - Processing templates with 3:
2024-06-

[17, 18, 27, 1]
{1, 2, 3, 4, 6, 14, 15, 16, 17, 18, 19, 20, 21, 27}
{1, 2, 3, 4, 6, 14, 15, 16, 17, 18, 19, 20, 21, 27}
[48, 17, 45, 47]
{13, 14, 15, 16, 17, 18, 19, 40, 41, 42, 43, 44, 45, 46, 47, 48}
{13, 14, 15, 16, 17, 18, 19, 40, 41, 42, 43, 44, 45, 46, 47, 48}
[26, 29, 28, 45]
{10, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 45}
{10, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 45}
[10, 42, 45, 44]
{6, 7, 8, 9, 10, 11, 12, 37, 38, 39, 40, 41, 42, 43, 44, 45}
{6, 7, 8, 9, 10, 11, 12, 37, 38, 39, 40, 41, 42, 43, 44, 45}
[17, 27, 29, 30]
{13, 14, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 29, 30}
{13, 14, 15, 16, 17, 18, 22, 23, 24, 25, 26, 27, 28, 29, 30}
[16, 10, 11, 15]
{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
[16, 27, 30, 15]
{12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 27, 28, 29, 30}
{12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 27, 28, 29, 30}
[17, 42, 14, 15]
{8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19

2024-06-01 08:18:46,349 - INFO - Processing templates with 3:
2024-06-01 08:18:46,350 - INFO - Processing templates with 3:
2024-06-01 08:18:46,350 - INFO - Processing templates with 3:
2024-06-01 08:18:46,388 - INFO - Processing templates with 3:
2024-06-01 08:18:46,388 - INFO - Processing templates with 3:
2024-06-01 08:18:46,389 - INFO - Processing templates with 3:
2024-06-01 08:18:46,390 - INFO - Processing templates with 3:
2024-06-01 08:18:46,390 - INFO - Processing templates with 3:
2024-06-01 08:18:46,391 - INFO - Processing templates with 3:
2024-06-01 08:18:46,392 - INFO - Processing templates with 3:
2024-06-01 08:18:46,392 - INFO - Processing templates with 3:
2024-06-01 08:18:46,393 - INFO - Processing templates with 3:
2024-06-01 08:18:46,393 - INFO - Processing templates with 3:
2024-06-01 08:18:46,393 - INFO - Processing templates with 3:
2024-06-01 08:18:46,394 - INFO - Processing templates with 3:
2024-06-01 08:18:46,395 - INFO - Processing templates with 3:
2024-06-

[1, 2, 20, 21, 22]
{1, 2, 3, 4, 5, 6, 16, 17, 18, 19, 20, 21, 22}
{1, 2, 3, 4, 5, 6, 16, 17, 18, 19, 20, 21, 22}
[4, 37, 36, 5]
{1, 2, 3, 4, 5, 6, 7, 8, 9, 26, 27, 28, 29, 36, 37}
{1, 2, 3, 4, 5, 6, 7, 8, 9, 26, 27, 28, 29, 36, 37}
[9, 10, 22, 23]
{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
{6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}
[13, 12, 37, 38]
{3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45}
{3, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 34, 35, 36, 37, 38, 39, 40, 41, 43, 44, 45}
[8, 17, 16, 7]
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}
[40, 9, 10, 37]
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 37, 38, 39, 40}
{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 37, 38, 39, 40}
[24, 25, 35, 36]
{21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36}
{21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 35, 36}
[23, 13, 14, 22

In [244]:
len(templates_1)

228

In [80]:
graph_dict, index_dict = split_graphs_by_class_and_indices(its_graphs, cluster_indices_1)

In [81]:
index_dict[76]

[26,
 56,
 142,
 194,
 206,
 247,
 250,
 409,
 448,
 521,
 526,
 528,
 568,
 615,
 676,
 693,
 696,
 734,
 741,
 742,
 757,
 782,
 849,
 856,
 877,
 881,
 903]

In [82]:
bug = [value for key, value in enumerate(its_graphs) if key in index_dict[76]]

In [84]:
len(bug)

27

In [78]:
cluster_indices_2, templates_2 = process_graph_clusters(its_graphs, cluster_indices_1, node_label_names, radius=2)

2024-05-31 21:02:48,257 - INFO - Processing templates with 2:


0 34


2024-05-31 21:02:50,073 - INFO - Processing templates with 2:
2024-05-31 21:02:50,074 - INFO - Processing templates with 2:


7 1
11 83


2024-05-31 21:02:50,595 - INFO - Processing templates with 2:
2024-05-31 21:02:50,746 - INFO - Processing templates with 2:
2024-05-31 21:02:50,755 - INFO - Processing templates with 2:
2024-05-31 21:02:50,756 - INFO - Processing templates with 2:
2024-05-31 21:02:50,756 - INFO - Processing templates with 2:
2024-05-31 21:02:50,778 - INFO - Processing templates with 2:
2024-05-31 21:02:50,782 - INFO - Processing templates with 2:


29 19
32 21
34 1
36 2
38 24
53 12
55 12


2024-05-31 21:02:50,851 - INFO - Processing templates with 2:


12 47


2024-05-31 21:02:52,595 - INFO - Processing templates with 2:
2024-05-31 21:02:52,603 - INFO - Processing templates with 2:
2024-05-31 21:02:52,626 - INFO - Processing templates with 2:
2024-05-31 21:02:52,627 - INFO - Processing templates with 2:
2024-05-31 21:02:52,655 - INFO - Processing templates with 2:


39 7
62 9
66 1
68 24
30 9


2024-05-31 21:02:53,292 - INFO - Processing templates with 2:
2024-05-31 21:02:53,296 - INFO - Processing templates with 2:
2024-05-31 21:02:53,297 - INFO - Processing templates with 2:
2024-05-31 21:02:53,478 - INFO - Processing templates with 2:
2024-05-31 21:02:53,490 - INFO - Processing templates with 2:


73 5
31 3
40 22
1 5
74 7
76 27


2024-05-31 21:02:53,495 - INFO - Processing templates with 2:


KeyboardInterrupt: 

In [75]:
len(cluster_indices_1)

1000

In [71]:
len(its_graphs)

1000

In [72]:
len(cluster_indices_1)

228

In [60]:

graph_dict, index_dict = split_graphs_by_class_and_indices(its_graphs[:1000], cluster_indices_0)
templates = []
cluster_indice_all = ['a' for i in range(len(its_graphs[:1000]))]
max_index_template = 0
for key, value in graph_dict.items():
    
    cluster_indices_batch, new_templates = hcl.process_level(value,2,nodeLabelNames=node_label_names,
                            nodeLabelDefault=["*", 0],
                            edgeAttribute="order", templates=None, update_template=True)
    cluster_indices_batch = [i + max_index_template for i in cluster_indices_batch]
    new_templates = [{'Cluster_id': value['Cluster_id']+max_index_template, 'RC': value['RC'], 'Parent': key} for value in new_templates]
    max_index_template += len(new_templates)
    templates.extend(new_templates)
    key_index = index_dict[key]

    for i, j in enumerate(key_index):
        cluster_indice_all[j] = cluster_indices_batch[i]
    

2024-05-31 20:48:36,936 - INFO - Processing templates with 2:
2024-05-31 20:48:47,017 - INFO - Processing templates with 2:


KeyboardInterrupt: 

In [63]:
cluster_indices_all, templates = process_graph_clusters(its_graphs, cluster_indices_0, node_label_names, radius=1)

2024-05-31 20:54:17,912 - INFO - Processing templates with 1:
2024-05-31 20:54:17,939 - INFO - Processing templates with 1:
2024-05-31 20:54:17,953 - INFO - Processing templates with 1:
2024-05-31 20:54:18,013 - INFO - Processing templates with 1:
2024-05-31 20:54:18,023 - INFO - Processing templates with 1:
2024-05-31 20:54:18,027 - INFO - Processing templates with 1:
2024-05-31 20:54:18,028 - INFO - Processing templates with 1:
2024-05-31 20:54:18,030 - INFO - Processing templates with 1:
2024-05-31 20:54:18,085 - INFO - Processing templates with 1:
2024-05-31 20:54:18,088 - INFO - Processing templates with 1:
2024-05-31 20:54:18,104 - INFO - Processing templates with 1:
2024-05-31 20:54:18,109 - INFO - Processing templates with 1:
2024-05-31 20:54:18,110 - INFO - Processing templates with 1:
2024-05-31 20:54:18,121 - INFO - Processing templates with 1:
2024-05-31 20:54:18,123 - INFO - Processing templates with 1:
2024-05-31 20:54:18,129 - INFO - Processing templates with 1:
2024-05-

In [66]:
templates[-1]

{'Cluster_id': 227,
 'RC': <networkx.classes.graph.Graph at 0x14f254d90>,
 'Parent': 104}

In [59]:
pd.DataFrame(cluster_indice_all)

Unnamed: 0,0
0,0
1,7
2,11
3,29
4,32
...,...
995,11
996,11
997,182
998,124


In [49]:
templates[-1]

{'Cluster_id': 0,
 'RC': <networkx.classes.graph.Graph at 0x14fbd6a10>,
 'Parent': 104}

In [31]:
cluster_indices_batch

[0]

In [28]:
len(templates)

67

In [24]:
templates

[{'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph at 0x126906b50>},
 {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph at 0x14f9f7190>},
 {'Cluster_id': 2, 'RC': <networkx.classes.graph.Graph at 0x14fb26090>},
 {'Cluster_id': 3, 'RC': <networkx.classes.graph.Graph at 0x14fa576d0>},
 {'Cluster_id': 4, 'RC': <networkx.classes.graph.Graph at 0x14ff74910>},
 {'Cluster_id': 5, 'RC': <networkx.classes.graph.Graph at 0x14fe43990>},
 {'Cluster_id': 6, 'RC': <networkx.classes.graph.Graph at 0x125ccc090>},
 {'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph at 0x14fcfda50>},
 {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph at 0x14ff94850>},
 {'Cluster_id': 2, 'RC': <networkx.classes.graph.Graph at 0x137119890>},
 {'Cluster_id': 3, 'RC': <networkx.classes.graph.Graph at 0x14ffa7f10>},
 {'Cluster_id': 0, 'RC': <networkx.classes.graph.Graph at 0x14ffcbfd0>},
 {'Cluster_id': 1, 'RC': <networkx.classes.graph.Graph at 0x14ffd4050>},
 {'Cluster_id': 2, 'RC': <networkx.classes.graph.Gr

In [132]:
pd.concat([pd.DataFrame(cluster_indices_0), pd.DataFrame(cluster_indice_all)], axis=1)

Unnamed: 0,0,0.1
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
995,2,0
996,2,0
997,66,1
998,30,1


In [127]:
cluster_indice_all

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 2,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 3,
 4,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 2,
 2,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 5,
 1,
 0,
 6,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 2,
 7,
 0,
 0,
 0,
 2,
 3,
 0,
 1,
 2,
 1,
 0,
 3,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 1,
 3,
 0,
 3,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 2,
 0,
 1,
 1,
 4,
 0,
 1,
 0,
 0,
 0,
 6,
 2,
 0,
 0,
 0,
 5,
 0,
 8,
 3,
 2,
 3,
 0,
 0,
 3,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 1,
 3,
 0,
 0,
 0,
 0,
 0,
 9,
 0,
 0,
 4,
 0,
 0,
 2,
 0,
 0,
 9,
 0,
 6,
 0,
 0,
 2,
 7,
 3,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 1,
 1,
 0,
 0,
 2,
 3,
 1,
 8,
 0,
 0,
 1,
 1,
 1,
 2,
 3,
 0,
 0,
 0,
 3,
 0,
 0,
 9,
 1,
 1,
 2,
 0,
 1,
 3,
 2,
 0,
 10,
 1,
 0,
 0,
 0,
 2,
 4,
 5,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 8,
 0,
 7,
 0,
 0,
 3,
 2,
 1,
 0,
 2,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 10,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 11,
 6,
 1,
 1,
 0,
 0,
 1,
 

In [123]:
templates

[]

In [114]:
len(templates)

67

In [71]:
cluster_indices_1[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 0, 8]

In [73]:
cluster_indice_all[:10]

[0, 6, 0, 2, 9, 10, 6, 0, 0, 11]

In [76]:
pd.DataFrame(cluster_indices_1).value_counts()

0  
2      83
10     47
0      34
40     34
21     27
       ..
126     1
125     1
13      1
122     1
227     1
Name: count, Length: 228, dtype: int64

In [75]:
pd.DataFrame(cluster_indice_all).value_counts()

0 
0     375
2     201
4      80
7      52
23     37
13     33
6      31
9      30
17     26
15     18
11     13
19     13
8      12
10     11
18     10
20      8
28      7
29      7
1       5
3       5
24      5
25      5
12      4
26      4
5       2
27      2
16      1
14      1
21      1
22      1
Name: count, dtype: int64

In [4]:
#from SynTemp.SynRule.hierarchical_clustering import HierarchicalClustering
node_label_names = ["element", "charge"]
hcl = HierarchicalClustering(node_label_names=node_label_names,
                            node_label_default=["*", 0],
                            edge_attribute="order",
                            max_radius=1)

reaction_dicts, templates = hcl.fit(data[:])

2024-05-31 15:06:54,193 - INFO - Processing without templates
2024-05-31 15:06:54,194 - INFO - Processing 100 data to get templates
2024-05-31 15:06:54,195 - INFO - Processing templates with 0:
2024-05-31 15:06:54,265 - INFO - Processing templates with 1:
2024-05-31 15:06:54,412 - INFO - Processing other data with new templates
2024-05-31 15:06:54,413 - INFO - Processing templates with 0:


Processing templates with 0:
Processing templates with 1:
Processing templates with 0:


2024-05-31 15:07:19,944 - INFO - Processing templates with 1:


Processing templates with 1:


In [9]:
node_label_names = ["element", "charge"]
hcl = HierarchicalClustering(node_label_names=node_label_names,
                            node_label_default=["*", 0],
                            edge_attribute="order",
                            max_radius=1)

reaction_dicts, templates = hcl.fit(data[:], root_sample=1000)

2024-05-31 15:16:17,696 - INFO - Processing without templates
2024-05-31 15:16:17,698 - INFO - Processing 1000 data to get templates
2024-05-31 15:16:17,698 - INFO - Processing templates with 0:
2024-05-31 15:16:18,496 - INFO - Processing templates with 1:
2024-05-31 15:16:20,842 - INFO - Processing other data with new templates
2024-05-31 15:16:20,843 - INFO - Processing templates with 0:
2024-05-31 15:16:31,918 - INFO - Processing templates with 1:


In [7]:
pd.DataFrame(reaction_dicts)

Unnamed: 0,R-id,ITSGraph,GraphRules,Cluster_R0,Cluster_R1,Reaction Type,Rings
0,48358,"((1, 2, 3, 20, 18, 17, 19, 4, 5, 6, 7, 8, 9, 1...","((4, 3, 20, 21), (4, 3, 20, 21), (4, 3, 20, 21))",0,0,Single Cyclic,[4]
1,27953,"((1, 2, 29, 28, 26, 25, 27, 32, 33, 34, 31, 24...","((2, 3, 29, 30), (2, 3, 29, 30), (2, 3, 29, 30))",1,1,Single Cyclic,[4]
2,46325,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 40, 41, 24, 2...","((43, 10, 11, 42), (43, 10, 11, 42), (43, 10, ...",2,2,Single Cyclic,[4]
3,39695,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((40, 18, 19, 39), (40, 18, 19, 39), (40, 18, ...",3,3,Single Cyclic,[4]
4,49705,"((13, 14, 2, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1...","((1, 2, 14), (1, 2, 14), (1, 2, 14))",4,4,Acyclic,[]
...,...,...,...,...,...,...,...
34580,500,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((24, 29, 30, 23), (24, 29, 30, 23), (24, 29, ...",0,0,Single Cyclic,[4]
34581,25189,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 21, 22, 23, 1...","((24, 10, 11, 23), (24, 10, 11, 23), (24, 10, ...",0,16,Single Cyclic,[4]
34582,36852,"((1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...","((26, 43, 42, 28), (26, 43, 42, 28), (26, 43, ...",0,0,Single Cyclic,[4]
34583,21990,"((17, 16, 18, 19, 15, 13, 14, 12, 11, 9, 10, 3...","((8, 9, 37, 38), (8, 9, 37, 38), (8, 9, 37, 38))",0,0,Single Cyclic,[4]
