In [378]:
import os
import json


In [379]:
from typing import Dict, Any, List, Optional

class TreeNode:
    """
    Represents a single node in the hierarchical tree structure.

    A node carries plain data attributes (id, label, category, primaryGroup,
    description) and an ordered list of child ``TreeNode`` objects.
    """
    def __init__(
        self,
        id: str,
        label: str,
        category: str,
        primaryGroup: str,
        description: str,
        children: Optional[List['TreeNode']] = None,
    ):
        """
        Initializes a new tree node with data attributes.

        Args:
            id (str): The unique identifier for the node.
            label (str): The human-readable label or name.
            category (str): The category of the node.
            primaryGroup (str): The primary group classification.
            description (str): Free-text description of the node.
            children (Optional[List['TreeNode']]): A list of child TreeNode objects.
                When omitted, the node starts with its own empty list.
        """
        self.id = id
        self.label = label
        self.category = category
        self.primaryGroup = primaryGroup
        self.description = description
        # A fresh list per instance avoids sharing one default list between nodes.
        self.children: List['TreeNode'] = children if children is not None else []

    def __repr__(self):
        """
        Provides a string representation for debugging.
        """
        return f"TreeNode(id='{self.id}', label='{self.label}', children={len(self.children)})"

    def get_child(self, label):
        """
        Returns the first direct child whose label equals ``label``.

        Args:
            label (str): The label to look for among the direct children.

        Returns:
            Optional[TreeNode]: The matching child, or None when no direct
            child carries that label.
        """
        for child in self.children:
            if child.label == label:
                return child
        return None

    def add_children(self, labels, descriptions):
        """
        Appends one new leaf child per label.

        Each child takes this node's label as its category and this node's
        category as its primaryGroup. Its id is derived from this node's id
        and the child's position (``"<parent id>_<index>"``), so siblings
        never share an id.

        Args:
            labels: Labels for the new children, in order.
            descriptions: Descriptions, indexed in parallel with ``labels``.
                Entries beyond ``len(labels)`` are ignored.

        Raises:
            IndexError: If ``descriptions`` is shorter than ``labels``. The
                check runs before any child is added, so the node is left
                unchanged.
        """
        labels = list(labels)
        if len(descriptions) < len(labels):
            raise IndexError(
                f"add_children got {len(labels)} labels but only "
                f"{len(descriptions)} descriptions"
            )
        for i, label in enumerate(labels):
            child = TreeNode(
                # Position among all siblings, including pre-existing ones.
                id=f"{self.id}_{len(self.children)}",
                label=label,
                category=self.label,
                primaryGroup=self.category,
                description=descriptions[i],
            )
            self.children.append(child)

    def print_tree(self, level=0):
        """
        Recursively prints the structure of the tree.

        Args:
            level (int): Current depth; each level indents by two spaces.
        """
        indent = "  " * level
        print(f"{indent}- {self.label} (ID: {self.id})")
        for child in self.children:
            child.print_tree(level + 1)

In [380]:
def build_dict(tree: 'TreeNode') -> dict:
    """
    Recursively builds the new dictionary from the tree.

    Args:
        tree (TreeNode): The root of the (sub)tree to serialise.

    Returns:
        dict: A mapping with the node's ``id``, ``label``, ``category`` and
        ``primaryGroup``; a ``description`` entry when the node has a
        non-empty description; and, when the node has children, a
        ``children`` mapping of child id -> child dictionary.
    """
    dictionary = {
        field: getattr(tree, field)
        for field in ("id", "label", "category", "primaryGroup")
    }
    # build_tree reads 'description', so emit it here too; otherwise
    # descriptions are lost on a save/load round trip.
    description = getattr(tree, "description", "")
    if description:
        dictionary["description"] = description
    if tree.children:
        # Keyed by child id: siblings that share an id overwrite each other.
        dictionary["children"] = {
            child.id: build_dict(child) for child in tree.children
        }
    return dictionary
    

In [381]:
def build_tree(data: Dict[str, Any]) -> TreeNode:
    """
    Converts one raw node dictionary, together with everything nested
    beneath it, into a TreeNode hierarchy.

    Args:
        data (Dict[str, Any]): The raw dictionary data for a single node.
            Nested nodes are read from the values of its 'children' mapping.

    Returns:
        TreeNode: The node built from ``data``, with its children attached.
    """
    # Attribute name -> value used when the key is absent from the raw data.
    fallbacks = (
        ('id', ''),
        ('label', ''),
        ('category', ''),
        ('primaryGroup', 'cancer-type'),
        ('description', ''),
    )
    attributes = {key: data.get(key, default) for key, default in fallbacks}
    node = TreeNode(**attributes)

    nested = data.get('children', {})
    # Only a non-empty dict of children is descended into.
    if not (nested and isinstance(nested, dict)):
        return node

    node.children.extend(build_tree(raw_child) for raw_child in nested.values())
    return node

In [382]:
# Relative locations used by the cells below: the dummy-data directory
# (checked for existence next) and the utils directory that holds
# longer_filter_data.js.
dummy_path = "../src/utils/dummy_data"
utils_path = "../src/utils"

In [383]:
# Fail early, with a readable message, if the notebook is run from a
# directory where the relative dummy-data path does not resolve.
assert os.path.isdir(dummy_path), f"dummy data directory not found: {dummy_path!r}"

In [419]:
# Load the filter definitions from the JS module. The slicing drops the first
# and last lines of the file, joins what is left, and strips the final
# character — presumably the `const ... =` header, the `export` footer and the
# trailing `;` (this matches what the writing cell further down produces).
# NOTE(review): eval() executes whatever the file contains; if the payload is
# always a plain literal, ast.literal_eval or json.loads would be safer — confirm.
with open(os.path.join(utils_path, "longer_filter_data.js")) as f:
    filters = eval("".join(f.read().split("\n")[1:-1])[:-1])

In [420]:
# Build TreeNode hierarchies for the "0_0" and "0_1" top-level filter entries.
cancer_tree = build_tree(filters["0_0"])
access_tree = build_tree(filters["0_1"])

In [421]:
data_tree = build_tree(filters["0_2"])

In [423]:
def pop_label(node, label):
    """
    Removes the first direct child of ``node`` whose label equals ``label``.

    Args:
        node: A node exposing a ``children`` list of labelled nodes.
        label: The label of the child to remove.

    Returns:
        The same ``node``, whether or not a child was removed.
    """
    for position, child in enumerate(node.children):
        if child.label == label:
            del node.children[position]
            break
    return node

In [426]:
data_tree.print_tree()

- dataType (ID: 0_2)
  - Biobank Samples (ID: 0_2_0)
    - Material type (ID: 0_2_0_0)
      - Bloods  (ID: 0_2_0_0_0)
      - Cells - eg cell lines (ID: 0_2_0_0_1)
      - Genetic material (ID: 0_2_0_0_2)
      - Other Fluids - eg urine (ID: 0_2_0_0_3)
      - Organoids (ID: 0_2_0_0_4)
      - Tissues - eg Bone marrow aspirate (ID: 0_2_0_0_5)
      - Other - eg swab (ID: 0_2_0_0_6)
    - State (ID: 0_2_0_1)
      - Malignant (ID: 0_2_0_1_0)
      - Normal (ID: 0_2_0_1_1)
      - Pre-cancerous (ID: 0_2_0_1_2)
  - In Vitro Study (ID: 0_2_1)
    - Model (ID: 0_2_1_0)
      - Organ on a Chip (ID: 0_2_1_0_0)
      - 3D organoid (including on a chip) (ID: 0_2_1_0_1)
      - Organ slice (ID: 0_2_1_0_2)
    - Cell Source (ID: 0_2_1_1)
      - Immortalised cell-line (ID: 0_2_1_1_0)
      - Patient derived (ID: 0_2_1_1_1)
      - Model Organism derived (ID: 0_2_1_1_2)
    - Treatment (ID: 0_2_1_2)
      - Cell and cell-derived treatment (ID: 0_2_1_2_0)
      - Gene knock-down (ID: 0_2_1_2_1)
  

In [433]:
model = data_tree.get_child("Model Organism")

In [429]:
# Grab the "Patient study" branch and the two sub-branches that are moved
# under "Techniques" in the following cells.
# get_child returns None for an unknown label, so a misspelt label shows up
# as an AttributeError on the next lookup.
patient = data_tree.get_child("Patient study")
multi = patient.get_child("Multi-omic Data")
image = patient.get_child("Imaging Data")

In [431]:
# Unpack the children of "Techniques" into short names. The unpacking raises
# ValueError unless there are exactly five children, so re-running this cell
# after the next one (which makes the list seven long) fails.
# NOTE(review): the short names presumably abbreviate the child labels — not
# visible in this notebook's output; confirm.
techniques = data_tree.get_child("Techniques")
co,cr,sp, me, na = techniques.children 

In [432]:
# Rebuild the child list with the Imaging and Multi-omic nodes inserted.
# `multi` ends up at index 4 (i.e. -3), which a later cell relies on.
# NOTE(review): the order is presumably alphabetical by label — confirm.
techniques.children = [co, cr,image, me, multi, na, sp]

In [434]:
# Detach the relocated branches from their former parents. pop_label mutates
# the node in place, so its return value does not need to be captured.
for node in (patient, model):
    for label in ("Multi-omic Data", "Imaging Data"):
        pop_label(node, label)

In [437]:
# Re-tag the moved nodes: both category and primaryGroup are set to the
# label of their new parent, "Techniques".
for node in [multi, image]:
    node.category = techniques.label
    node.primaryGroup = techniques.label

In [438]:
# Renumber every id beneath data_tree, starting from the root's current id.
# NOTE(review): change_id is defined in a later cell of this notebook, so this
# cell raises NameError on a fresh top-to-bottom run; move the definition up.
data_tree = change_id(data_tree, data_tree.id)

In [439]:
data_tree.print_tree()

- dataType (ID: 0_2)
  - Biobank Samples (ID: 0_2_0)
    - Material type (ID: 0_2_0_0)
      - Bloods  (ID: 0_2_0_0_0)
      - Cells - eg cell lines (ID: 0_2_0_0_1)
      - Genetic material (ID: 0_2_0_0_2)
      - Other Fluids - eg urine (ID: 0_2_0_0_3)
      - Organoids (ID: 0_2_0_0_4)
      - Tissues - eg Bone marrow aspirate (ID: 0_2_0_0_5)
      - Other - eg swab (ID: 0_2_0_0_6)
    - State (ID: 0_2_0_1)
      - Malignant (ID: 0_2_0_1_0)
      - Normal (ID: 0_2_0_1_1)
      - Pre-cancerous (ID: 0_2_0_1_2)
  - In Vitro Study (ID: 0_2_1)
    - Model (ID: 0_2_1_0)
      - Organ on a Chip (ID: 0_2_1_0_0)
      - 3D organoid (including on a chip) (ID: 0_2_1_0_1)
      - Organ slice (ID: 0_2_1_0_2)
    - Cell Source (ID: 0_2_1_1)
      - Immortalised cell-line (ID: 0_2_1_1_0)
      - Patient derived (ID: 0_2_1_1_1)
      - Model Organism derived (ID: 0_2_1_1_2)
    - Treatment (ID: 0_2_1_2)
      - Cell and cell-derived treatment (ID: 0_2_1_2_0)
      - Gene knock-down (ID: 0_2_1_2_1)
  

In [440]:
multi.id

'0_2_4_4'

In [441]:
omics_node = TreeNode(id=multi.id, 
                      label=multi.label, 
                      category=multi.category, 
                      primaryGroup=multi.primaryGroup, 
                      description="Omic techniques such as genomics / proteomics" )

In [442]:
layers = ["Biological molecules (eg DNA)", "Source", "Spatial resolution"]
descriptions = ["genomics/proteomics etc", "eg. control/tumour", "single cell/spatial etc"]
omics_node.add_children(layers, descriptions)

In [351]:
omics_node.children

[TreeNode(id='0_2_3_4_0', label='Biological molecules (eg DNA)', children=0),
 TreeNode(id='0_2_3_4_1', label='Source', children=0),
 TreeNode(id='0_2_3_4_2', label='Spatial resolution', children=0)]

In [443]:
# Populate the three layers of omics_node. Each add_children call pairs
# labels[i] with descriptions[i], so every label list must stay aligned with
# the description list that follows it.
sources = ["Control", "Liquid Biopsy", "Other", "Tumour"]
descriptions = ["healthy tissue", "circulating tumour cells (CTCs), exosomes etc.", "e.g. environmental", "primary and secondary tumours"]

omics_node.get_child("Source").add_children(sources,descriptions)

# The labels are sorted before use, so the descriptions below are written in
# the *sorted* label order (Epigenomics, Genomics, Metabolomics, Metagenomics,
# Proteomics, Transcriptomics), not in the order the labels are typed here.
bios = sorted(["Proteomics", "Transcriptomics", "Epigenomics", "Metabolomics", "Metagenomics", "Genomics"])

descriptions = ["chemical modifications to DNA/histones",
                "exomes/genomes",                 
                "small molecules (sugars, amino acids) produced by metabolism",
                "genetic material from multiple organisms (usually microbes) living together",
                "functional molecules that carry out cellular processes",
                "RNA transcripts of DNA",
                ]

omics_node.get_child("Biological molecules (eg DNA)").add_children(bios,descriptions)

# Note: `layers` and `descriptions` are rebound here, replacing earlier values.
layers = ["bulk", "single cell", "spatial"]
descriptions = ["tumour average - may include non-tumour cells", "specific cells", "includes positional information"]

omics_node.get_child('Spatial resolution').add_children(layers, descriptions)


In [444]:
omics_node = change_id(omics_node, omics_node.id)

In [445]:
omics_node.print_tree()

- Multi-omic Data (ID: 0_2_4_4)
  - Biological molecules (eg DNA) (ID: 0_2_4_4_0)
    - Epigenomics (ID: 0_2_4_4_0_0)
    - Genomics (ID: 0_2_4_4_0_1)
    - Metabolomics (ID: 0_2_4_4_0_2)
    - Metagenomics (ID: 0_2_4_4_0_3)
    - Proteomics (ID: 0_2_4_4_0_4)
    - Transcriptomics (ID: 0_2_4_4_0_5)
  - Source (ID: 0_2_4_4_1)
    - Control (ID: 0_2_4_4_1_0)
    - Liquid Biopsy (ID: 0_2_4_4_1_1)
    - Other (ID: 0_2_4_4_1_2)
    - Tumour (ID: 0_2_4_4_1_3)
  - Spatial resolution (ID: 0_2_4_4_2)
    - bulk (ID: 0_2_4_4_2_0)
    - single cell (ID: 0_2_4_4_2_1)
    - spatial (ID: 0_2_4_4_2_2)


In [448]:
# Swap the original "Multi-omic Data" node for the rebuilt omics_node.
# Index -3 is the slot `multi` occupies in the seven-element child list
# assigned to techniques.children earlier; it is positional, so it breaks
# if that ordering changes.
data_tree.get_child('Techniques').children[-3] = omics_node

In [472]:
data_tree = change_id(data_tree, data_tree.id)

In [452]:
data_tree.print_tree()

- dataType (ID: 0_2)
  - Biobank Samples (ID: 0_2_0)
    - Material type (ID: 0_2_0_0)
      - Bloods  (ID: 0_2_0_0_0)
      - Cells - eg cell lines (ID: 0_2_0_0_1)
      - Genetic material (ID: 0_2_0_0_2)
      - Other Fluids - eg urine (ID: 0_2_0_0_3)
      - Organoids (ID: 0_2_0_0_4)
      - Tissues - eg Bone marrow aspirate (ID: 0_2_0_0_5)
      - Other - eg swab (ID: 0_2_0_0_6)
    - State (ID: 0_2_0_1)
      - Malignant (ID: 0_2_0_1_0)
      - Normal (ID: 0_2_0_1_1)
      - Pre-cancerous (ID: 0_2_0_1_2)
  - In Vitro Study (ID: 0_2_1)
    - Model (ID: 0_2_1_0)
      - Organ on a Chip (ID: 0_2_1_0_0)
      - 3D organoid (including on a chip) (ID: 0_2_1_0_1)
      - Organ slice (ID: 0_2_1_0_2)
    - Cell Source (ID: 0_2_1_1)
      - Immortalised cell-line (ID: 0_2_1_1_0)
      - Patient derived (ID: 0_2_1_1_1)
      - Model Organism derived (ID: 0_2_1_1_2)
    - Treatment (ID: 0_2_1_2)
      - Cell and cell-derived treatment (ID: 0_2_1_2_0)
      - Gene knock-down (ID: 0_2_1_2_1)
  

In [None]:
# Walk Techniques > Multi-omic Data > Biological molecules > Genomics.
# get_child returns None for an unknown label, so a misspelt label shows up
# as an AttributeError on the following lookup.
omics = data_tree.get_child("Techniques").get_child("Multi-omic Data")

bio = omics.get_child('Biological molecules (eg DNA)')
geo = bio.get_child("Genomics")


# Variant types to add under Genomics; labels[i] pairs with descriptions[i].
labels = ["SNVs", "Indels", "CNVs", "Fusion Genes"]
descriptions = ["single base changes", "small insertions or deletions",
                "large scale amplifications/deletions","chromosomal translocations"]

# Appends to geo.children, so re-running this cell adds the children again.
geo.add_children(labels, descriptions)

In [473]:
geo.print_tree()

- Genomics (ID: 0_2_4_4_0_1)
  - SNVs (ID: 0_2_4_4_0_1_0)
  - Indels (ID: 0_2_4_4_0_1_1)
  - CNVs (ID: 0_2_4_4_0_1_2)
  - Fusion Genes (ID: 0_2_4_4_0_1_3)


In [474]:
new_filters = {"0_0" : build_dict(cancer_tree),
               "0_1" : build_dict(access_tree),
               "0_2" : build_dict(data_tree)}


In [475]:
# Overwrite the same longer_filter_data.js that was read earlier in the
# notebook. The layout is deliberate: one header line, the JSON on a single
# line followed by ';', then the export line with no trailing newline. The
# loading cell's slicing (drop first/last line, strip the last character)
# depends on exactly this shape.
with open(os.path.join(utils_path, "longer_filter_data.js"), "w") as f:
    f.write("const theFilters =\n")
    json.dump(new_filters, f)
    f.write(";\nexport const filterData = theFilters;")

In [144]:
data_dictionary = build_dict(data_tree)

In [468]:
[b.label for b in bio.children]

['Epigenomics',
 'Genomics',
 'Metabolomics',
 'Metagenomics',
 'Proteomics',
 'Transcriptomics']

In [146]:
filters["0_2"] = data_dictionary

In [147]:
with open("filters.json", "w") as f:
    json.dump(filters, f)

In [140]:
def change_id(node, identity):
    """
    Assigns ``identity`` to ``node`` and renumbers all of its descendants.

    Each child receives ``"<parent id>_<index>"``, where index is the child's
    position in its parent's ``children`` list, applied recursively.

    Args:
        node: A node exposing ``id`` and a ``children`` list.
        identity: The id to give ``node``.

    Returns:
        The same ``node``, with ids updated in place.
    """
    node.id = identity
    if node.children:
        # The child list is replaced by a new list holding the same,
        # renumbered child objects in the same order.
        node.children = [
            change_id(child, f"{identity}_{position}")
            for position, child in enumerate(node.children)
        ]
    return node

In [141]:
data_tree = change_id(data_tree, data_tree.id)

In [143]:
data_tree.children[-1].children[2].children

[TreeNode(id='0_2_4_2_0', label='Bioluminescence Imaging', children=0),
 TreeNode(id='0_2_4_2_1', label='Fluorescence Imaging', children=0),
 TreeNode(id='0_2_4_2_2', label='Magnetic Resonance Imaging', children=0),
 TreeNode(id='0_2_4_2_3', label='Medical photography', children=0),
 TreeNode(id='0_2_4_2_4', label='Microscopy', children=0),
 TreeNode(id='0_2_4_2_5', label='Nuclear medicine imaging', children=0),
 TreeNode(id='0_2_4_2_6', label='Radiographic imaging', children=0),
 TreeNode(id='0_2_4_2_7', label='Ultrasonography', children=0)]