In [None]:
import pandas as pd
from ppkt2synergy import (
    load_hpo, CohortDataLoader, PhenopacketMatrixGenerator,
    HPOMatrixProcessor, SynergyTreeBuilder, SynergyTreeVisualizer
)
from ppktstore.registry import configure_phenopacket_registry
from gpsea.preprocessing import configure_caching_cohort_creator, load_phenopackets
from gpsea.analysis.predicate import variant_effect
from gpsea.analysis.clf import monoallelic_classifier
from gpsea.model import VariantEffect
import os
from typing import List, Union

def visualize_synergy_tree_for_variant_class(
    gene_symbol: str,
    mane_tx_id: str,
    variant_effect_type: VariantEffect,
    output_dir: str = "synergy_tree/tree_image",
    max_k: int = 3,
    nan_strategy: str = 'drop',
    threshold: float = 0.7,
    phenopacket_store_version: Union[str, None] = None,
    title: Union[str, None] = None,
) -> None:
    """
    Build and visualize a synergy tree for a given gene and VariantEffect.

    Individuals of the gene's cohort are split into those carrying a variant
    with the requested effect on the given transcript (target = 1) and all
    others (target = 0). That binary column is used as the external target
    for the HPO matrix from which the synergy tree is built.

    Parameters:
        gene_symbol (str): cohort / gene name, e.g., 'TGFB2'
        mane_tx_id (str): transcript accession, e.g., 'NM_003238.6'
        variant_effect_type (VariantEffect): e.g., VariantEffect.MISSENSE_VARIANT
        output_dir (str): directory for the files written by the visualizer;
            created if it does not exist
        max_k (int): maximum synergy size, forwarded to SynergyTreeBuilder
        nan_strategy (str): missing-value strategy forwarded to
            HPOMatrixProcessor.filter_hpo_matrix (default 'drop')
        threshold (float): HPO frequency threshold forwarded to
            HPOMatrixProcessor.filter_hpo_matrix
        phenopacket_store_version (str | None): e.g., '0.1.24'; None is passed
            through to the registry unchanged
        title (str | None): plot title, also used as the output file stem
            ("<title>_tree"). When None, "<gene_symbol>_<label>_synergy" is used.

    Returns:
        None. The tree is written below `output_dir` as a side effect.

    Raises:
        ValueError: if the phenopacket store yields no phenopackets for
            `gene_symbol`.
    """

    label = str(variant_effect_type)  # e.g. 'missense_variant'

    # Without this fallback the file would be written as "None_tree" and the
    # visualizer would receive None as its title.
    if title is None:
        title = f"{gene_symbol}_{label}_synergy"

    # Load data
    # NOTE: the ontology path is relative to the current working directory.
    hp_path = os.path.join("..", "..", "tests", "data", "hp.json")
    hpo = load_hpo(file=hp_path)
    phenopacket_registry = configure_phenopacket_registry()
    with phenopacket_registry.open_phenopacket_store(phenopacket_store_version) as ps:
        # Materialize inside the `with` block so the data outlives the store handle.
        phenopackets = tuple(ps.iter_cohort_phenopackets(gene_symbol))
    if not phenopackets:
        raise ValueError(
            f"No phenopackets found for cohort '{gene_symbol}' in the phenopacket store"
        )
    cohort_creator = configure_caching_cohort_creator(hpo)
    cohort, _ = load_phenopackets(phenopackets=phenopackets, cohort_creator=cohort_creator)

    # Create target matrix
    predicate = variant_effect(variant_effect_type, tx_id=mane_tx_id)
    clf = monoallelic_classifier(
        a_predicate=predicate,
        b_predicate=~predicate,
        a_label=label,
        b_label="other"
    )
    # 1 when the individual falls into the `label` group; 0 when it falls into
    # "other" or when the classifier returns a falsy result for the individual.
    target_values = []
    for patient in cohort:
        categorization = clf.test(patient)
        is_target = bool(categorization) and categorization.category.name == label
        target_values.append(1 if is_target else 0)
    variant_matrix = pd.DataFrame(
        data=target_values,
        # NOTE(review): relies on the private `_meta_label` attribute; presumably
        # these ids must match the row index produced by PhenopacketMatrixGenerator.
        index=[patient.labels._meta_label for patient in cohort],
        columns=[label]
    )

    # Generate HPO matrix
    # NOTE(review): `phenopacket_store_version` is not forwarded here, so the
    # loader may read a different store release than the cohort above - confirm.
    dataloader = CohortDataLoader.from_ppkt_store(gene_symbol)
    matrix_generator = PhenopacketMatrixGenerator(dataloader, external_target_matrix=variant_matrix)
    hpo_matrix, target_matrix = HPOMatrixProcessor.filter_hpo_matrix(
        matrix_generator,
        threshold=threshold,
        nan_strategy=nan_strategy,
    )

    # Build and visualize tree (the first target column is the variant-class column)
    builder = SynergyTreeBuilder(hpo_matrix, target_matrix.iloc[:, 0], max_k=max_k)

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{title}_tree")

    SynergyTreeVisualizer(builder.build()).visualize(title, filename=output_path)


Loeys-Dietz syndrome 1 (LDS1)

In [41]:
# Synergy tree for TGFBR1 missense variants (HPO threshold 0.75).
visualize_synergy_tree_for_variant_class(
    gene_symbol='TGFBR1',
    mane_tx_id='NM_004612.4',
    variant_effect_type=VariantEffect.MISSENSE_VARIANT,
    threshold=0.75,
    title='TGFBR1_LDS1_missense_synergy',
)

Individuals Processed: 100%|██████████| 41/41 [00:00<00:00, 129.67 individuals/s]




Loeys-Dietz syndrome 2 (LDS2)

In [None]:
# Synergy tree for TGFBR2 missense variants (HPO threshold 0.6).
visualize_synergy_tree_for_variant_class(
    gene_symbol='TGFBR2',
    mane_tx_id='NM_003242.6',
    variant_effect_type=VariantEffect.MISSENSE_VARIANT,
    threshold=0.6,
    title='TGFBR2_LDS2_missense_synergy',
)

Loeys-Dietz syndrome 3 (LDS3)

In [4]:
# Synergy tree for SMAD3 missense variants (HPO threshold 0.8).
visualize_synergy_tree_for_variant_class(
    gene_symbol='SMAD3',
    mane_tx_id='NM_005902.4',
    variant_effect_type=VariantEffect.MISSENSE_VARIANT,
    threshold=0.8,
    title='SMAD3_LDS3_missense_synergy',
)

Individuals Processed: 100%|██████████| 49/49 [00:00<00:00, 297.54 individuals/s]




Loeys-Dietz syndrome 4 (LDS4)

In [6]:
# Synergy tree for TGFB2 missense variants (HPO threshold 0.9).
visualize_synergy_tree_for_variant_class(
    gene_symbol='TGFB2',
    mane_tx_id='NM_003238.6',
    variant_effect_type=VariantEffect.MISSENSE_VARIANT,
    threshold=0.9,
    title='TGFB2_LDS4_missense_synergy',
)

Individuals Processed: 100%|██████████| 36/36 [00:01<00:00, 20.97 individuals/s]




Loeys-Dietz syndrome 5 (LDS5)

In [25]:
# Synergy tree for TGFB3 missense variants (HPO threshold 0.75).
visualize_synergy_tree_for_variant_class(
    gene_symbol='TGFB3',
    mane_tx_id='NM_003239.5',
    variant_effect_type=VariantEffect.MISSENSE_VARIANT,
    threshold=0.75,
    title='TGFB3_LDS5_missense_synergy',
)

Individuals Processed: 100%|██████████| 75/75 [00:00<00:00, 103.31 individuals/s]




Loeys-Dietz syndrome 6 (LDS6)

In [None]:
# Synergy tree for SMAD2 missense variants (HPO threshold 0.4).
visualize_synergy_tree_for_variant_class(
    gene_symbol='SMAD2',
    mane_tx_id='NM_005901.6',
    variant_effect_type=VariantEffect.MISSENSE_VARIANT,
    threshold=0.4,
    title='SMAD2_LDS6_missense_synergy',
)

In [None]:
# Pool the six Loeys-Dietz genes into a single loader and derive the filtered
# HPO / target matrices from it.
# NOTE: despite its name, `phenopackets` holds whatever
# CohortDataLoader.from_ppkt_store returns.
phenopackets = CohortDataLoader.from_ppkt_store(
    ['TGFBR1', 'TGFBR2', 'SMAD3', 'TGFB2', 'TGFB3', 'SMAD2']
)
matrix_gen = PhenopacketMatrixGenerator(phenopackets)
hpo_matrix, target_matrix = HPOMatrixProcessor.filter_hpo_matrix(
    matrix_gen,
    threshold=0.5,
    nan_strategy='drop',
)

In [None]:
# NOTE(review): this cell performs the same load/filter steps as the preceding
# cell and rebinds the same names; it is presumably a leftover copy - confirm
# whether it can be removed.
phenopackets = CohortDataLoader.from_ppkt_store(
    ['TGFBR1', 'TGFBR2', 'SMAD3', 'TGFB2', 'TGFB3', 'SMAD2']
)
matrix_gen = PhenopacketMatrixGenerator(phenopackets)
hpo_matrix, target_matrix = HPOMatrixProcessor.filter_hpo_matrix(
    matrix_gen,
    threshold=0.5,
    nan_strategy='drop',
)

