# Thalassemia (HBB)

This notebook shows how to develop a classifier with embedded tests in Jupyter using thalassemia-associated ClinVar variants.


In [1]:
# !uv pip install -e ../../python

In [2]:
import pandas as pd
from bioscript import optional_int, optional_str, write_tsv
from bioscript.classifier import GenotypeClassifier
from bioscript.types import VariantCall
from bioscript import assets_dir

In [3]:
# Assets directory as resolved by bioscript's assets_dir() helper.
ASSETS_DIR = assets_dir()

# File name of the ClinVar table that get_vcs() reads from ASSETS_DIR.
CLINVAR_TSV = 'thalassemia_clinvar.tsv'

# Column order handed to write_tsv() for the main result file.
RESULT_HEADERS = [
    # who / which input file
    'participant_id', 'filename',
    # where
    'gene', 'rsid', 'chromosome', 'position',
    # what was observed
    'genotype', 'ref', 'alt',
    # how it was classified
    'variant_type', 'match_type',
]


In [4]:
def generate_variant_calls(df: pd.DataFrame) -> list[VariantCall]:
    """Build one VariantCall per row of a ClinVar DataFrame.

    Args:
        df: Frame with ``rsid``, ``ref``, ``alt``, ``chromosome`` and
            ``position`` columns; ``gene`` is read with ``.get()`` and so
            may be absent.

    Returns:
        The VariantCall objects, in row order.
    """
    return [
        VariantCall(
            rsid=optional_str(record["rsid"]),
            ref=optional_str(record["ref"]),
            alt=optional_str(record["alt"]),
            chromosome=optional_str(record["chromosome"]),
            position=optional_int(record["position"]),
            # .get() tolerates a missing "gene" entry; passed on with upper=True.
            gene=optional_str(record.get("gene"), upper=True),
        )
        for _, record in df.iterrows()
    ]

In [5]:
def get_vcs() -> list[VariantCall]:
    """Load thalassemia-associated variant calls from a ClinVar TSV file.

    Reads ``ASSETS_DIR / CLINVAR_TSV`` as a tab-separated table, prints the
    number of rows loaded, and converts the rows with
    ``generate_variant_calls()``.

    Returns:
        One VariantCall per row of the TSV.
    """
    # Spell the delimiter as an escape sequence: a literal tab between the
    # quotes is invisible and easily converted to spaces by editors/formatters.
    df = pd.read_csv(ASSETS_DIR / CLINVAR_TSV, sep='\t')
    print(f'Loaded {len(df)} variants from {CLINVAR_TSV}')
    return generate_variant_calls(df)


In [6]:
class ThalassemiaClassifier(GenotypeClassifier):
    """Classifier that writes thalassemia-associated variant matches to TSV."""

    def classify(self, matches):
        """Write the variant rows to ``<output_basename>.tsv`` and return them.

        ``matches.categorize_report_rows()`` splits the matches into three row
        groups. The variant group is always written (with RESULT_HEADERS); the
        other two are written to ``<output_basename>_ref.tsv`` and
        ``<output_basename>_no.tsv`` only when ``self.debug`` is truthy.
        """
        has_matches = bool(matches.all_matches)
        if not has_matches:
            # Informational only: the (empty) result file is still written below.
            print('No variant matches were found.', flush=True)

        reference_rows, variant_rows, unmatched_rows = matches.categorize_report_rows(
            self.participant_id, self.filename
        )

        if self.debug:
            write_tsv(f'{self.output_basename}_ref.tsv', reference_rows)
            write_tsv(f'{self.output_basename}_no.tsv', unmatched_rows)

        write_tsv(f'{self.output_basename}.tsv', variant_rows, headers=RESULT_HEADERS)

        # The embedded tests inspect the returned variant rows.
        return variant_rows


In [7]:
# Module-level registration dict naming the variant-call loader, the
# classifier class, and the classifier name defined in this notebook.
# NOTE(review): presumably read by the bioscript export/test tooling — confirm.
__bioscript__ = {
    'variant_calls': get_vcs,  # the callable itself, not its result
    'classifier': ThalassemiaClassifier,
    'name': 'THALASSEMIA',
}


## Tests

Write tests using the `test_*` function-naming convention:

In [8]:
from bioscript import VariantFixture
from bioscript.types import MatchList
import os

# Test fixtures for thalassemia-associated HBB variants (subset from
# thalassemia_clinvar.tsv). Every site is on chromosome 11, so only the rsid
# and position vary; the order here is the order of the genotype lists that
# the tests pass to fixture(...).
_HBB_TEST_SITES = (
    ('rs33985472', 5225485),
    ('rs63751128', 5225487),
    ('rs33978907', 5225488),
    ('rs34809925', 5225592),
    ('rs35117167', 5225605),
    ('rs33971634', 5225660),
)

fixture = VariantFixture(
    [
        {'rsid': rsid, 'chromosome': '11', 'position': position}
        for rsid, position in _HBB_TEST_SITES
    ],
    assembly='GRCh38',
)


In [9]:
def test_thalassemia_heterozygous_variants():
    """Test detection of heterozygous thalassemia-associated variants.

    The first three fixture sites carry one ref and one alt allele each, so
    the classifier is expected to return three HBB variant rows.
    """
    variants = fixture(['TC', 'TC', 'AG', 'GG', 'TT', 'GG'])

    # Create mini variant call list for testing
    test_vcs = [
        VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
        VariantCall(rsid='rs63751128', ref='T', alt='C', chromosome='11', position=5225487, gene='HBB'),
        VariantCall(rsid='rs33978907', ref='A', alt='G', chromosome='11', position=5225488, gene='HBB'),
    ]

    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
    classifier = ThalassemiaClassifier(participant_id='TEST_HET', name='THALASSEMIA', filename='test.txt')
    output_path = 'result_THALASSEMIA_TEST_HET.tsv'

    try:
        result = classifier(matches)

        # Previously implied by os.remove() raising on a missing file.
        assert os.path.exists(output_path), f'Expected result file {output_path} to be written'
        assert len(result) == 3, f'Expected 3 variant rows, got {len(result)}'
        assert all(row['gene'] == 'HBB' for row in result), 'All variants should be HBB'
        assert all(row['match_type'] == 'VARIANT_CALL' for row in result), 'All should be variant calls'
    finally:
        # Clean up even when an assertion fails, so a failing run does not
        # leave a stale result file in the working directory.
        if os.path.exists(output_path):
            os.remove(output_path)


In [10]:
def test_thalassemia_homozygous_variant():
    """Test detection of a homozygous thalassemia-associated variant.

    Only rs34809925 is checked; its fixture genotype is 'CC' against ref 'G' /
    alt 'C', so exactly one variant row is expected.
    """
    variants = fixture(['TT', 'TT', 'AA', 'CC', 'TT', 'GG'])

    test_vcs = [
        VariantCall(rsid='rs34809925', ref='G', alt='C', chromosome='11', position=5225592, gene='HBB'),
    ]

    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
    classifier = ThalassemiaClassifier(participant_id='TEST_HOM', name='THALASSEMIA', filename='test.txt')
    output_path = 'result_THALASSEMIA_TEST_HOM.tsv'

    try:
        result = classifier(matches)

        # Previously implied by os.remove() raising on a missing file.
        assert os.path.exists(output_path), f'Expected result file {output_path} to be written'
        assert len(result) == 1, f'Expected 1 variant row, got {len(result)}'
        assert result[0]['gene'] == 'HBB', 'Variant should be HBB'
        assert result[0]['genotype'] == 'CC', 'Should be homozygous CC'
    finally:
        # Clean up even when an assertion fails, so a failing run does not
        # leave a stale result file in the working directory.
        if os.path.exists(output_path):
            os.remove(output_path)


In [11]:
def test_no_variants():
    """Test classifier with no matching variants.

    The only variant call checked (rs33985472, ref 'T' / alt 'C') has fixture
    genotype 'TT', so no variant rows are expected.
    """
    variants = fixture(['TT', 'TT', 'AA', 'GG', 'TT', 'GG'])

    test_vcs = [
        VariantCall(rsid='rs33985472', ref='T', alt='C', chromosome='11', position=5225485, gene='HBB'),
    ]

    matches = MatchList(variant_calls=test_vcs).match_rows(variants)
    classifier = ThalassemiaClassifier(participant_id='TEST_REF', name='THALASSEMIA', filename='test.txt')
    output_path = 'result_THALASSEMIA_TEST_REF.tsv'

    try:
        result = classifier(matches)

        # Previously implied by os.remove() raising on a missing file.
        assert os.path.exists(output_path), f'Expected result file {output_path} to be written'
        assert len(result) == 0, f'Expected 0 variant rows, got {len(result)}'
    finally:
        # Clean up even when an assertion fails, so a failing run does not
        # leave a stale result file in the working directory.
        if os.path.exists(output_path):
            os.remove(output_path)


## Run Tests in Jupyter

You can run tests directly in the notebook:

In [12]:
# Run the embedded tests in definition order. A failing assert raises and
# stops the cell before the success message is printed.
for run_case in (
    test_thalassemia_heterozygous_variants,
    test_thalassemia_homozygous_variant,
    test_no_variants,
):
    run_case()
print('✓ All tests passed!')


✓ All tests passed!


## Export to Python Module

Export this notebook to a Python file:

```bash
bioscript export thalassemia_dev.ipynb -o classify_thalassemia.py
```

Or in Python:

```python
from bioscript import export_from_notebook
export_from_notebook('thalassemia_dev.ipynb', 'classify_thalassemia.py')
```


In [13]:
# Export this notebook (thalassemia_dev.ipynb) to classify_thalassemia.py.
# The call's return value is the last expression, so it is shown as the cell output.
from bioscript import export_from_notebook
export_from_notebook('thalassemia_dev.ipynb', 'classify_thalassemia.py')


PosixPath('classify_thalassemia.py')

In [14]:
!bioscript test classify_thalassemia.py



Testing: classify_thalassemia.py
Running tests with pytest: classify_thalassemia.py
platform darwin -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- /Users/madhavajay/dev/bioscript/workspace2/.venv/bin/python3
cachedir: .pytest_cache
rootdir: /Users/madhavajay/dev/bioscript/workspace2/examples/thalassemia
plugins: anyio-4.12.0
collected 3 items

classify_thalassemia.py::test_thalassemia_heterozygous_variants PASSED   [ 33%]
classify_thalassemia.py::test_thalassemia_homozygous_variant PASSED      [ 66%]
classify_thalassemia.py::test_no_variants PASSED                         [100%]



In [15]:
from bioscript import export_bioscript_workflow

# Declaration of the workflow's single input: the participants samplesheet.
participants_input = {
    'name': 'participants',
    'type': 'List[GenotypeRecord]',
    'description': 'CSV/TSV with participant_id and genotype_file columns',
    'format': 'csv',
    'mapping': {
        'participant_id': 'participant_id',
        'genotype_file': 'genotype_file',
    },
}

# Declaration of the workflow's single output: the aggregated result TSV.
classification_output = {
    'name': 'classification_result',
    'type': 'File',
    'description': 'Thalassemia variant classification (aggregated)',
    'format': 'tsv',
    'path': 'result_THALASSEMIA.tsv',
}

project = export_bioscript_workflow(
    script_path='./classify_thalassemia.py',
    workflow_name='thalassemia-classifier',
    author='madhava@openmined.org',
    target_dir='./',
    # NOTE(review): this is a set literal, while the returned project reports
    # `assets` as a list — confirm whether a list is the intended argument type.
    assets={'thalassemia_clinvar.tsv'},
    inputs=[participants_input],
    outputs=[classification_output],
    version='0.1.1',
    description='Classification of thalassemia-associated variants using ClinVar reference data.',
)
# Bare last expression so the notebook displays the project's repr.
project


BioVaultProject(name='thalassemia-classifier', author='madhava@openmined.org', workflow='workflow.nf', description='Classification of thalassemia-associated variants using ClinVar reference data.', template=<TemplateType.DYNAMIC_NEXTFLOW: 'dynamic-nextflow'>, version='0.1.1', assets=['classify_thalassemia.py', 'thalassemia_clinvar.tsv'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], outputs=[Output(name='classification_result', type='File', description='Thalassemia variant classification (aggregated)', format='tsv', path='result_THALASSEMIA.tsv', cli_flag=None)], processes=[ProcessDefinition(name='thalassemia_classifier', script='classify_thalassemia.py', container='ghcr.io/openmined/bioscript:0.1.5', kind='bioscript')], docker_image='ghcr.io/openmined/bioscript:0.1

In [16]:
from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore

# SQL store for the step's published classification result; the table name
# carries a {run_id} placeholder and rows are keyed on participant_id.
counts_store = SQLStore(
    source='classification_result',
    table_name='thalassemia_{run_id}',
    destination='SQL()',
    key_column='participant_id',
)

# Single pipeline step: uses the project in './', fed from the pipeline's
# samplesheet input.
thalassemia_step = PipelineStep(
    step_id='thalassemia',
    uses='./',
    with_args={'participants': 'inputs.samplesheet'},
    publish={'classification_result': 'File(result_THALASSEMIA.tsv)'},
    store={'counts_sql': counts_store},
)

pipeline = export_bioscript_pipeline(
    pipeline_name='thalassemia-classifier',
    target_dir='./thalassemia-classifier',
    inputs={'samplesheet': 'List[GenotypeRecord]'},
    steps=[thalassemia_step],
    version='0.1.1',
)
# Bare last expression so the notebook displays the pipeline's repr.
pipeline


BioVaultPipeline(name='thalassemia-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='thalassemia', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 'File(result_THALASSEMIA.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='thalassemia_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})], version='0.1.1')