## Export as BioVault Project

Export this notebook as a complete BioVault project for pipeline execution:

In [1]:
# !uv pip install -e ../../python

In [2]:
from bioscript import write_tsv
from bioscript.classifier import GenotypeClassifier
from bioscript.types import Alleles, MatchList, MatchType, Nucleotide, VariantCall

In [3]:
# rs12913832 (gene HERC2) is the single variant this notebook classifies.
# Author's coordinate note, "38/37" — presumably GRCh38 / GRCh37 in that order:
#   ['15:28120472-28120472', '15:28365618-28365618']
# https://www.ncbi.nlm.nih.gov/snp/?term=rs12913832
# https://www.ncbi.nlm.nih.gov/snp/rs12913832
# Two rsIDs are accepted; the second is presumably an alias/merged ID — TODO confirm.
# ref is A and alt is Alleles.NOT_A (the recorded reports label it "C,G,T,U").
rs12913832 = VariantCall(rsid=["rs12913832", "rs60078917"], ref=Alleles.A, alt=Alleles.NOT_A, gene="HERC2")
# Expected phenotype by genotype:
# (A;A) yields brown eye color ~80% of the time.
# (A;G) also tends toward brown.
# (G;G) gives blue eye color ~99% of the time.

In [4]:
def _format_allele_label(allele):
    """Render an allele specification as a label string for the report.

    * ``Alleles`` value  -> its members' ``.value`` strings, sorted and comma-joined.
    * ``str``            -> returned unchanged.
    * any other iterable -> ``str()`` of each item, sorted and comma-joined.
    * anything else      -> ``str(allele)``.
    """
    # The Alleles check must stay ahead of the str check so the order of
    # precedence matches regardless of how Alleles is implemented.
    if isinstance(allele, Alleles):
        parts = [member.value for member in allele]
    elif isinstance(allele, str):
        return allele
    elif hasattr(allele, "__iter__"):
        parts = [str(item) for item in allele]
    else:
        return str(allele)
    return ",".join(sorted(parts))

class HERC2Classifier(GenotypeClassifier):
    """Eye-colour classifier for the rs12913832 variant (HERC2).

    Each call to ``classify`` builds one report row, writes it to
    ``{self.output_basename}.tsv`` and returns it in a one-element list.
    """

    def classify(self, matches):
        """Build, write and return the report row for ``rs12913832``.

        Args:
            matches: object whose ``get(rs12913832)`` returns the match for
                the variant, or a falsy value when there is none.

        Returns:
            A list holding the single report-row dict that was written to TSV.
        """
        match = matches.get(rs12913832)

        # Start from the "no call" values; they are replaced only when a
        # match exists and none of its alleles are missing.
        result = "No call"
        genotype_sorted = None
        match_type = MatchType.NO_CALL.value
        ref_count = 0
        alt_count = 0

        if match and not match.has_missing:
            genotype_sorted = match.genotype_sorted
            # Genotypes outside this table are reported as "Unknown".
            eye_color_map = {
                "AA": "Brown",
                "AG": "Brown",
                "GG": "Blue",
            }
            result = eye_color_map.get(genotype_sorted, "Unknown")
            match_type = match.match_type.value
            ref_count = match.ref_count
            alt_count = match.alt_count

        # Prefer identifiers from the matched source row (the data actually
        # read from the file); otherwise fall back to the canonical rsid.
        source_row = match.source_row if match else None
        if source_row:
            rsid = source_row.rsid
            chromosome = source_row.chromosome
            position = source_row.position
        else:
            rsid, chromosome, position = "rs12913832", None, None

        # Key order is kept as-is because it is the order handed to write_tsv.
        report_row = {
            "participant_id": self.participant_id,
            "filename": self.filename,
            "rsid": rsid,
            "chromosome": chromosome,
            "position": position,
            "ref": _format_allele_label(rs12913832.ref),
            "alt": _format_allele_label(rs12913832.alt),
            "genotype": genotype_sorted,
            "match_type": match_type,
            "eye_color": result,
            "ref_count": ref_count,
            "alt_count": alt_count,
        }

        # Persist the row as a one-row TSV.
        write_tsv(f"{self.output_basename}.tsv", [report_row])

        # A list is returned so tests can inspect it (consistent with BRCA).
        return [report_row]


In [5]:
# Module-level manifest tying together the variant(s) to look up, the classifier
# class, and the name (the recorded outputs use it in "result_HERC2_<id>.tsv"
# and the "HERC2_count" key).
# Presumably discovered by the bioscript CLI/exporter — TODO confirm against bioscript docs.
__bioscript__ = {
    "variant_calls": [rs12913832],
    "classifier": HERC2Classifier,
    "name": "HERC2",
}

In [6]:
from bioscript import VariantFixture

# Use the regular VariantFixture which now includes raw_line functionality
# Single-variant fixture for the tests: it is called as fixture([genotype]) and
# the result is passed to MatchList.match_rows. The row describes rs12913832 at
# chromosome 15, position 28120472, declared against assembly GRCh38.
fixture = VariantFixture(
    [
        {"rsid": "rs12913832", "chromosome": "15", "position": 28120472}
    ],
    assembly="GRCh38",
)

In [7]:
import os

def classify_fixture(genotype):
    """Classify one synthetic genotype and return the classifier's report rows.

    The classifier writes ``result_HERC2_TEST_ID.tsv`` as a side effect. That
    file is always removed before returning or raising, so neither a failing
    classification nor a failing assertion in a test leaves it behind.

    Args:
        genotype: genotype string for rs12913832, e.g. ``"AA"``, ``"GA"``, ``"--"``.

    Returns:
        The list of report-row dicts returned by ``HERC2Classifier``.

    Raises:
        AssertionError: if the classifier did not write its TSV report.
    """
    # File written for participant_id="TEST_ID" and name="HERC2".
    output_tsv = "result_HERC2_TEST_ID.tsv"
    variants = fixture([genotype])
    matches = MatchList([rs12913832]).match_rows(variants)
    classifier = HERC2Classifier(participant_id="TEST_ID", name="HERC2", filename="test.txt")
    try:
        rows = classifier(matches)
        # Fail loudly if the report was not written.
        assert os.path.exists(output_tsv), f"{output_tsv} was not written"
        return rows
    finally:
        # Cleanup must not depend on the assertions above (or in the caller) passing.
        if os.path.exists(output_tsv):
            os.remove(output_tsv)

def test_brown_homozygous():
    """AA is a reference call and maps to Brown; source-row fields are passed through."""
    result = classify_fixture("AA")
    assert len(result) == 1
    assert result[0]["eye_color"] == "Brown"
    assert result[0]["genotype"] == "AA"
    assert result[0]["match_type"] == MatchType.REFERENCE_CALL.value
    assert result[0]["participant_id"] == "TEST_ID"
    assert result[0]["filename"] == "test.txt"
    assert result[0]["rsid"] == "rs12913832"
    assert result[0]["chromosome"] == "15"
    assert result[0]["position"] == 28120472
    assert result[0]["ref_count"] == 2
    assert result[0]["alt_count"] == 0

def test_brown_heterozygous_unsorted():
    """GA is reported with the sorted genotype AG and maps to Brown."""
    result = classify_fixture("GA")
    assert len(result) == 1
    assert result[0]["eye_color"] == "Brown"
    assert result[0]["genotype"] == "AG"
    assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
    assert result[0]["participant_id"] == "TEST_ID"
    assert result[0]["filename"] == "test.txt"
    assert result[0]["ref_count"] == 1
    assert result[0]["alt_count"] == 1

def test_blue_homozygous():
    """GG is a variant call and maps to Blue."""
    result = classify_fixture("GG")
    assert len(result) == 1
    assert result[0]["eye_color"] == "Blue"
    assert result[0]["genotype"] == "GG"
    assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
    assert result[0]["participant_id"] == "TEST_ID"
    assert result[0]["filename"] == "test.txt"
    assert result[0]["ref_count"] == 0
    assert result[0]["alt_count"] == 2

def test_no_call():
    """A "--" genotype is reported as "No call" with no genotype and zero counts."""
    result = classify_fixture("--")
    assert len(result) == 1
    assert result[0]["eye_color"] == "No call"
    assert result[0]["genotype"] is None
    assert result[0]["match_type"] == MatchType.NO_CALL.value
    assert result[0]["participant_id"] == "TEST_ID"
    assert result[0]["filename"] == "test.txt"
    assert result[0]["ref_count"] == 0
    assert result[0]["alt_count"] == 0

def test_unexpected_allele_c():
    """Test handling of unexpected C allele (not in reference map)."""
    result = classify_fixture("AC")
    assert len(result) == 1
    assert result[0]["eye_color"] == "Unknown"
    assert result[0]["genotype"] == "AC"
    assert result[0]["match_type"] == MatchType.VARIANT_CALL.value
    assert result[0]["participant_id"] == "TEST_ID"
    assert result[0]["filename"] == "test.txt"
    assert result[0]["ref_count"] == 1
    assert result[0]["alt_count"] == 1


In [8]:
# Run tests
# Each call raises AssertionError on failure, so the final print is reached
# only when every test above has passed.
test_brown_homozygous()
test_brown_heterozygous_unsorted()
test_blue_homozygous()
test_no_call()
test_unexpected_allele_c()
print("✓ All tests passed!")


✓ All tests passed!


In [9]:
from bioscript import export_from_notebook
# Generate classify_herc2.py from herc2_dev.ipynb (presumably this notebook's
# code cells — see the bioscript docs for what exactly is exported).
export_from_notebook("herc2_dev.ipynb", "classify_herc2.py")

# Shell escape: per the recorded output, this runs pytest on the exported script.
!bioscript test classify_herc2.py


Testing: classify_herc2.py
Running tests with pytest: classify_herc2.py
platform darwin -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/.venv/bin/python3
cachedir: .pytest_cache
rootdir: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2
plugins: anyio-4.12.1
collected 5 items                                                              [0m

classify_herc2.py::test_brown_homozygous [32mPASSED[0m[32m                          [ 20%][0m
classify_herc2.py::test_brown_heterozygous_unsorted [32mPASSED[0m[32m               [ 40%][0m
classify_herc2.py::test_blue_homozygous [32mPASSED[0m[32m                           [ 60%][0m
classify_herc2.py::test_no_call [32mPASSED[0m[32m                                   [ 80%][0m
classify_herc2.py::test_unexpected_allele_c [32mPASSED[0m[32m                       [100%][0m



In [10]:
# Rewritten to write the raw line to a file and use bioscript classify in the next cell
from pathlib import Path

def test_brown_heterozygous_file_based():
    """Write a one-variant genotype file (rs12913832 = GA) and return its path.

    Despite the ``test_`` prefix this function asserts nothing: it creates
    ``herc2_test_ga.tsv`` in the current directory, echoes the path and the
    file content to stdout, and returns the ``Path``.
    """
    header_line = "# rsid\tchromosome\tposition\tgenotype\n"
    variant_line = "rs12913832\t15\t28120472\tGA"

    genotype_file = Path("herc2_test_ga.tsv")
    genotype_file.write_text(header_line + variant_line)

    # Read the file back so the echoed content is what is actually on disk.
    written = genotype_file.read_text()
    print(f"Test file created: {genotype_file}")
    print(f"Content:\n{written}")
    return genotype_file

# Run the test
# NOTE(review): the function called here makes no assertions, so the
# "Test passed" message only means the file was written without an error.
test_file = test_brown_heterozygous_file_based()
print("✓ Test passed and file created!")

Test file created: herc2_test_ga.tsv
Content:
# rsid	chromosome	position	genotype
rs12913832	15	28120472	GA
✓ Test passed and file created!


In [11]:
# Classify herc2_test_ga.tsv with the exported script via the CLI, then print
# the per-participant TSV (result_HERC2_X.tsv for --participant_id="X").
!bioscript classify classify_herc2.py --file herc2_test_ga.tsv --participant_id="X"
!cat result_HERC2_X.tsv

[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2
[bioscript] Provided SNP file argument: herc2_test_ga.tsv
[bioscript] Provided path absolute? False
[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv
[bioscript] Resolved exists? True
[bioscript] CWD contents: .ipynb_checkpoints, .pytest_cache, __pycache__, classify_herc2.py, herc2-classifier, herc2_dev.ipynb, herc2_test_ga.tsv, result_HERC2_X.tsv, result_HERC2_test_user.tsv
[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv
participant_id=X
HERC2_count=1
participant_id	filename	rsid	chromosome	position	ref	alt	genotype	match_type	eye_color	ref_count	alt_count
X	herc2_test_ga.tsv	rs12913832	15	28120472	A	C,G,T,U	AG	Variant call	Brown	1	1


In [12]:
# Test JSON output to show the new consistent naming convention
# (the recorded output keys are "HERC2_count" and "HERC2_data").
!bioscript classify classify_herc2.py --file herc2_test_ga.tsv --out json --participant_id="test_user"

[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2
[bioscript] Provided SNP file argument: herc2_test_ga.tsv
[bioscript] Provided path absolute? False
[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv
[bioscript] Resolved exists? True
[bioscript] CWD contents: .ipynb_checkpoints, .pytest_cache, __pycache__, classify_herc2.py, herc2-classifier, herc2_dev.ipynb, herc2_test_ga.tsv, result_HERC2_X.tsv, result_HERC2_test_user.tsv
[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/herc2/herc2_test_ga.tsv
{
  "participant_id": "test_user",
  "HERC2_count": 1,
  "HERC2_data": [
    {
      "participant_id": "test_user",
      "filename": "herc2_test_ga.tsv",
      "rsid": "rs12913832",
      "chromosome": "15",
      "position": 28120472,
      "ref": "A",
      "alt": "C,G,T,U",
      "genotype": "AG",
 

In [13]:
from pathlib import Path
from bioscript import export_bioscript_workflow

# Export with List[GenotypeRecord] for multi-participant processing
# Builds the 'herc2-classifier' project from the exported script; the recorded
# repr shows a dynamic-nextflow template with workflow='workflow.nf'.
project = export_bioscript_workflow(
    script_path='./classify_herc2.py',
    workflow_name='herc2-classifier',
    author='madhava@openmined.org',
    target_dir="./",
    # NOTE(review): this asset path is inside the export target directory —
    # confirm the file exists before the first export on a clean checkout.
    assets=['herc2-classifier/assets/aggregate_population_stats.py'],
    inputs=[
        {
            # One input: a participant sheet whose columns are mapped below.
            'name': 'participants',
            'type': 'List[GenotypeRecord]',
            'description': 'CSV/TSV with participant_id and genotype_file columns',
            'format': 'csv',
            'mapping': {
                'participant_id': 'participant_id',
                'genotype_file': 'genotype_file',
            }
        }
    ],
    outputs=[
        {
            'name': 'classification_result',
            'type': 'File',
            'description': 'HERC2 eye color classification (aggregated)',
            'format': 'tsv',
            'path': 'result_HERC2.tsv',
        },
        {
            # Same file name as the output of the aggregate_population_stats
            # process that a later cell appends to workflow.nf.
            'name': 'population_stats',
            'type': 'File',
            'description': 'HERC2 population allele statistics (aggregated)',
            'format': 'tsv',
            'path': 'result_HERC2_stats.tsv',
        },
    ],
    version="0.1.1",
    description="Classification of HERC2 genotypes for eye color prediction.",
)
# Bare last expression: display the returned project object.
project


BioVaultProject(name='herc2-classifier', author='madhava@openmined.org', workflow='workflow.nf', description='Classification of HERC2 genotypes for eye color prediction.', template=<TemplateType.DYNAMIC_NEXTFLOW: 'dynamic-nextflow'>, version='0.1.1', assets=['classify_herc2.py', 'aggregate_population_stats.py'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], outputs=[Output(name='classification_result', type='File', description='HERC2 eye color classification (aggregated)', format='tsv', path='result_HERC2.tsv', cli_flag=None), Output(name='population_stats', type='File', description='HERC2 population allele statistics (aggregated)', format='tsv', path='result_HERC2_stats.tsv', cli_flag=None)], processes=[ProcessDefinition(name='herc2_classifier', script='classify_he

In [14]:
from pathlib import Path

# Post-process the exported workflow.nf so it also runs the population-stats
# aggregation. Both edits are guarded by a marker check, so re-running this
# cell against an already patched file changes nothing.
workflow_path = Path('herc2-classifier/workflow.nf')
if not workflow_path.exists():
    raise FileNotFoundError('workflow.nf not found. Run the export cell first.')

text = workflow_path.read_text(encoding='utf-8')

if 'aggregate_population_stats' not in text:
    # Call the new process from the workflow body and emit its output.
    text = text.replace(
        "        def aggregated = aggregate_results(\n            per_participant_results.collect()\n        )\n\n    emit:\n        classification_result = aggregated\n}\n",
        "        def aggregated = aggregate_results(\n            per_participant_results.collect()\n        )\n\n        // Aggregate population statistics\n        def population_stats = aggregate_population_stats(\n            tuple(assetsDirPath, aggregated)\n        )\n\n    emit:\n        classification_result = aggregated\n        population_stats = population_stats\n}\n",
    )
    if 'aggregate_population_stats' not in text:
        # str.replace() is a silent no-op when the anchor text is absent, which
        # would leave the process defined below appended but never invoked.
        raise RuntimeError(
            'Could not patch workflow.nf: the expected aggregate_results/emit block '
            'was not found. The exported workflow template may have changed.'
        )

# Nextflow process that runs the aggregation script from the assets directory
# on the aggregated classification results.
# NOTE: this is a non-raw Python string, so each backslash immediately followed
# by a newline is elided; the python3 command reaches workflow.nf as one line.
process_block = '''
process aggregate_population_stats {
    container 'ghcr.io/openmined/bioscript:0.1.6'
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
        tuple path(assets_dir), path(aggregated_results)

    output:
        path "result_HERC2_stats.tsv"

    script:
    """
    python3 "${assets_dir}/aggregate_population_stats.py" \
      --input "${aggregated_results}" \
      --output result_HERC2_stats.tsv
    """
}
'''

if 'process aggregate_population_stats' not in text:
    text += process_block

# Last expression: write_text returns the number of characters written.
workflow_path.write_text(text, encoding='utf-8')


2818

In [15]:
from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore

# Describe a one-step pipeline around the exported workflow and write it into
# ./herc2-classifier.
pipeline = export_bioscript_pipeline(
    pipeline_name='herc2-classifier',
    target_dir='./herc2-classifier',
    inputs={
        'samplesheet': 'List[GenotypeRecord]',
    },
    steps=[
        PipelineStep(
            step_id='herc2',
            # './' presumably resolves relative to target_dir, i.e. the
            # exported workflow project itself — TODO confirm.
            uses='./',
            with_args={
                # Feed the pipeline-level samplesheet into the input named 'participants'.
                'participants': 'inputs.samplesheet',
            },
            publish={
                'classification_result': 'File(result_HERC2.tsv)',
                'population_stats': 'File(result_HERC2_stats.tsv)',
            },
            store={
                'counts_sql': SQLStore(
                    source='classification_result',
                    # Plain string, not an f-string: '{run_id}' is passed through
                    # literally, presumably substituted at run time — TODO confirm.
                    table_name='herc2_{run_id}',
                    destination='SQL()',
                    key_column='participant_id',
                ),
            },
        ),
    ],
    version="0.1.1",
)
# Bare last expression: display the returned pipeline object.
pipeline


BioVaultPipeline(name='herc2-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='herc2', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 'File(result_HERC2.tsv)', 'population_stats': 'File(result_HERC2_stats.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='herc2_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})], version='0.1.1')