# APOL1 Classifier Development

This notebook develops an APOL1 genotype classifier (G0/G1/G2) with embedded tests in Jupyter, then exports it as a standalone script, a workflow, and a pipeline.

In [1]:
from bioscript.classifier import DiploidResult, GenotypeClassifier, GenotypeEnum
from bioscript.types import Alleles, VariantCall

In [2]:
# APOL1 sites consumed by the classifier. Each VariantCall pairs an rsID (or a
# list of rsIDs) with the reference allele and the allele set counted as alternate.

# Reference A; presumably any non-A base counts as alternate (per the NOT_A name).
rs73885319 = VariantCall(
    rsid="rs73885319",
    ref=Alleles.A,
    alt=Alleles.NOT_A,
)

# Reference T; presumably any non-T base counts as alternate (per the NOT_T name).
rs60910145 = VariantCall(
    rsid="rs60910145",
    ref=Alleles.T,
    alt=Alleles.NOT_T,
)

# Insertion/deletion site: reference I, alternate D. Three rsIDs are listed for
# this one call.
rs71785313 = VariantCall(
    rsid=["rs71785313", "rs1317778148", "rs143830837"],
    ref=Alleles.I,
    alt=Alleles.D,
)

In [3]:
class APOL1Genotypes(GenotypeEnum):
    """Labels used for each side of the DiploidResult built by the classifier."""

    # NOTE(review): members are declared G2, G1, G0. DiploidResult.sorted() may
    # depend on this order (the tests expect "G2/G1" and "G1/G0") — confirm
    # before reordering.
    G2 = "G2"
    G1 = "G1"
    G0 = "G0"

# Placeholder label used on both sides when none of the APOL1 sites were matched.
MISSING = "G-"

In [4]:
from bioscript import write_tsv
from bioscript.types import MatchType

def _format_allele_label(allele):
    """Render an allele specification as a display string.

    - ``Alleles`` instance: the ``.value`` of each item it yields, sorted and
      comma-joined.
    - ``str``: returned unchanged.
    - any other iterable: ``str()`` of each item, sorted and comma-joined.
    - anything else: ``str(allele)``.
    """

    def _sorted_csv(parts):
        return ",".join(sorted(parts))

    if isinstance(allele, Alleles):
        return _sorted_csv(member.value for member in allele)
    # Strings are iterable too, so they must be handled before the generic
    # iterable branch or they would be split into characters.
    if isinstance(allele, str):
        return allele
    if not hasattr(allele, "__iter__"):
        return str(allele)
    return _sorted_csv(str(item) for item in allele)

class APOL1Classifier(GenotypeClassifier):
    """Derive one APOL1 status from three variant matches and report a row per site."""

    def classify(self, matches) -> list[dict[str, object]]:
        """Classify the matched APOL1 sites and write the per-site report.

        Args:
            matches: Lookup supporting ``.get(variant_call)``; a site without a
                match is expected to come back as ``None``.

        Returns:
            One dict per site, in the order rs71785313, rs73885319, rs60910145.
            Every row carries the same ``apol1_status``. The same rows are also
            written to ``<output_basename>.tsv`` through ``write_tsv``.
        """
        deletion_match = matches.get(rs71785313)
        first_snp_match = matches.get(rs73885319)
        second_snp_match = matches.get(rs60910145)

        # (fallback rsID, variant definition, match) for each reported site.
        lookups = [
            ("rs71785313", rs71785313, deletion_match),
            ("rs73885319", rs73885319, first_snp_match),
            ("rs60910145", rs60910145, second_snp_match),
        ]

        if all(found is None for _, _, found in lookups):
            # Nothing was matched at any site: report the placeholder on both sides.
            diploid_result = DiploidResult(MISSING, MISSING)
        else:
            # The status uses the raw alt counts from the matches, not the
            # zeroed counts that no-call rows get in the report below.
            d_count = deletion_match.alt_count if deletion_match else 0
            snp1_alts = first_snp_match.alt_count if first_snp_match else 0
            snp2_alts = second_snp_match.alt_count if second_snp_match else 0

            # G1 is only counted when both SNP sites carry an alternate allele.
            both_snps_alt = snp1_alts > 0 and snp2_alts > 0
            g1_total = snp1_alts + snp2_alts if both_snps_alt else 0

            g0, g1, g2 = APOL1Genotypes.G0, APOL1Genotypes.G1, APOL1Genotypes.G2
            if d_count == 2:
                allele_pair = (g2, g2)
            elif d_count == 1:
                allele_pair = (g2, g1) if g1_total >= 2 else (g2, g0)
            elif g1_total == 4:
                allele_pair = (g1, g1)
            elif g1_total >= 2:
                allele_pair = (g1, g0)
            else:
                allele_pair = (g0, g0)
            diploid_result = DiploidResult(*allele_pair)

        apol1_status = str(diploid_result.sorted())

        no_call = MatchType.NO_CALL.value
        report_rows = []
        for fallback_rsid, variant_call, match in lookups:
            if match and match.source_row:
                # Prefer the identifiers found on the matched input row.
                source = match.source_row
                rsid, chromosome, position = source.rsid, source.chromosome, source.position
            else:
                # Otherwise fall back to the variant definition: first alias in
                # sorted order if it exposes aliases, else the hard-coded rsID.
                aliases = getattr(getattr(variant_call, "rsid", None), "aliases", None)
                rsid = sorted(aliases)[0] if aliases else fallback_rsid
                chromosome = getattr(variant_call, "chromosome", None)
                position = getattr(variant_call, "position", None)

            # Start from the no-call defaults and overwrite them only when the
            # match is a usable call.
            genotype = None
            match_type = no_call
            ref_count = alt_count = 0
            if match:
                genotype = match.genotype_sorted
                observed_type = match.match_type.value
                if not (match.has_missing or observed_type == no_call):
                    match_type = observed_type
                    ref_count = match.ref_count
                    alt_count = match.alt_count

            row = {
                "participant_id": self.participant_id,
                "filename": self.filename,
                "rsid": rsid,
                "chromosome": chromosome,
                "position": position,
                "ref": _format_allele_label(variant_call.ref),
                "alt": _format_allele_label(variant_call.alt),
                "genotype": genotype,
                "match_type": match_type,
                "ref_count": ref_count,
                "alt_count": alt_count,
                "apol1_status": apol1_status,
            }
            report_rows.append(row)

        write_tsv(f"{self.output_basename}.tsv", report_rows)
        return report_rows

In [5]:
# Module-level registration record: the variant calls to match, the classifier
# class to run, and the classifier name.
# NOTE(review): presumably read by the bioscript tooling when the exported
# script is tested or run — confirm against the bioscript documentation.
__bioscript__ = {
    "variant_calls": [rs73885319, rs60910145, rs71785313],
    "classifier": APOL1Classifier,
    "name": "APOL1",
}

## Tests

Write tests as functions named `test_*`; after export, `bioscript test` runs them with pytest, which collects functions by that prefix:

In [6]:
from bioscript import VariantFixture
from bioscript.types import MatchList

# Test fixture describing the three APOL1 sites on chromosome 22 (GRCh38).
# Calling it with a list of genotypes yields the variant rows the tests feed to
# MatchList; genotypes are given in the order the sites are listed here.
fixture = VariantFixture(
    [
        {"rsid": rsid, "chromosome": "22", "position": position}
        for rsid, position in (
            ("rs73885319", 36265860),
            ("rs60910145", 36265988),
            ("rs71785313", 36266000),
        )
    ],
    assembly="GRCh38",
)

In [7]:
import os
from bioscript.types import MatchType

# TSV that the tests remove before and after each run; presumably the file the
# classifier writes for participant "TEST_ID".
OUTPUT_FILE = "result_APOL1_TEST_ID.tsv"


def cleanup_output():
    """Delete ``OUTPUT_FILE`` if it exists; do nothing otherwise."""
    if not os.path.exists(OUTPUT_FILE):
        return
    os.remove(OUTPUT_FILE)

def classify_fixture(genotypes):
    """Run the classifier on fixture genotypes and return its rows keyed by rsID.

    Args:
        genotypes: Genotype strings passed to ``fixture``.

    Returns:
        Dict mapping each row's ``rsid`` to the row dict.

    Raises:
        AssertionError: if the result is not a list of exactly three rows covering
            the three APOL1 rsIDs, or a row lacks the expected participant,
            filename, or allele/count columns.
    """
    # Start from a clean slate so a stale report cannot mask a failure.
    cleanup_output()

    variant_rows = fixture(genotypes)
    matcher = MatchList([rs73885319, rs60910145, rs71785313])
    matches = matcher.match_rows(variant_rows)

    classifier = APOL1Classifier(participant_id="TEST_ID", name="APOL1", filename="test.txt")
    result = classifier(matches)

    assert isinstance(result, list)
    assert len(result) == 3

    rows = {}
    for row in result:
        rows[row["rsid"]] = row
    assert set(rows.keys()) == {"rs73885319", "rs60910145", "rs71785313"}

    for row in rows.values():
        assert row["participant_id"] == "TEST_ID"
        assert row["filename"] == "test.txt"
        for column in ("ref", "alt", "ref_count", "alt_count"):
            assert column in row
    return rows

def test_g0_homozygous():
    """All-reference genotypes (AA / TT / II) must classify as G0/G0.

    The two SNP sites are expected as reference calls with two reference
    alleles. The rs71785313 site is expected as a no-call with zeroed counts,
    even though its genotype is reported as "II".
    """
    expected = {
        # rsid: (genotype, match_type, ref_count, alt_count)
        "rs71785313": ("II", MatchType.NO_CALL.value, 0, 0),
        "rs73885319": ("AA", MatchType.REFERENCE_CALL.value, 2, 0),
        "rs60910145": ("TT", MatchType.REFERENCE_CALL.value, 2, 0),
    }

    try:
        rows = classify_fixture(["AA", "TT", "II"])
        for rsid, (genotype, match_type, ref_count, alt_count) in expected.items():
            row = rows[rsid]
            assert row["apol1_status"] == "G0/G0", rsid
            assert row["genotype"] == genotype, rsid
            assert row["match_type"] == match_type, rsid
            assert row["ref_count"] == ref_count, rsid
            assert row["alt_count"] == alt_count, rsid
    finally:
        # Remove the report even when an assertion fails, so a failed run does
        # not leave the TSV behind.
        cleanup_output()

def test_all_apol1_status_combinations():
    """Check each expected APOL1 status against a representative genotype triple.

    Each case is ``(expected status, input genotypes, expected reported genotype
    per rsID)``. Input genotypes are listed in the order rs73885319,
    rs60910145, rs71785313. Reported genotypes are the sorted form, for example
    "TC" is reported as "CT" and "ID" as "DI".
    """
    cases = [
        ("G0/G0", ["AA", "TT", "II"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "II"}),
        ("G1/G0", ["AG", "TC", "II"], {"rs73885319": "AG", "rs60910145": "CT", "rs71785313": "II"}),
        ("G1/G1", ["GG", "CC", "II"], {"rs73885319": "GG", "rs60910145": "CC", "rs71785313": "II"}),
        ("G2/G0", ["AA", "TT", "ID"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "DI"}),
        ("G2/G1", ["AG", "TC", "ID"], {"rs73885319": "AG", "rs60910145": "CT", "rs71785313": "DI"}),
        ("G2/G2", ["AA", "TT", "DD"], {"rs73885319": "AA", "rs60910145": "TT", "rs71785313": "DD"}),
    ]
    callable_types = {
        MatchType.REFERENCE_CALL.value,
        MatchType.VARIANT_CALL.value,
    }

    try:
        for expected_status, genotypes, expected_genotypes in cases:
            rows = classify_fixture(genotypes)

            assert set(rows.keys()) == {"rs73885319", "rs60910145", "rs71785313"}
            assert all(row["apol1_status"] == expected_status for row in rows.values())

            for rsid, expected_genotype in expected_genotypes.items():
                assert rows[rsid]["genotype"] == expected_genotype

            # rs71785313 is present in every test case and currently emitted as
            # No call by the matcher.
            assert rows["rs71785313"]["match_type"] == MatchType.NO_CALL.value
            assert rows["rs71785313"]["ref_count"] == 0
            assert rows["rs71785313"]["alt_count"] == 0

            # rs73885319 and rs60910145 must still be callable and typed correctly.
            assert rows["rs73885319"]["match_type"] in callable_types
            assert rows["rs60910145"]["match_type"] in callable_types
    finally:
        # Remove the report even when a case fails, so a failed run does not
        # leave the TSV behind.
        cleanup_output()

In [8]:
def test_g1_homozygous():
    """Homozygous alternate at both SNP sites (GG / CC) with II must give G1/G1.

    Both SNP sites are expected as variant calls with two alternate alleles.
    The rs71785313 site is expected as a no-call with zeroed counts.
    """
    expected = {
        # rsid: (genotype, match_type, ref_count, alt_count)
        "rs73885319": ("GG", MatchType.VARIANT_CALL.value, 0, 2),
        "rs60910145": ("CC", MatchType.VARIANT_CALL.value, 0, 2),
        "rs71785313": ("II", MatchType.NO_CALL.value, 0, 0),
    }

    try:
        rows = classify_fixture(["GG", "CC", "II"])
        assert all(row["apol1_status"] == "G1/G1" for row in rows.values())

        for rsid, (genotype, match_type, ref_count, alt_count) in expected.items():
            row = rows[rsid]
            assert row["genotype"] == genotype, rsid
            assert row["match_type"] == match_type, rsid
            assert row["ref_count"] == ref_count, rsid
            assert row["alt_count"] == alt_count, rsid
    finally:
        # Remove the report even when an assertion fails, so a failed run does
        # not leave the TSV behind.
        cleanup_output()

## Run Tests in Jupyter

You can run tests directly in the notebook:

In [9]:
# Run tests
# Each test function is called directly. A failing assert raises here, so the
# success message below is printed only when all three pass.
test_g0_homozygous()
test_g1_homozygous()
test_all_apol1_status_combinations()
print("✓ All tests passed!")

✓ All tests passed!


In [10]:
from bioscript import export_from_notebook
# Export this notebook (apol1_dev.ipynb) to the script classify_apol1.py; the
# call's return value is shown as the cell output.
export_from_notebook("apol1_dev.ipynb", "classify_apol1.py")

PosixPath('classify_apol1.py')

In [11]:
# Run the exported script's test_* functions through the bioscript CLI.
!bioscript test classify_apol1.py


Testing: classify_apol1.py
Running tests with pytest: classify_apol1.py
platform darwin -- Python 3.13.5, pytest-9.0.2, pluggy-1.6.0 -- /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/.venv/bin/python3
cachedir: .pytest_cache
rootdir: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1
plugins: anyio-4.12.1
collected 3 items                                                              [0m

classify_apol1.py::test_g0_homozygous [32mPASSED[0m[32m                             [ 33%][0m
classify_apol1.py::test_all_apol1_status_combinations [32mPASSED[0m[32m             [ 66%][0m
classify_apol1.py::test_g1_homozygous [32mPASSED[0m[32m                             [100%][0m



In [12]:
from bioscript import export_bioscript_workflow

# Generate the 'apol1-classifier' workflow project from the exported script.
# It declares one input (the participant sheet) and four TSV outputs.
project = export_bioscript_workflow(
    script_path='./classify_apol1.py',
    workflow_name='apol1-classifier',
    author='madhava@openmined.org',
    target_dir='./',
    # Aggregation scripts; the aggregate_* Nextflow processes added to
    # workflow.nf in the next cell run them from the assets directory.
    assets=[
        'apol1-classifier/assets/aggregate_population_stats.py',
        'apol1-classifier/assets/aggregate_classification_stats.py',
        'apol1-classifier/assets/aggregate_apol1_status.py',
    ],
    inputs=[
        {
            'name': 'participants',
            'type': 'List[GenotypeRecord]',
            'description': 'CSV/TSV with participant_id and genotype_file columns',
            'format': 'csv',
            # Maps each declared field to the column of the same name.
            'mapping': {
                'participant_id': 'participant_id',
                'genotype_file': 'genotype_file',
            }
        }
    ],
    outputs=[
        # NOTE(review): presumably the aggregated per-participant classifier
        # rows — confirm against the generated workflow.nf.
        {
            'name': 'classification_result',
            'type': 'File',
            'description': 'APOL1 genotype classification',
            'format': 'tsv',
            'path': 'result_APOL1.tsv',
        },
        # The three paths below match the output files of the aggregate_*
        # processes defined in the next cell.
        {
            'name': 'population_stats',
            'type': 'File',
            'description': 'APOL1 population allele statistics (aggregated)',
            'format': 'tsv',
            'path': 'result_APOL1_stats.tsv',
        },
        {
            'name': 'classification_stats',
            'type': 'File',
            'description': 'APOL1 status counts by allele class (G0/G1/G2) with hetero/homo split',
            'format': 'tsv',
            'path': 'result_APOL1_classification_stats.tsv',
        },
        {
            'name': 'apol1_status',
            'type': 'File',
            'description': 'Per-participant APOL1 status summary',
            'format': 'tsv',
            'path': 'result_APOL1_status.tsv',
        },
    ],
    version="0.1.1",
    description="Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.",
)
# Display the returned project object as the cell output.
project

BioVaultProject(name='apol1-classifier', author='madhava@openmined.org', workflow='workflow.nf', description='Classification of APOL1 genotypes (G0, G1, G2) for kidney disease risk assessment.', template=<TemplateType.DYNAMIC_NEXTFLOW: 'dynamic-nextflow'>, version='0.1.1', assets=['classify_apol1.py', 'aggregate_population_stats.py', 'aggregate_classification_stats.py', 'aggregate_apol1_status.py'], parameters=[], inputs=[Input(name='participants', type='List[GenotypeRecord]', description='CSV/TSV with participant_id and genotype_file columns', format='csv', path=None, mapping={'participant_id': 'participant_id', 'genotype_file': 'genotype_file'}, cli_flag=None)], outputs=[Output(name='classification_result', type='File', description='APOL1 genotype classification', format='tsv', path='result_APOL1.tsv', cli_flag=None), Output(name='population_stats', type='File', description='APOL1 population allele statistics (aggregated)', format='tsv', path='result_APOL1_stats.tsv', cli_flag=None),

In [13]:
from pathlib import Path

# Load the generated workflow so the aggregation steps can be spliced into it.
workflow_path = Path('apol1-classifier/workflow.nf')
if not workflow_path.exists():
    raise FileNotFoundError('workflow.nf not found. Run the export cell first.')

text = workflow_path.read_text(encoding='utf-8')

# Skip the splice when the workflow already mentions the aggregation step, so
# re-running this cell does not patch the file twice.
if 'aggregate_population_stats' not in text:
    # Swap the generated aggregate/emit block for one that also calls the three
    # aggregate_* processes and emits their outputs.
    patched_text = text.replace(
        "        def aggregated = aggregate_results(\n            per_participant_results.collect()\n        )\n\n    emit:\n        classification_result = aggregated\n}",
        "        def aggregated = aggregate_results(\n            per_participant_results.collect()\n        )\n\n        // Aggregate population statistics\n        def population_stats_ch = aggregate_population_stats(\n            Channel.value(assetsDirPath),\n            aggregated\n        )\n\n        // Aggregate APOL1 status statistics (G0/G1/G2 with hetero/homo counts)\n        def classification_stats_ch = aggregate_classification_stats(\n            Channel.value(assetsDirPath),\n            aggregated\n        )\n\n        // Emit per-participant APOL1 status summary.\n        def apol1_status_ch = aggregate_apol1_status(\n            Channel.value(assetsDirPath),\n            aggregated\n        )\n\n    emit:\n        classification_result = aggregated\n        population_stats = population_stats_ch\n        classification_stats = classification_stats_ch\n        apol1_status = apol1_status_ch\n}",
    )
    # str.replace returns the text unchanged when the anchor is absent. Stop
    # here rather than write a workflow whose new processes are never called.
    if patched_text == text:
        raise RuntimeError(
            'Could not find the expected aggregate_results/emit block in workflow.nf; '
            'the generated template may have changed. Update the anchor text in this cell.'
        )
    text = patched_text

# Nextflow process definitions appended to workflow.nf further down when they
# are missing. Each one runs a single asset script on the aggregated results
# and publishes one TSV to params.results_dir.
# NOTE(review): the container tag (0.1.6) is repeated in all three blocks;
# keep them in sync when bumping it.

# Population allele statistics -> result_APOL1_stats.tsv
population_stats_block = '''
process aggregate_population_stats {
    container 'ghcr.io/openmined/bioscript:0.1.6'
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
        path assets_dir
        path aggregated_results

    output:
        path "result_APOL1_stats.tsv"

    script:
    """
    python3 "${assets_dir}/aggregate_population_stats.py" \\
      --input "${aggregated_results}" \\
      --output result_APOL1_stats.tsv
    """
}
'''

# Classification statistics -> result_APOL1_classification_stats.tsv
classification_stats_block = '''
process aggregate_classification_stats {
    container 'ghcr.io/openmined/bioscript:0.1.6'
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
        path assets_dir
        path aggregated_results

    output:
        path "result_APOL1_classification_stats.tsv"

    script:
    """
    python3 "${assets_dir}/aggregate_classification_stats.py" \\
      --input "${aggregated_results}" \\
      --output result_APOL1_classification_stats.tsv
    """
}
'''

# APOL1 status summary -> result_APOL1_status.tsv
apol1_status_block = '''
process aggregate_apol1_status {
    container 'ghcr.io/openmined/bioscript:0.1.6'
    publishDir params.results_dir, mode: 'copy', overwrite: true

    input:
        path assets_dir
        path aggregated_results

    output:
        path "result_APOL1_status.tsv"

    script:
    """
    python3 "${assets_dir}/aggregate_apol1_status.py" \\
      --input "${aggregated_results}" \\
      --output result_APOL1_status.tsv
    """
}
'''

# Append only the process definitions that workflow.nf does not already contain,
# in a fixed order, so re-running this cell never duplicates a process.
text += "".join(
    block
    for marker, block in (
        ('process aggregate_population_stats', population_stats_block),
        ('process aggregate_classification_stats', classification_stats_block),
        ('process aggregate_apol1_status', apol1_status_block),
    )
    if marker not in text
)

# Last expression of the cell: the number of characters written is displayed.
workflow_path.write_text(text, encoding='utf-8')

4227

In [14]:
from bioscript import export_bioscript_pipeline, PipelineStep, SQLStore

# Generate a one-step pipeline definition inside ./apol1-classifier.
pipeline = export_bioscript_pipeline(
    pipeline_name='apol1-classifier',
    target_dir='./apol1-classifier',
    inputs={
        'samplesheet': 'List[GenotypeRecord]',
    },
    steps=[
        PipelineStep(
            step_id='apol1',
            # NOTE(review): './' presumably resolves to the workflow exported
            # into target_dir — confirm.
            uses='./',
            # Feed the pipeline's samplesheet input to the workflow's
            # 'participants' input.
            with_args={
                'participants': 'inputs.samplesheet',
            },
            # The four files declared as outputs of the workflow export.
            publish={
                'classification_result': 'File(result_APOL1.tsv)',
                'population_stats': 'File(result_APOL1_stats.tsv)',
                'classification_stats': 'File(result_APOL1_classification_stats.tsv)',
                'apol1_status': 'File(result_APOL1_status.tsv)',
            },
            store={
                # '{run_id}' is a literal placeholder here (this is not an
                # f-string); presumably filled in by the pipeline runtime.
                'counts_sql': SQLStore(
                    source='classification_result',
                    table_name='apol1_{run_id}',
                    destination='SQL()',
                    key_column='participant_id',
                ),
            },
        ),
    ],
    version="0.1.1",
)
# Display the returned pipeline object as the cell output.
pipeline

BioVaultPipeline(name='apol1-classifier', inputs={'samplesheet': 'List[GenotypeRecord]'}, steps=[PipelineStep(step_id='apol1', uses='./', with_args={'participants': 'inputs.samplesheet'}, publish={'classification_result': 'File(result_APOL1.tsv)', 'population_stats': 'File(result_APOL1_stats.tsv)', 'classification_stats': 'File(result_APOL1_classification_stats.tsv)', 'apol1_status': 'File(result_APOL1_status.tsv)'}, store={'counts_sql': SQLStore(source='classification_result', table_name='apol1_{run_id}', destination='SQL()', participant_column='participant_id', key_column='participant_id')})], version='0.1.1')

In [15]:
# Write a minimal APOL1 genotype TSV and inspect the classifier output
from pathlib import Path

def create_apol1_test_file():
    """Write a tab-separated sample genotype file and return its path.

    The file ``apol1_test_g1.tsv`` is created in the current working directory
    with a ``#``-prefixed header line and one row for each of the three APOL1
    sites (genotypes GG, CC, II). The path and content are printed.
    """
    lines = [
        '# rsid\tchromosome\tposition\tgenotype\n',
        'rs73885319\t22\t36265860\tGG\n',
        'rs60910145\t22\t36265988\tCC\n',
        'rs71785313\t22\t36266000\tII\n',
    ]
    test_file = Path('apol1_test_g1.tsv')
    test_file.write_text(''.join(lines))

    print(f'Test file created: {test_file}')
    print(f'Content:\n{test_file.read_text()}')
    return test_file

# Create the sample file now so the classify cell below can read it.
test_file = create_apol1_test_file()
print('✓ Test file ready!')


Test file created: apol1_test_g1.tsv
Content:
# rsid	chromosome	position	genotype
rs73885319	22	36265860	GG
rs60910145	22	36265988	CC
rs71785313	22	36266000	II

✓ Test file ready!


In [16]:
# Classify the sample TSV with the exported script, then print the report for this participant.
!bioscript classify classify_apol1.py --file apol1_test_g1.tsv --participant_id="TEST_APOL1"
!cat result_APOL1_TEST_APOL1.tsv

[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1
[bioscript] Provided SNP file argument: apol1_test_g1.tsv
[bioscript] Provided path absolute? False
[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_test_g1.tsv
[bioscript] Resolved exists? True
[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt
[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_test_g1.tsv
participant_id=TEST_APOL1
APO

In [17]:
# Example deCODEme-style genotype CSV
from pathlib import Path

def create_apol1_decodeme_file():
    """Write a deCODEme-style sample CSV and return its path.

    The file ``apol1_decodeme.csv`` is created in the current working directory
    with the header ``Name,Variation,Chromosome,Position`` and twelve rows, the
    last four covering the APOL1 sites. The path and content are printed.

    NOTE(review): the rows carry a ``Variation`` column (e.g. ``A/G``) but no
    per-participant genotype column — confirm this is the layout the loader
    expects.
    """
    lines = [
        'Name,Variation,Chromosome,Position',
        'rs4477212,A/G,1,72017',
        'rs2185539,C/T,1,556738',
        'rs6681105,C/T,1,581938',
        'rs11240767,C/T,1,718814',
        'rs3094315,C/T,1,742429',
        'rs3131972,C/T,1,742584',
        'rs3131969,C/T,1,744045',
        'rs1048488,C/T,1,750775',
        'rs73885319,A/G,22,36526907',
        'rs60910145,T/G,22,36527035',
        'rs71785313,TTATAA/-,22,36527047',
        'rs143830837,TTATAA/-,22,36527047',
    ]
    csv_path = Path('apol1_decodeme.csv')
    # Every line, including the last, is terminated with a newline.
    csv_path.write_text(''.join(f'{line}\n' for line in lines))

    print(f'CSV file created: {csv_path}')
    print(f'Content:\n{csv_path.read_text()}')
    return csv_path

# Create the deCODEme-style sample now so the classify cell below can read it.
decodeme_file = create_apol1_decodeme_file()
print('✓ DecodeME sample ready!')


CSV file created: apol1_decodeme.csv
Content:
Name,Variation,Chromosome,Position
rs4477212,A/G,1,72017
rs2185539,C/T,1,556738
rs6681105,C/T,1,581938
rs11240767,C/T,1,718814
rs3094315,C/T,1,742429
rs3131972,C/T,1,742584
rs3131969,C/T,1,744045
rs1048488,C/T,1,750775
rs73885319,A/G,22,36526907
rs60910145,T/G,22,36527035
rs71785313,TTATAA/-,22,36527047
rs143830837,TTATAA/-,22,36527047

✓ DecodeME sample ready!


In [18]:
# Classify the deCODEme-style CSV with the exported script, then print the report for this participant.
!bioscript classify classify_apol1.py --file apol1_decodeme.csv --participant_id="DECODE"
!cat result_APOL1_DECODE.tsv


[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1
[bioscript] Provided SNP file argument: apol1_decodeme.csv
[bioscript] Provided path absolute? False
[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_decodeme.csv
[bioscript] Resolved exists? True
[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt
[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_decodeme.csv
participant_id=DECODE
APOL

In [19]:
# Example MyHeritage CSV with build information in header comments
from pathlib import Path

def create_apol1_myheritage_file():
    """Write a MyHeritage-style sample CSV and return its path.

    The file ``apol1_myheritage.csv`` is created in the current working
    directory. It starts with five ``#`` comment lines (one of which names
    reference build 37), then the ``RSID,CHROMOSOME,POSITION,RESULT`` header,
    then ten quoted data rows. Some rows end with a trailing ``# ...`` note
    after the quoted fields. The path and content are printed.
    """
    lines = [
        '# MyHeritage DNA raw data.',
        '# This file was generated on 2018-02-14 04:06:47',
        '# For each SNP, we provide the identifier, chromosome number, base pair position and genotype. The genotype is reported on the forward (+) strand with respect to the human reference build 37.',
        '# THIS INFORMATION IS FOR YOUR PERSONAL USE AND IS INTENDED FOR GENEALOGICAL RESEARCH ONLY. IT IS NOT INTENDED FOR MEDICAL OR HEALTH PURPOSES.',
        '# PLEASE BE AWARE THAT THE DOWNLOADED DATA WILL NO LONGER BE PROTECTED BY OUR SECURITY MEASURES.',
        'RSID,CHROMOSOME,POSITION,RESULT',
        '"rs4477212","1","82154","AA"',
        '"rs3094315","1","752566","--"',
        '"rs3131972","1","752721","GG"',
        '"rs12562034","1","768448","--"',
        '"rs12124819","1","776546","--"',
        '"rs12913832","15","28365618","--" # HERC2',
        '"rs73885319","22","36661906","AG" # APOL1 test entries',
        '"rs60910145","22","36662034","TG" # APOL1 test entries',
        '"rs71785313","22","36662046","ID" # APOL1 test entries',
        '"rs143830837","22","36662046","ID" # APOL1 test entries',
    ]
    csv_path = Path('apol1_myheritage.csv')
    # Every line, including the last, is terminated with a newline.
    csv_path.write_text(''.join(f'{line}\n' for line in lines))

    print(f'CSV file created: {csv_path}')
    print(f'Content:\n{csv_path.read_text()}')
    return csv_path

# Create the MyHeritage-style sample now so the classify cell below can read it.
myheritage_file = create_apol1_myheritage_file()
print('✓ MyHeritage sample ready!')


CSV file created: apol1_myheritage.csv
Content:
# MyHeritage DNA raw data.
# This file was generated on 2018-02-14 04:06:47
# For each SNP, we provide the identifier, chromosome number, base pair position and genotype. The genotype is reported on the forward (+) strand with respect to the human reference build 37.
# THIS INFORMATION IS FOR YOUR PERSONAL USE AND IS INTENDED FOR GENEALOGICAL RESEARCH ONLY. IT IS NOT INTENDED FOR MEDICAL OR HEALTH PURPOSES.
# PLEASE BE AWARE THAT THE DOWNLOADED DATA WILL NO LONGER BE PROTECTED BY OUR SECURITY MEASURES.
RSID,CHROMOSOME,POSITION,RESULT
"rs4477212","1","82154","AA"
"rs3094315","1","752566","--"
"rs3131972","1","752721","GG"
"rs12562034","1","768448","--"
"rs12124819","1","776546","--"
"rs12913832","15","28365618","--" # HERC2
"rs73885319","22","36661906","AG" # APOL1 test entries
"rs60910145","22","36662034","TG" # APOL1 test entries
"rs71785313","22","36662046","ID" # APOL1 test entries
"rs143830837","22","36662046","ID" # APOL1 test entrie

In [20]:
# Classify the MyHeritage-style CSV with the exported script, then print the report for this participant.
!bioscript classify classify_apol1.py --file apol1_myheritage.csv --participant_id="MYHERITAGE"
!cat result_APOL1_MYHERITAGE.tsv


[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1
[bioscript] Provided SNP file argument: apol1_myheritage.csv
[bioscript] Provided path absolute? False
[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_myheritage.csv
[bioscript] Resolved exists? True
[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt
[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_myheritage.csv
participant_id=MYHER

In [21]:
# Example headerless genotype file (space-delimited)
from pathlib import Path

def create_apol1_headerless_file():
    """Write a headerless, space-delimited sample genotype file and return its path.

    The file ``apol1_headerless.txt`` is created in the current working
    directory with ten chromosome-1 rows, one blank line, and four
    chromosome-22 rows for the APOL1 sites. There is no header line. The path
    and content are printed.
    """
    lines = [
        'rs3934834 1 995669 CC',
        'rs6687776 1 1020428 CC',
        'rs9651273 1 1021403 AA',
        'rs4970420 1 1096336 GG',
        'rs11260549 1 1111657 GG',
        'rs2887286 1 1145994 TT',
        'rs7515488 1 1153667 CC',
        'rs11804831 1 1184667 TT',
        'rs880051 1 1483590 GG',
        'rs2296716 1 1487687 CC',
        '',
        'rs73885319 22 36265860 AG',
        'rs60910145 22 36265988 TG',
        'rs71785313 22 36266000 ID',
        'rs143830837 22 36266000 ID',
    ]
    sample_path = Path('apol1_headerless.txt')
    # Every entry, including the empty one, is terminated with a newline, so
    # the empty entry becomes a blank line in the file.
    sample_path.write_text(''.join(f'{line}\n' for line in lines))

    print(f'Headerless file created: {sample_path}')
    print(f'Content:\n{sample_path.read_text()}')
    return sample_path

# Create the headerless sample now so the classify cell below can read it.
headerless_file = create_apol1_headerless_file()
print('✓ Headerless sample ready!')


Headerless file created: apol1_headerless.txt
Content:
rs3934834 1 995669 CC
rs6687776 1 1020428 CC
rs9651273 1 1021403 AA
rs4970420 1 1096336 GG
rs11260549 1 1111657 GG
rs2887286 1 1145994 TT
rs7515488 1 1153667 CC
rs11804831 1 1184667 TT
rs880051 1 1483590 GG
rs2296716 1 1487687 CC

rs73885319 22 36265860 AG
rs60910145 22 36265988 TG
rs71785313 22 36266000 ID
rs143830837 22 36266000 ID

✓ Headerless sample ready!


In [22]:
# Classify the headerless file with the exported script, then print the report for this participant.
!bioscript classify classify_apol1.py --file apol1_headerless.txt --participant_id="HEADERLESS"
!cat result_APOL1_HEADERLESS.tsv


[bioscript] Current working directory: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1
[bioscript] Provided SNP file argument: apol1_headerless.txt
[bioscript] Provided path absolute? False
[bioscript] Resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_headerless.txt
[bioscript] Resolved exists? True
[bioscript] CWD contents: .DS_Store, .ipynb_checkpoints, .pytest_cache, README.md, __pycache__, apol1-classifier, apol1_decodeme.csv, apol1_dev.ipynb, apol1_headerless.txt, apol1_myheritage.csv, apol1_test_g1.tsv, classify_apol1.py, genotype_files, process_samplesheet.sh, result_APOL1_DECODE.tsv, result_APOL1_HEADERLESS.tsv, result_APOL1_MYHERITAGE.tsv, result_APOL1_TEST_APOL1.tsv, results.tsv, samplesheet.csv, test_snps.txt, test_snps_p002.txt, test_snps_p003.txt
[bioscript] Using resolved SNP path: /Users/madhavajay/dev/biovault-desktop/workspace8/bioscript/examples/apol1/apol1_headerless.txt
participant_id=HEADE