In [2]:
import os
import subprocess
import io
import pprint
import json
import pandas as pd

# operations
# Master switch: the test runs at the bottom of the cell execute only when True.
OP_ANAL = True
# When True, HaplotypeCaller() echoes the full command line before running it.
OP_PRINT_CMD = False
# When non-empty, HaplotypeCaller() skips every run whose tver differs from this.
OP_TVER_ONLY = "collapse10m2"

# globals
# Launcher used as the executable of the HaplotypeCaller command.
GATK = "../../gatk"
# Root folder under which per-test/per-version outputs are written.
OUT = "output"
# Input reads, passed to the tool as -I.
READS = "data/chr20.flow.bam"
# Reference FASTA, passed to the tool as -R.
REF = "../ref/Homo_sapiens_assembly38.fasta"
# Interval handed to every test run below.
INTERVALS = "chr20:6000000-7000000"

# functions
def HaplotypeCaller(intervals, test, tver, opts):

    # skip?
    if len(OP_TVER_ONLY) and OP_TVER_ONLY != tver:
        return None
    
    # prepare
    outFolder = "%s/%s/%s" % (OUT, test, tver)
    tpref = "%s/%s-%s" % (outFolder, test, tver)
    vcf = "%s.vcf" % tpref
    bam = "%s.bam" % tpref
    os.system("mkdir -p " + outFolder)

    # build main command
    cmd = """
            %s HaplotypeCaller -R %s -I %s -intervals %s \
            --likelihood-calculation-engine FlowBased \
            --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false \
            -O %s \
            --bam-output %s \
            %s \
    """ % (GATK, REF, READS, intervals, vcf, bam, opts)

    # execute it
    print("Running %s-%s over %s opts %s" % (test, tver, intervals, opts))
    lines = []
    if OP_PRINT_CMD:
        print(cmd)
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            lines.append(line)
            if "ProgressMeter" in line:
                print(line, end = '')

    # analyze vcf
    anal_cmd = "bcftools stats %s  | grep -E '^SN' | cut -f 3-4 | sed 's/number of //g' | sed 's/ /-/g' | sed 's/://g'" \
                    % vcf
    anal = {"_vcf": vcf, "_test": test, "_tver": tver, "_intervals": intervals, "_opts": opts}
    with subprocess.Popen(anal_cmd, stdout=subprocess.PIPE, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            toks = line.split()
            anal[toks[0].lower()] = toks[1]
    pprint.pprint(anal)
    with io.open(tpref + "-anal.json", 'w') as file:
        file.write(json.dumps(anal))

    return anal

# perform a test
if OP_ANAL:
    # (tver, extra HaplotypeCaller options) for each "test1" variant, in run order
    test1_variants = (
        ("baseline", ""),
        ("baseline12m", "--ultima-flow-matrix-mods 10,12,11,12"),
        ("baseline10m", "--ultima-flow-matrix-mods 8,10,9,10,11,10,12,10"),
        ("collapse12", "--ultima-assembly-collapse-hmer-size 12"),
        ("collapse12m", "--ultima-assembly-collapse-hmer-size 12 --ultima-flow-matrix-mods 10,12,11,12"),
        ("collapse10", "--ultima-assembly-collapse-hmer-size 10"),
        ("collapse10m", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 8,10,9,10,11,10,12,10"),
        ("collapse10m1", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 8,10,9,10"),
        ("collapse10m2", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 11,10,12,10"),
    )
    for variant, extra_opts in test1_variants:
        HaplotypeCaller(INTERVALS, "test1", variant, extra_opts)



Running test1-collapse10m2 over chr20:6000000-7000000 opts --ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 11,10,12,10
11:45:25.498 INFO  ProgressMeter - Starting traversal
11:45:25.498 INFO  ProgressMeter -        Current Locus  Elapsed Minutes     Regions Processed   Regions/Minute
11:45:36.203 INFO  ProgressMeter -        chr20:6018361              0.2                   140            784.7
11:45:46.429 INFO  ProgressMeter -        chr20:6042041              0.3                   320            917.3
11:45:57.224 INFO  ProgressMeter -        chr20:6068737              0.5                   520            983.4
11:46:07.577 INFO  ProgressMeter -        chr20:6096849              0.7                   720           1026.6
11:46:17.797 INFO  ProgressMeter -        chr20:6125439              0.9                   920           1055.5
11:46:28.130 INFO  ProgressMeter -        chr20:6147006              1.0                  1090           1044.2
11:46:38.324 INFO  Progre