In [5]:
import os
import subprocess
import io
import pprint
import json
import pandas as pd

# operations
OP_ANAL=True
OP_PRINT_CMD=True
OP_TVER_ONLY=""

# globals
GATK="../../gatk"
OUT="output"
REF="../ref/Homo_sapiens_assembly38.fasta"

#READS="data/chr20.flow.bam"
#INTERVALS="chr20:6000000-7000000"

READS="data/200603/140258_probability_E5_Z7.aligned.duplicate_marked.sorted.bam"
INTERVALS="chr9:1700000-2000000"

# functions
def HaplotypeCaller(intervals, test, tver, opts):

    # skip?
    if len(OP_TVER_ONLY) and OP_TVER_ONLY != tver:
        return None
    
    # prepare
    outFolder = "%s/%s/%s" % (OUT, test, tver)
    tpref = "%s/%s-%s" % (outFolder, test, tver)
    vcf = "%s.vcf" % tpref
    bam = "%s.bam" % tpref
    os.system("mkdir -p " + outFolder)

    # build main command
    cmd = """
            %s HaplotypeCaller -R %s -I %s -intervals %s \
            --likelihood-calculation-engine FlowBased \
            --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false \
            -O %s \
            --bam-output %s \
            %s \
    """ % (GATK, REF, READS, intervals, vcf, bam, opts)

    # execute it
    print("Running %s-%s over %s opts %s" % (test, tver, intervals, opts))
    lines = []
    if OP_PRINT_CMD:
        print(cmd)
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            lines.append(line)
            if "ProgressMeter" in line:
                print(line, end = '')

    # analyze vcf
    anal_cmd = "bcftools stats %s  | grep -E '^SN' | cut -f 3-4 | sed 's/number of //g' | sed 's/ /-/g' | sed 's/://g'" \
                    % vcf
    anal = {"_vcf": vcf, "_test": test, "_tver": tver, "_intervals": intervals, "_opts": opts}
    with subprocess.Popen(anal_cmd, stdout=subprocess.PIPE, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            toks = line.split()
            anal[toks[0].lower()] = toks[1]
    pprint.pprint(anal)
    with io.open(tpref + "-anal.json", 'w') as file:
        file.write(json.dumps(anal))

    return anal

# perform a test
if OP_ANAL:
    # (test version, extra HaplotypeCaller options), run in this order
    _variants = (
        ("baseline", ""),
        ("collapse12m", "--ultima-assembly-collapse-hmer-size 12 --ultima-flow-matrix-mods 10,12,11,12"),
        ("collapse10m", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 11,10,12,10"),
    )
    for _tver, _extra_opts in _variants:
        HaplotypeCaller(INTERVALS, "test2", _tver, _extra_opts)



Running test2-baseline over chr9:1700000-2000000 opts 

            ../../gatk HaplotypeCaller -R ../ref/Homo_sapiens_assembly38.fasta -I data/200603/140258_probability_E5_Z7.aligned.duplicate_marked.sorted.bam -intervals chr9:1700000-2000000             --likelihood-calculation-engine FlowBased             --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false             -O output/test2/baseline/test2-baseline.vcf             --bam-output output/test2/baseline/test2-baseline.bam                  
09:22:11.333 INFO  ProgressMeter - Starting traversal
09:22:11.333 INFO  ProgressMeter -        Current Locus  Elapsed Minutes     Regions Processed   Regions/Minute
09:22:21.383 INFO  ProgressMeter -         chr9:1736048              0.2                   290           1731.3
09:22:31.457 INFO  ProgressMeter -         chr9:1847992              0.3                  1080           3220.2
09:22:41.604 INFO  ProgressMeter -         chr9:1961070              0.5                  1920  