In [5]:
import os
import subprocess
import io
import pprint
import json
import pandas as pd

# ---- operation switches ----
# When true, the test runs at the bottom of this cell are executed.
OP_ANAL = True
# When true, the full HaplotypeCaller command line is printed before it runs.
OP_PRINT_CMD = False
# When non-empty, only the test version with this exact name is run.
OP_TVER_ONLY = ""

# ---- shared settings ----
# Executable placed at the start of the HaplotypeCaller command line.
GATK = "../../gatk"
# Root folder under which per-test output folders are created.
OUT = "output"
# Input reads, passed to HaplotypeCaller as -I.
READS = "data/chr20.flow.bam"
# Reference, passed to HaplotypeCaller as -R.
REF = "../ref/Homo_sapiens_assembly38.fasta"
# Region handed to the test runs below.
INTERVALS = "chr20:6000000-7000000"

# functions
def HaplotypeCaller(intervals, test, tver, opts):

    # skip?
    if len(OP_TVER_ONLY) and OP_TVER_ONLY != tver:
        return None
    
    # prepare
    outFolder = "%s/%s/%s" % (OUT, test, tver)
    tpref = "%s/%s-%s" % (outFolder, test, tver)
    vcf = "%s.vcf" % tpref
    bam = "%s.bam" % tpref
    os.system("mkdir -p " + outFolder)

    # build main command
    cmd = """
            %s HaplotypeCaller -R %s -I %s -intervals %s \
            --likelihood-calculation-engine FlowBased \
            --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false \
            -O %s \
            --bam-output %s \
            %s \
    """ % (GATK, REF, READS, intervals, vcf, bam, opts)

    # execute it
    print("Running %s-%s over %s opts %s" % (test, tver, intervals, opts))
    lines = []
    if OP_PRINT_CMD:
        print(cmd)
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            lines.append(line)
            if "ProgressMeter" in line:
                print(line, end = '')

    # analyze vcf
    anal_cmd = "bcftools stats %s  | grep -E '^SN' | cut -f 3-4 | sed 's/number of //g' | sed 's/ /-/g' | sed 's/://g'" \
                    % vcf
    anal = {"_vcf": vcf, "_test": test, "_tver": tver, "_intervals": intervals, "_opts": opts}
    with subprocess.Popen(anal_cmd, stdout=subprocess.PIPE, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            toks = line.split()
            anal[toks[0].lower()] = toks[1]
    pprint.pprint(anal)
    with io.open(tpref + "-anal.json", 'w') as file:
        file.write(json.dumps(anal))

    return anal

# run the configured test versions (only when analysis is switched on)
if OP_ANAL:
    # each entry is (test version name, extra HaplotypeCaller options)
    for _tver, _opts in (
        # ("baseline", ""),   # disabled
        ("collapse12m", "--ultima-assembly-collapse-hmer-size 12 --ultima-flow-matrix-mods 10,12,11,12"),
        ("collapse10m", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 11,10,12,10"),
    ):
        HaplotypeCaller(INTERVALS, "test1", _tver, _opts)



Running test1-collapse12m over chr20:6000000-7000000 opts --ultima-assembly-collapse-hmer-size 12 --ultima-flow-matrix-mods 10,12,11,12
17:40:24.219 INFO  ProgressMeter - Starting traversal
17:40:24.219 INFO  ProgressMeter -        Current Locus  Elapsed Minutes     Regions Processed   Regions/Minute
17:40:34.646 INFO  ProgressMeter -        chr20:6019757              0.2                   150            863.1
17:40:44.926 INFO  ProgressMeter -        chr20:6051124              0.3                   390           1130.1
17:40:55.037 INFO  ProgressMeter -        chr20:6075768              0.5                   570           1109.7
17:41:05.061 INFO  ProgressMeter -        chr20:6111737              0.7                   820           1204.6
17:41:15.195 INFO  ProgressMeter -        chr20:6134077              0.8                  1000           1177.0
17:41:25.309 INFO  ProgressMeter -        chr20:6161117              1.0                  1190           1168.8
17:41:35.322 INFO  Progres

17:51:42.888 INFO  ProgressMeter -        chr20:6702634              4.7                  5240           1116.6
17:51:53.345 INFO  ProgressMeter -        chr20:6726978              4.9                  5430           1115.6
17:52:03.901 INFO  ProgressMeter -        chr20:6756118              5.0                  5650           1120.3
17:52:14.648 INFO  ProgressMeter -        chr20:6778633              5.2                  5820           1114.4
17:52:24.686 INFO  ProgressMeter -        chr20:6797638              5.4                  5970           1107.7
17:52:34.844 INFO  ProgressMeter -        chr20:6832148              5.6                  6220           1118.9
17:52:45.830 INFO  ProgressMeter -        chr20:6862078              5.7                  6440           1121.6
17:52:56.295 INFO  ProgressMeter -        chr20:6886976              5.9                  6630           1120.6
17:53:06.605 INFO  ProgressMeter -        chr20:6918555              6.1                  6860          