In [1]:
import os
import subprocess
import io
import pprint
import json
import pandas as pd

# operations
# Switches that control what this cell does when executed.
# OP_ANAL: when True, the test runs at the bottom of the cell are executed.
OP_ANAL=True
# OP_PRINT_CMD: when True, HaplotypeCaller() prints the full command line before running it.
OP_PRINT_CMD=True
# OP_TVER_ONLY: when non-empty, HaplotypeCaller() skips every test version (tver)
# other than this one; the empty string means "run all versions".
OP_TVER_ONLY=""

# globals
# GATK: path of the GATK launcher, used as the first word of the command line.
GATK="../../gatk"
# OUT: root folder for results; each run writes under OUT/<test>/<tver>/.
OUT="output"
# REF: reference FASTA handed to GATK via -R.
REF="../ref/Homo_sapiens_assembly38.fasta"

# Alternative input sets kept for reference; only the last uncommented
# READS / INTERVALS / TESTNAME assignments below take effect.
#READS="data/chr20.flow.bam"
#INTERVALS="chr20:6000000-7000000"
#TESTNAME="test1"

#READS="data/200603/140258_probability_E5_Z7.aligned.duplicate_marked.sorted.bam"
#INTERVALS="chr9:1700000-2000000"
#TESTNAME="test2"

# Active input set: READS is handed to GATK via -I, TESTNAME names the output sub-folder.
READS="data/200603/140258_probability_E5_Z7.aligned.duplicate_marked.sorted.bam"
INTERVALS="chr9:6600000-7000000"
TESTNAME="test3"

# NOTE: this re-assignment overrides the INTERVALS value set just above.
# The string is pasted verbatim into the shell command after the interval flag,
# so the embedded "-L <region>" pairs reach GATK as additional arguments.
INTERVALS="chr9:6943300-6943900 -L chr9:6745098-6746098 -L chr9:6688000-6689000"

# functions
def HaplotypeCaller(intervals, test, tver, opts):

    # skip?
    if len(OP_TVER_ONLY) and OP_TVER_ONLY != tver:
        return None
    
    # prepare
    outFolder = "%s/%s/%s" % (OUT, test, tver)
    tpref = "%s/%s-%s" % (outFolder, test, tver)
    vcf = "%s.vcf" % tpref
    bam = "%s.bam" % tpref
    os.system("mkdir -p " + outFolder)

    # build main command
    cmd = """
            %s HaplotypeCaller -R %s -I %s -intervals %s \
            --likelihood-calculation-engine FlowBased \
            --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false -mbq 0 \
            -O %s \
            --bam-output %s \
            %s \
    """ % (GATK, REF, READS, intervals, vcf, bam, opts)

    # execute it
    print("Running %s-%s over %s opts %s" % (test, tver, intervals, opts))
    lines = []
    if OP_PRINT_CMD:
        print(cmd)
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            lines.append(line)
            if "ProgressMeter" in line:
                print(line, end = '')

    # analyze vcf
    anal_cmd = "bcftools stats %s  | grep -E '^SN' | cut -f 3-4 | sed 's/number of //g' | sed 's/ /-/g' | sed 's/://g'" \
                    % vcf
    anal = {"_vcf": vcf, "_test": test, "_tver": tver, "_intervals": intervals, "_opts": opts}
    with subprocess.Popen(anal_cmd, stdout=subprocess.PIPE, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            toks = line.split()
            anal[toks[0].lower()] = toks[1]
    pprint.pprint(anal)
    with io.open(tpref + "-anal.json", 'w') as file:
        file.write(json.dumps(anal))

    return anal

# run every test version (baseline first, then the two hmer-collapse variants)
# over the configured intervals; each entry is (tver, extra GATK options)
if OP_ANAL:
    for variant, extra_opts in (
        ("baseline", ""),
        ("collapse12m", "--ultima-assembly-collapse-hmer-size 12 --ultima-flow-matrix-mods 10,12,11,12"),
        ("collapse10m", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 11,10,12,10"),
    ):
        HaplotypeCaller(INTERVALS, TESTNAME, variant, extra_opts)



Running test3-baseline over chr9:6943300-6943900 -L chr9:6745098-6746098 -L chr9:6688000-6689000 opts 

            ../../gatk HaplotypeCaller -R ../ref/Homo_sapiens_assembly38.fasta -I data/200603/140258_probability_E5_Z7.aligned.duplicate_marked.sorted.bam -intervals chr9:6943300-6943900 -L chr9:6745098-6746098 -L chr9:6688000-6689000             --likelihood-calculation-engine FlowBased             --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false -mbq 0             -O output/test3/baseline/test3-baseline.vcf             --bam-output output/test3/baseline/test3-baseline.bam                  
10:17:27.239 INFO  ProgressMeter - Starting traversal
10:17:27.239 INFO  ProgressMeter -        Current Locus  Elapsed Minutes     Regions Processed   Regions/Minute
10:17:28.879 INFO  ProgressMeter -         chr9:6943629              0.0                    22            804.9
10:17:28.879 INFO  ProgressMeter - Traversal complete. Processed 22 total regions in 0.0 minutes.
{'_inte