In [1]:
import io
import json
import os
import pprint
import shlex
import subprocess

import pandas as pd

# ---- operation switches ----
OP_ANAL = True        # run the HaplotypeCaller test calls at the bottom of the cell
OP_PRINT_CMD = True   # print each GATK command line before it is executed
OP_TVER_ONLY = ""     # when non-empty, only the test version with this name is run

# ---- tool and reference locations ----
GATK = "../../gatk"
OUT = "output"
REF = "../ref/Homo_sapiens_assembly38.fasta"

# ---- active dataset ----
READS = "data/chr20.flow.bam"
INTERVALS = "chr20:6000000-7000000"
TESTNAME = "test1"

# ---- alternate dataset (uncomment these three lines to use it instead) ----
#READS="data/200603/140258_probability_E5_Z7.aligned.duplicate_marked.sorted.bam"
#INTERVALS="chr9:1700000-2000000"
#TESTNAME="test2"

# functions
def HaplotypeCaller(intervals, test, tver, opts):

    # skip?
    if len(OP_TVER_ONLY) and OP_TVER_ONLY != tver:
        return None
    
    # prepare
    outFolder = "%s/%s/%s" % (OUT, test, tver)
    tpref = "%s/%s-%s" % (outFolder, test, tver)
    vcf = "%s.vcf" % tpref
    bam = "%s.bam" % tpref
    os.system("mkdir -p " + outFolder)

    # build main command
    cmd = """
            %s HaplotypeCaller -R %s -I %s -intervals %s \
            --likelihood-calculation-engine FlowBased \
            --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false \
            -O %s \
            --bam-output %s \
            %s \
    """ % (GATK, REF, READS, intervals, vcf, bam, opts)

    # execute it
    print("Running %s-%s over %s opts %s" % (test, tver, intervals, opts))
    lines = []
    if OP_PRINT_CMD:
        print(cmd)
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            lines.append(line)
            if "ProgressMeter" in line:
                print(line, end = '')

    # analyze vcf
    anal_cmd = "bcftools stats %s  | grep -E '^SN' | cut -f 3-4 | sed 's/number of //g' | sed 's/ /-/g' | sed 's/://g'" \
                    % vcf
    anal = {"_vcf": vcf, "_test": test, "_tver": tver, "_intervals": intervals, "_opts": opts}
    with subprocess.Popen(anal_cmd, stdout=subprocess.PIPE, shell=True) as proc:
        for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"):
            toks = line.split()
            anal[toks[0].lower()] = toks[1]
    pprint.pprint(anal)
    with io.open(tpref + "-anal.json", 'w') as file:
        file.write(json.dumps(anal))

    return anal

# Run every configured test version over the active dataset.
if OP_ANAL:
    # (test version name, extra HaplotypeCaller options), executed in this order
    for _tver, _opts in (
        ("baseline", ""),
        ("collapse12m", "--ultima-assembly-collapse-hmer-size 12 --ultima-flow-matrix-mods 10,12,11,12"),
        ("collapse10m", "--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods 11,10,12,10"),
    ):
        HaplotypeCaller(INTERVALS, TESTNAME, _tver, _opts)



Running test1-baseline over chr20:6000000-7000000 opts 

            ../../gatk HaplotypeCaller -R ../ref/Homo_sapiens_assembly38.fasta -I data/chr20.flow.bam -intervals chr20:6000000-7000000             --likelihood-calculation-engine FlowBased             --bam-writer-type ALL_POSSIBLE_HAPLOTYPES --debug-assembly false             -O output/test1/baseline/test1-baseline.vcf             --bam-output output/test1/baseline/test1-baseline.bam                  
14:08:52.312 INFO  ProgressMeter - Starting traversal
14:08:52.312 INFO  ProgressMeter -        Current Locus  Elapsed Minutes     Regions Processed   Regions/Minute
14:09:02.319 INFO  ProgressMeter -        chr20:6019757              0.2                   150            899.5
14:09:12.560 INFO  ProgressMeter -        chr20:6051124              0.3                   390           1155.7
14:09:22.776 INFO  ProgressMeter -        chr20:6077002              0.5                   580           1142.4
14:09:33.197 INFO  ProgressMeter - 

14:18:59.439 INFO  ProgressMeter -        chr20:6498539              3.3                  3750           1144.6
14:19:09.503 INFO  ProgressMeter -        chr20:6521775              3.4                  3930           1141.1
14:19:19.650 INFO  ProgressMeter -        chr20:6553551              3.6                  4150           1148.6
14:19:30.565 INFO  ProgressMeter -        chr20:6575914              3.8                  4320           1138.3
14:19:40.968 INFO  ProgressMeter -        chr20:6600149              4.0                  4490           1131.4
14:19:51.188 INFO  ProgressMeter -        chr20:6627893              4.1                  4690           1133.2
14:20:01.383 INFO  ProgressMeter -        chr20:6654286              4.3                  4870           1130.3
14:20:11.834 INFO  ProgressMeter -        chr20:6679165              4.5                  5050           1126.5
14:20:21.992 INFO  ProgressMeter -        chr20:6705050              4.7                  5260          

{'_intervals': 'chr20:6000000-7000000',
 '_opts': '--ultima-assembly-collapse-hmer-size 10 --ultima-flow-matrix-mods '
          '11,10,12,10',
 '_test': 'test1',
 '_tver': 'collapse10m',
 '_vcf': 'output/test1/collapse10m/test1-collapse10m.vcf',
 'indels': '1419',
 'mnps': '0',
 'multiallelic-sites': '13',
 'multiallelic-snp-sites': '1',
 'no-alts': '0',
 'others': '1',
 'records': '2835',
 'samples': '1',
 'snps': '1417'}
