
Originally named 8.5_parse-vcf.ipynb

Parse various secondary outputs and make them available to the summary documents.

- Parse VCF:
    Takes the VCF file for a sample and makes a human-readable output.
    ["Don't write home-brewed VCF parsing scripts. It never ends well."](https://gatkforums.broadinstitute.org/gatk/discussion/1268/what-is-a-vcf-and-how-should-i-interpret-it)

- Parse fusion results

- Parse FLT3-ITD results

In [None]:
import os
import csv
import glob
import json
import logging
from distutils.version import LooseVersion

# Load the run configuration; `c` is read by the rest of this notebook.
with open("conf.json", "r") as conf:
    c = json.load(conf)

sample_id = c["sample_id"]
print("Running on sample: {}".format(sample_id))

# The logging settings are taken from the config and passed through
# unchanged as keyword arguments.
logging.basicConfig(**c["info"]["logging_config"])

def and_log(s):
    """Log ``s`` at INFO level and return it unchanged.

    Returning the message lets a caller log it and print it in a single
    expression.
    """
    logging.info(s)
    return s

# Accumulates every parsed result; dumped to JSON at the end of the notebook.
j = {}
# Single-argument print is parenthesised so it behaves the same on
# Python 2 and Python 3.
print(and_log("Running Parse Secondary Results: VCF, Fusion, FLT3-ITD Detection..."))

In [None]:
# Helper functions

def locate_secondary_file(basedir, filepath):
    """Find ``filepath`` inside the most recent directory matching ``basedir``.

    ``basedir`` is a glob pattern, resolved relative to
    ``c["dir"]["secondary"]``; it must include "-*" at the end for glob to
    find the versioned directories. Matches are sorted with ``LooseVersion``
    and the last one is taken as the most recent version.

    Returns the path of the file within that directory, or "" when no
    directory matches or the file is not readable. In both failure cases the
    reason is logged and printed.
    """
    all_vardirs = sorted(glob.glob(os.path.join(c["dir"]["secondary"], basedir)),
                         key=LooseVersion)

    if not all_vardirs:
        print(and_log("No secondary output dir {} located for this sample.".format(basedir)))
        return ""

    # Use the most recent directory if there is more than one.
    result = os.path.join(all_vardirs[-1], filepath)
    if not os.access(result, os.R_OK):  # Check that the file is present and readable
        print(and_log("Secondary output dir for {} was located but the desired file {} was not readable.".format(
            basedir, filepath)))
        return ""
    return result


def parse_info(info):
    """Extract gene, effect type and amino-acid change(s) from a VCF INFO string.

    The INFO field is a ";"-delimited list of k=v items (a Flag item may be
    present with no "=" sign, e.g. ``k=v;k;k=v``). Everything reported here
    comes from the EFF sub-field, whose format is ``k(v|v|v),k(v|v|v)``.

    Returns a dict with the keys:
      "gene"      - item 5 of the first EFF entry
      "type"      - item 1 of the first EFF entry
      "aa change" - item 3 of the first EFF entry, followed by item 3 of every
                    later entry with the same gene and type, de-duplicated and
                    joined with "|" in order of first appearance

    Raises KeyError when INFO has no EFF item, and IndexError when an EFF
    entry that is inspected has too few "|"-separated items.
    """
    # Split each item on its first "=" only, so a value that itself contains
    # "=" is kept whole. Flag items (no "=") are dropped; we don't need them.
    info_dict = {}
    for item in info.split(";"):
        key, sep, value = item.partition("=")
        if sep:
            info_dict[key] = value

    # Splitting an EFF entry on "|" leaves the first item as "k(v" and the
    # last as "v)"; neither of those positions is used below.
    eff_items = [entry.split("|") for entry in info_dict["EFF"].split(",")]

    # We report the gene and type from the first EFF item only.
    first_eff = eff_items[0]
    gene = first_eff[5]
    eff_type = first_eff[1]

    # A list (rather than a set) keeps the output order deterministic.
    found_aa_changes = [first_eff[3]]
    for eff in eff_items[1:]:
        if gene == eff[5] and eff_type == eff[1] and eff[3] not in found_aa_changes:
            found_aa_changes.append(eff[3])

    return {
        "gene": gene,
        "type": eff_type,
        "aa change": "|".join(found_aa_changes),
    }

def parse_unknown(uk):
    """Pick the genotype and read counts out of the VCF 'unknown' column.

    ``uk`` is split on ":" and the items at positions 0, 2 and 4 are returned
    under the keys "genotype", "ref reads" and "alt reads".
    NOTE(review): the positions are hard-coded, so this presumably relies on
    a fixed FORMAT layout - confirm against the VCF producer.

    Raises IndexError when ``uk`` has fewer than five ":"-separated items.
    """
    fields = uk.split(":")
    genotype, ref_reads, alt_reads = fields[0], fields[2], fields[4]
    return {
        "genotype": genotype,
        "ref reads": ref_reads,
        "alt reads": alt_reads,
    }
    
def extra_fields(fmt, unknown):
    """Pair each FORMAT key with the value at the same position in 'unknown'.

    Both arguments are ":"-delimited strings. When they hold different
    numbers of items, the surplus items of the longer one are ignored.
    """
    names = fmt.split(":")
    values = unknown.split(":")
    return {name: value for name, value in zip(names, values)}

In [None]:
# FLT3-ITD report

j["flt3itd_file"] = locate_secondary_file("jpfeil-jfkm-*", "FLT3-ITD.report")
j["flt3itd_events"] = []

# Only these columns of the tab-delimited report are kept for each event.
j["flt3itd_keys"] = ["Type","Abnormal","Normal"]
if not j["flt3itd_file"]:
    # Parenthesised so the line behaves the same on Python 2 and Python 3.
    print("No FLT3-ITD report file present!")
else:
    with open(j["flt3itd_file"], "r") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            j["flt3itd_events"].append({k: row[k] for k in j["flt3itd_keys"]})

# Bare expression: shows the parsed events as the notebook cell's output.
j["flt3itd_events"]

In [None]:
# Fusions
j["fusion_file"] = locate_secondary_file("ucsctreehouse-fusion-*", "star-fusion-gene-list-filtered.final")
j["fusions"] = []

# Save only selected fields. The names are given as they appear in the file's
# header row; the "#" is stripped from the names that get stored.
fusion_keys = ["#FusionName","JunctionReadCount","SpanningFragCount"]
# Build a real list (not a lazy map object) so it can be serialised to JSON.
j["fusion_keys"] = [key.replace("#", "") for key in fusion_keys]

if not j["fusion_file"]:
    # Parenthesised so the line behaves the same on Python 2 and Python 3.
    print("No fusion file present!")
else:
    with open(j["fusion_file"], "r") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            j["fusions"].append({k.replace("#", ""): row[k] for k in fusion_keys})

# Bare expression: shows the parsed fusions as the notebook cell's output.
j["fusions"]

Modified from https://github.com/UCSC-Treehouse/analysis-methods/blob/master/script/parse_vcf.py

In [None]:
# Variant Call
j["vcf_file"] = locate_secondary_file("ucsctreehouse-mini-var-call-*", "mini.ann.vcf")
print(j["vcf_file"])

if not j["vcf_file"]:
    # NOTE(review): in this branch "variants", "primary_fields" and
    # "extra_fields" are never added to j - confirm downstream copes with that.
    print("No VCF file present!")
else:
    variants = []

    # FORMAT string of the first data row; every later row must match it.
    default_FORMAT = ""
    # Column names for the data rows; the tenth column is called "unknown" here.
    vcf_fields = ["CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT","unknown"]

    with open(j["vcf_file"], "r") as f:
        # Lines starting with "#" are filtered out before csv sees them, so the
        # column names have to be supplied explicitly.
        reader = csv.DictReader((row for row in f if not row.startswith('#')),
                                fieldnames=vcf_fields,
                                delimiter="\t")
        # Get the VCF fields
        for row in reader:
            if not default_FORMAT:
                default_FORMAT = row["FORMAT"]
            elif row["FORMAT"] != default_FORMAT:
                print("ERROR! FORMAT field changed from expected!")
                print("Got: {} Expected: {}".format(row["FORMAT"], default_FORMAT))
                raise IOError("Unexpected change to FORMAT field in VCF file.")

            # Later updates overwrite earlier keys of the same name, so the
            # order of these steps is significant.
            row_details = parse_info(row["INFO"])
            row_details.update(parse_unknown(row["unknown"]))
            row_details["quality"] = row["QUAL"]
            row_details["chr"] = row["CHROM"]
            row_details["pos"] = row["POS"]
            row_details["ref"] = row["REF"]
            row_details["alt"] = row["ALT"]
            row_details.update(extra_fields(row["FORMAT"], row["unknown"]))

            variants.append(row_details)

    field_order = "gene,aa change,type,genotype,ref reads,alt reads,quality,chr,pos,ref,alt".split(",")
    # With no data rows default_FORMAT is still "", and "".split(":") would
    # add a bogus empty-string field name; only extend when a FORMAT was seen.
    if default_FORMAT:
        field_order += default_FORMAT.split(":")

    j["variants"] = variants
    # And also store the field_order so it's easier to display downstream
    j["primary_fields"] = field_order[0:3]
    j["extra_fields"] = field_order[3:]

    ### Print the output ###
    # First, print just the INFO
    print("\t".join(field_order[0:3]))
    for item in variants:
        print("\t".join([item[x] for x in field_order[0:3]]))

    # Then print the whole thing, after a blank separator line
    print("")
    print("\t".join(field_order))
    for item in variants:
        print("\t".join([item[x] for x in field_order]))

In [None]:
# Write everything collected in `j` to the JSON path configured under c["json"]["8.5"].
with open(c["json"]["8.5"], "w") as jsonfile:
    json.dump(j, jsonfile, indent=2)

# Parenthesised so the line behaves the same on Python 2 and Python 3.
print(and_log("Parse secondary results - Done."))