From fa015fccd9064ea06915b0f8052b990726a2307f Mon Sep 17 00:00:00 2001 From: Sam Nicholls Date: Fri, 27 Jan 2017 10:49:25 +0000 Subject: [PATCH] GOTTAGOFAST * Fix deletion finding bug (seems pysam returns indel==0 is is_del=1 as the CIGAR parser "peeks" forward, rather than returning the current) * Add a symbol for deletions, this appears to enhance performance on very messy data (lots of structucal variation that we might not want to ignore, on HIV-1, for example) * Force a minimum reweight value, this might be a bad idea --- gretel/cmd.py | 18 +++++++++++++++--- gretel/gretel.py | 4 ++-- gretel/util.py | 48 +++++++++++++++++++++++------------------------- 3 files changed, 40 insertions(+), 30 deletions(-) diff --git a/gretel/cmd.py b/gretel/cmd.py index 0f326ae..b580ea2 100644 --- a/gretel/cmd.py +++ b/gretel/cmd.py @@ -122,18 +122,19 @@ def main(): "G": [], "T": [], "N": [], + "-": [], "_": [], "total": [], } if not ARGS.quiet: - print "i\tpos\tgap\tA\tC\tG\tT\tN\t_\ttot" + print "i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot" last_rev = 0 for i in range(0, VCF_h["N"]+1): marginal = BAM_h["read_support"].get_counts_at(i) snp_rev = 0 if i > 0: snp_rev = VCF_h["snp_rev"][i-1] - print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % ( + print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % ( i, snp_rev, snp_rev - last_rev, @@ -142,6 +143,7 @@ def main(): marginal.get("G", 0), marginal.get("T", 0), marginal.get("N", 0), + marginal.get("-", 0), marginal.get("_", 0), marginal.get("total", 0), ) @@ -150,6 +152,7 @@ def main(): all_marginals["G"].append(marginal.get("G", 0)) all_marginals["T"].append(marginal.get("T", 0)) all_marginals["N"].append(marginal.get("N", 0)) + all_marginals["-"].append(marginal.get("-", 0)) all_marginals["_"].append(marginal.get("_", 0)) all_marginals["total"].append( marginal.get("total", 0) @@ -165,6 +168,11 @@ def main(): if init_path == None: break current_path = init_path + + MIN_REMOVE = 0.01 # 1% + if init_min < MIN_REMOVE: + sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE)) + init_min = MIN_REMOVE rw_magnitude = gretel.reweight_hansel_from_path(BAM_h["read_support"], init_path, init_min) #TODO Horribly inefficient. @@ -192,7 +200,11 @@ def main(): for j, mallele in enumerate(path[1:]): snp_pos_on_master = VCF_h["snp_rev"][j] try: - seq[snp_pos_on_master-1] = mallele + if mallele == "-": + # It's a deletion, don't print a SNP + seq[snp_pos_on_master-1] = "" + else: + seq[snp_pos_on_master-1] = mallele except IndexError: print path, len(seq), snp_pos_on_master-1 sys.exit(1) diff --git a/gretel/gretel.py b/gretel/gretel.py index 4d8280a..fa67598 100644 --- a/gretel/gretel.py +++ b/gretel/gretel.py @@ -57,7 +57,7 @@ def reweight_hansel_from_path(hansel, path, ratio): t_i = i t_j = j size += hansel.reweight_observation(path[t_i], path[t_j], t_i, t_j, ratio) - sys.stderr.write("[REWT] Ratio %.3f, Removed %.1f\n" % (ratio, size)) + sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size)) return size @@ -174,7 +174,7 @@ def process_bam(vcf_handler, bam_path, contig_name, start_pos, end_pos, L, use_e meta = util.load_from_bam(None, bam_path, contig_name, start_pos, end_pos, vcf_handler, use_end_sentinels, n_threads) - hansel = Hansel(meta["hansel"], ['A', 'C', 'G', 'T', 'N', "_"], ['N', "_"], L=L) + hansel = Hansel(meta["hansel"], ['A', 'C', 'G', 'T', 'N', "-", "_"], ['N', "_"], L=L) if hansel.L == 0: hansel.L = meta["L"] diff --git a/gretel/util.py b/gretel/util.py index 7c1cf6d..8b0e706 100644 --- a/gretel/util.py +++ b/gretel/util.py @@ -84,8 +84,8 @@ def load_from_bam(h, bam_path, target_contig, start_pos, end_pos, vcf_handler, u meta = {} - hansel = np.frombuffer(Array(ctypes.c_float, 6 * 6 * (vcf_handler["N"]+2) * (vcf_handler["N"]+2), lock=False), dtype=ctypes.c_float) - hansel = hansel.reshape(6, 6, vcf_handler["N"]+2, vcf_handler["N"]+2) + hansel = np.frombuffer(Array(ctypes.c_float, 7 * 7 * (vcf_handler["N"]+2) * (vcf_handler["N"]+2), lock=False), dtype=ctypes.c_float) + hansel = hansel.reshape(7, 7, vcf_handler["N"]+2, vcf_handler["N"]+2) hansel.fill(0.0) import random @@ -110,13 +110,13 @@ def progress_worker(progress_q, n_workers, slices, total_snps): return (slices, total_snps) def bam_worker(bam_q, progress_q, worker_i): + symbols = ['A', 'C', 'G', 'T', 'N', '-', '_'] + symbols_d = {symbol: i for i, symbol in enumerate(symbols)} def __symbol_num(symbol): - symbols = ['A', 'C', 'G', 'T', 'N', '_'] - #TODO Catch potential IndexError + #TODO Catch potential KeyError #TODO Generic mechanism for casing (considering non-alphabetically named states, too...) - return symbols.index(symbol) + return symbols_d[symbol] - symbols = ['A', 'C', 'G', 'T', 'N', '_'] unsymbols = ['_', 'N'] worker = worker_i @@ -135,7 +135,6 @@ def __symbol_num(symbol): }) break - reads = {} for p_col in bam.pileup(reference=target_contig, start=work_block["start"]-1, end=work_block["end"]): if p_col.reference_pos + 1 > end_pos: @@ -165,33 +164,23 @@ def __symbol_num(symbol): # Read ends before the start_pos continue LEFTMOST_1pos = start_pos + #continue else: # This read begins before the start of the current (non-0) block # and will have already been covered by the block that preceded it if LEFTMOST_1pos < work_block["start"]: continue - if curr_read_name not in reads: - reads[curr_read_name] = { - "rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]), - "seq": [], - "quals": [], - "refs_1pos": [], - "read_variants_0pos": [], - } - - ## Read ends after the end_pos of interest, so clip it #if RIGHTMOST_1pos > work_block["region_end"]: # RIGHTMOST_1pos = work_block["region_end"] sequence = None qual = None - if not p_read.query_position: - # qpos is None for deletion and reference skips + if p_read.is_del: # TODO Not sure about how to estimate quality of deletion? - sequence = "_" * abs(p_read.indel) - qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * abs(p_read.indel) + sequence = "-" * (abs(p_read.indel) + 1) + qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * (abs(p_read.indel) + 1) elif p_read.indel > 0: sequence = p_read.alignment.query_sequence[p_read.query_position : p_read.query_position + p_read.indel + 1] qual = p_read.alignment.query_qualities[p_read.query_position : p_read.query_position + p_read.indel + 1] @@ -203,6 +192,14 @@ def __symbol_num(symbol): print("Help!") continue + if curr_read_name not in reads: + reads[curr_read_name] = { + "rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]), + "seq": [], + "quals": [], + "refs_1pos": [], + "read_variants_0pos": [], + } reads[curr_read_name]["seq"].append(sequence) reads[curr_read_name]["quals"].append(qual) reads[curr_read_name]["refs_1pos"].append(p_col.reference_pos+1) @@ -213,9 +210,9 @@ def __symbol_num(symbol): num_reads = len(reads) for qi, qname in enumerate(reads): - progress_q.put({"pos": num_reads-qi, "worker_i": worker_i}) + progress_q.put({"pos": num_reads-(qi+1), "worker_i": worker_i}) - if not len(reads[qname]["seq"]) > 0: + if not len(reads[qname]["seq"]) > 1: # Ignore reads without evidence continue slices += 1 @@ -223,9 +220,9 @@ def __symbol_num(symbol): rank = reads[qname]["rank"] support_len = len(reads[qname]["seq"]) - # TODO Still not really sure how to handle indels, our matrix is designed for single symbols... :< + # TODO Still not really sure how to handle insertions, our matrix is a bit size-inefficient atm support_seq = "".join([b[0] for b in reads[qname]["seq"]]) - covered_snps += len(support_seq.replace("N", "").replace("-", "")) + covered_snps += len(support_seq.replace("N", "").replace("_", "")) # For each position in the supporting sequence (that is, each covered SNP) for i in range(0, support_len): @@ -276,6 +273,7 @@ def __symbol_num(symbol): # Queue the wokers # TODO Evenly divide, but in future, consider the distn + # TODO Also consider in general block0 has more work to do window_l = int((end_pos - start_pos) / float(n_threads)) for window_i, window_pos in enumerate(range(start_pos, end_pos+1, window_l)): bam_queue.put({