Permalink
Browse files

GOTTAGOFAST

* Fix deletion finding bug (seems pysam returns indel==0 is is_del=1 as
    the CIGAR parser "peeks" forward, rather than returning the current)
* Add a symbol for deletions, this appears to enhance performance on
    very messy data (lots of structucal variation that we might
    not want to ignore, on HIV-1, for example)
* Force a minimum reweight value, this might be a bad idea
  • Loading branch information...
SamStudio8 committed Jan 27, 2017
1 parent 6676135 commit fa015fccd9064ea06915b0f8052b990726a2307f
Showing with 40 additions and 30 deletions.
  1. +15 −3 gretel/cmd.py
  2. +2 −2 gretel/gretel.py
  3. +23 −25 gretel/util.py
@@ -122,18 +122,19 @@ def main():
"G": [],
"T": [],
"N": [],
"-": [],
"_": [],
"total": [],
}
if not ARGS.quiet:
print "i\tpos\tgap\tA\tC\tG\tT\tN\t_\ttot"
print "i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot"
last_rev = 0
for i in range(0, VCF_h["N"]+1):
marginal = BAM_h["read_support"].get_counts_at(i)
snp_rev = 0
if i > 0:
snp_rev = VCF_h["snp_rev"][i-1]
print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
i,
snp_rev,
snp_rev - last_rev,
@@ -142,6 +143,7 @@ def main():
marginal.get("G", 0),
marginal.get("T", 0),
marginal.get("N", 0),
marginal.get("-", 0),
marginal.get("_", 0),
marginal.get("total", 0),
)
@@ -150,6 +152,7 @@ def main():
all_marginals["G"].append(marginal.get("G", 0))
all_marginals["T"].append(marginal.get("T", 0))
all_marginals["N"].append(marginal.get("N", 0))
all_marginals["-"].append(marginal.get("-", 0))
all_marginals["_"].append(marginal.get("_", 0))
all_marginals["total"].append(
marginal.get("total", 0)
@@ -165,6 +168,11 @@ def main():
if init_path == None:
break
current_path = init_path

MIN_REMOVE = 0.01 # 1%
if init_min < MIN_REMOVE:
sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE))
init_min = MIN_REMOVE
rw_magnitude = gretel.reweight_hansel_from_path(BAM_h["read_support"], init_path, init_min)

#TODO Horribly inefficient.
@@ -192,7 +200,11 @@ def main():
for j, mallele in enumerate(path[1:]):
snp_pos_on_master = VCF_h["snp_rev"][j]
try:
seq[snp_pos_on_master-1] = mallele
if mallele == "-":
# It's a deletion, don't print a SNP
seq[snp_pos_on_master-1] = ""
else:
seq[snp_pos_on_master-1] = mallele
except IndexError:
print path, len(seq), snp_pos_on_master-1
sys.exit(1)
@@ -57,7 +57,7 @@ def reweight_hansel_from_path(hansel, path, ratio):
t_i = i
t_j = j
size += hansel.reweight_observation(path[t_i], path[t_j], t_i, t_j, ratio)
sys.stderr.write("[REWT] Ratio %.3f, Removed %.1f\n" % (ratio, size))
sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size))
return size


@@ -174,7 +174,7 @@ def process_bam(vcf_handler, bam_path, contig_name, start_pos, end_pos, L, use_e


meta = util.load_from_bam(None, bam_path, contig_name, start_pos, end_pos, vcf_handler, use_end_sentinels, n_threads)
hansel = Hansel(meta["hansel"], ['A', 'C', 'G', 'T', 'N', "_"], ['N', "_"], L=L)
hansel = Hansel(meta["hansel"], ['A', 'C', 'G', 'T', 'N', "-", "_"], ['N', "_"], L=L)

if hansel.L == 0:
hansel.L = meta["L"]
@@ -84,8 +84,8 @@ def load_from_bam(h, bam_path, target_contig, start_pos, end_pos, vcf_handler, u
meta = {}


hansel = np.frombuffer(Array(ctypes.c_float, 6 * 6 * (vcf_handler["N"]+2) * (vcf_handler["N"]+2), lock=False), dtype=ctypes.c_float)
hansel = hansel.reshape(6, 6, vcf_handler["N"]+2, vcf_handler["N"]+2)
hansel = np.frombuffer(Array(ctypes.c_float, 7 * 7 * (vcf_handler["N"]+2) * (vcf_handler["N"]+2), lock=False), dtype=ctypes.c_float)
hansel = hansel.reshape(7, 7, vcf_handler["N"]+2, vcf_handler["N"]+2)
hansel.fill(0.0)

import random
@@ -110,13 +110,13 @@ def progress_worker(progress_q, n_workers, slices, total_snps):
return (slices, total_snps)

def bam_worker(bam_q, progress_q, worker_i):
symbols = ['A', 'C', 'G', 'T', 'N', '-', '_']
symbols_d = {symbol: i for i, symbol in enumerate(symbols)}
def __symbol_num(symbol):
symbols = ['A', 'C', 'G', 'T', 'N', '_']
#TODO Catch potential IndexError
#TODO Catch potential KeyError
#TODO Generic mechanism for casing (considering non-alphabetically named states, too...)
return symbols.index(symbol)
return symbols_d[symbol]

symbols = ['A', 'C', 'G', 'T', 'N', '_']
unsymbols = ['_', 'N']
worker = worker_i

@@ -135,7 +135,6 @@ def __symbol_num(symbol):
})
break


reads = {}
for p_col in bam.pileup(reference=target_contig, start=work_block["start"]-1, end=work_block["end"]):
if p_col.reference_pos + 1 > end_pos:
@@ -165,33 +164,23 @@ def __symbol_num(symbol):
# Read ends before the start_pos
continue
LEFTMOST_1pos = start_pos
#continue
else:
# This read begins before the start of the current (non-0) block
# and will have already been covered by the block that preceded it
if LEFTMOST_1pos < work_block["start"]:
continue

if curr_read_name not in reads:
reads[curr_read_name] = {
"rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]),
"seq": [],
"quals": [],
"refs_1pos": [],
"read_variants_0pos": [],
}


## Read ends after the end_pos of interest, so clip it
#if RIGHTMOST_1pos > work_block["region_end"]:
# RIGHTMOST_1pos = work_block["region_end"]

sequence = None
qual = None
if not p_read.query_position:
# qpos is None for deletion and reference skips
if p_read.is_del:
# TODO Not sure about how to estimate quality of deletion?
sequence = "_" * abs(p_read.indel)
qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * abs(p_read.indel)
sequence = "-" * (abs(p_read.indel) + 1)
qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * (abs(p_read.indel) + 1)
elif p_read.indel > 0:
sequence = p_read.alignment.query_sequence[p_read.query_position : p_read.query_position + p_read.indel + 1]
qual = p_read.alignment.query_qualities[p_read.query_position : p_read.query_position + p_read.indel + 1]
@@ -203,6 +192,14 @@ def __symbol_num(symbol):
print("Help!")
continue

if curr_read_name not in reads:
reads[curr_read_name] = {
"rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]),
"seq": [],
"quals": [],
"refs_1pos": [],
"read_variants_0pos": [],
}
reads[curr_read_name]["seq"].append(sequence)
reads[curr_read_name]["quals"].append(qual)
reads[curr_read_name]["refs_1pos"].append(p_col.reference_pos+1)
@@ -213,19 +210,19 @@ def __symbol_num(symbol):

num_reads = len(reads)
for qi, qname in enumerate(reads):
progress_q.put({"pos": num_reads-qi, "worker_i": worker_i})
progress_q.put({"pos": num_reads-(qi+1), "worker_i": worker_i})

if not len(reads[qname]["seq"]) > 0:
if not len(reads[qname]["seq"]) > 1:
# Ignore reads without evidence
continue
slices += 1

rank = reads[qname]["rank"]
support_len = len(reads[qname]["seq"])

# TODO Still not really sure how to handle indels, our matrix is designed for single symbols... :<
# TODO Still not really sure how to handle insertions, our matrix is a bit size-inefficient atm
support_seq = "".join([b[0] for b in reads[qname]["seq"]])
covered_snps += len(support_seq.replace("N", "").replace("-", ""))
covered_snps += len(support_seq.replace("N", "").replace("_", ""))

# For each position in the supporting sequence (that is, each covered SNP)
for i in range(0, support_len):
@@ -276,6 +273,7 @@ def __symbol_num(symbol):

# Queue the wokers
# TODO Evenly divide, but in future, consider the distn
# TODO Also consider in general block0 has more work to do
window_l = int((end_pos - start_pos) / float(n_threads))
for window_i, window_pos in enumerate(range(start_pos, end_pos+1, window_l)):
bam_queue.put({

0 comments on commit fa015fc

Please sign in to comment.