From fa015fccd9064ea06915b0f8052b990726a2307f Mon Sep 17 00:00:00 2001
From: Sam Nicholls <sam.n@studio8media.co.uk>
Date: Fri, 27 Jan 2017 10:49:25 +0000
Subject: [PATCH] GOTTAGOFAST

* Fix deletion finding bug (seems pysam returns indel==0 is is_del=1 as
    the CIGAR parser "peeks" forward, rather than returning the current)
* Add a symbol for deletions, this appears to enhance performance on
    very messy data (lots of structucal variation that we might
    not want to ignore, on HIV-1, for example)
* Force a minimum reweight value, this might be a bad idea
---
 gretel/cmd.py    | 18 +++++++++++++++---
 gretel/gretel.py |  4 ++--
 gretel/util.py   | 48 +++++++++++++++++++++++-------------------------
 3 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/gretel/cmd.py b/gretel/cmd.py
index 0f326ae..b580ea2 100644
--- a/gretel/cmd.py
+++ b/gretel/cmd.py
@@ -122,18 +122,19 @@ def main():
         "G": [],
         "T": [],
         "N": [],
+        "-": [],
         "_": [],
         "total": [],
     }
     if not ARGS.quiet:
-        print "i\tpos\tgap\tA\tC\tG\tT\tN\t_\ttot"
+        print "i\tpos\tgap\tA\tC\tG\tT\tN\t-\t_\ttot"
         last_rev = 0
         for i in range(0, VCF_h["N"]+1):
             marginal = BAM_h["read_support"].get_counts_at(i)
             snp_rev = 0
             if i > 0:
                 snp_rev = VCF_h["snp_rev"][i-1]
-            print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
+            print "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d" % (
                 i,
                 snp_rev,
                 snp_rev - last_rev,
@@ -142,6 +143,7 @@ def main():
                 marginal.get("G", 0),
                 marginal.get("T", 0),
                 marginal.get("N", 0),
+                marginal.get("-", 0),
                 marginal.get("_", 0),
                 marginal.get("total", 0),
             )
@@ -150,6 +152,7 @@ def main():
             all_marginals["G"].append(marginal.get("G", 0))
             all_marginals["T"].append(marginal.get("T", 0))
             all_marginals["N"].append(marginal.get("N", 0))
+            all_marginals["-"].append(marginal.get("-", 0))
             all_marginals["_"].append(marginal.get("_", 0))
             all_marginals["total"].append(
                 marginal.get("total", 0)
@@ -165,6 +168,11 @@ def main():
         if init_path == None:
             break
         current_path = init_path
+
+        MIN_REMOVE = 0.01 # 1%
+        if init_min < MIN_REMOVE:
+            sys.stderr.write("[RWGT] Ratio %.10f too small, adjusting to %.3f\n" % (init_min, MIN_REMOVE))
+            init_min = MIN_REMOVE
         rw_magnitude = gretel.reweight_hansel_from_path(BAM_h["read_support"], init_path, init_min)
 
         #TODO Horribly inefficient.
@@ -192,7 +200,11 @@ def main():
             for j, mallele in enumerate(path[1:]):
                 snp_pos_on_master = VCF_h["snp_rev"][j]
                 try:
-                    seq[snp_pos_on_master-1] = mallele
+                    if mallele == "-":
+                        # It's a deletion, don't print a SNP
+                        seq[snp_pos_on_master-1] = ""
+                    else:
+                        seq[snp_pos_on_master-1] = mallele
                 except IndexError:
                     print path, len(seq), snp_pos_on_master-1
                     sys.exit(1)
diff --git a/gretel/gretel.py b/gretel/gretel.py
index 4d8280a..fa67598 100644
--- a/gretel/gretel.py
+++ b/gretel/gretel.py
@@ -57,7 +57,7 @@ def reweight_hansel_from_path(hansel, path, ratio):
                     t_i = i
                     t_j = j
                 size += hansel.reweight_observation(path[t_i], path[t_j], t_i, t_j, ratio)
-    sys.stderr.write("[REWT] Ratio %.3f, Removed %.1f\n" % (ratio, size))
+    sys.stderr.write("[RWGT] Ratio %.3f, Removed %.1f\n" % (ratio, size))
     return size
 
 
@@ -174,7 +174,7 @@ def process_bam(vcf_handler, bam_path, contig_name, start_pos, end_pos, L, use_e
 
 
     meta = util.load_from_bam(None, bam_path, contig_name, start_pos, end_pos, vcf_handler, use_end_sentinels, n_threads)
-    hansel = Hansel(meta["hansel"], ['A', 'C', 'G', 'T', 'N', "_"], ['N', "_"], L=L)
+    hansel = Hansel(meta["hansel"], ['A', 'C', 'G', 'T', 'N', "-", "_"], ['N', "_"], L=L)
 
     if hansel.L == 0:
         hansel.L = meta["L"]
diff --git a/gretel/util.py b/gretel/util.py
index 7c1cf6d..8b0e706 100644
--- a/gretel/util.py
+++ b/gretel/util.py
@@ -84,8 +84,8 @@ def load_from_bam(h, bam_path, target_contig, start_pos, end_pos, vcf_handler, u
     meta = {}
 
 
-    hansel = np.frombuffer(Array(ctypes.c_float, 6 * 6 * (vcf_handler["N"]+2) * (vcf_handler["N"]+2), lock=False), dtype=ctypes.c_float)
-    hansel = hansel.reshape(6, 6, vcf_handler["N"]+2, vcf_handler["N"]+2)
+    hansel = np.frombuffer(Array(ctypes.c_float, 7 * 7 * (vcf_handler["N"]+2) * (vcf_handler["N"]+2), lock=False), dtype=ctypes.c_float)
+    hansel = hansel.reshape(7, 7, vcf_handler["N"]+2, vcf_handler["N"]+2)
     hansel.fill(0.0)
 
     import random
@@ -110,13 +110,13 @@ def progress_worker(progress_q, n_workers, slices, total_snps):
         return (slices, total_snps)
 
     def bam_worker(bam_q, progress_q, worker_i):
+        symbols = ['A', 'C', 'G', 'T', 'N', '-', '_']
+        symbols_d = {symbol: i for i, symbol in enumerate(symbols)}
         def __symbol_num(symbol):
-            symbols = ['A', 'C', 'G', 'T', 'N', '_']
-            #TODO Catch potential IndexError
+            #TODO Catch potential KeyError
             #TODO Generic mechanism for casing (considering non-alphabetically named states, too...)
-            return symbols.index(symbol)
+            return symbols_d[symbol]
 
-        symbols = ['A', 'C', 'G', 'T', 'N', '_']
         unsymbols = ['_', 'N']
         worker = worker_i
 
@@ -135,7 +135,6 @@ def __symbol_num(symbol):
                 })
                 break
 
-
             reads = {}
             for p_col in bam.pileup(reference=target_contig, start=work_block["start"]-1, end=work_block["end"]):
                 if p_col.reference_pos + 1 > end_pos:
@@ -165,33 +164,23 @@ def __symbol_num(symbol):
                                 # Read ends before the start_pos
                                 continue
                             LEFTMOST_1pos = start_pos
+                            #continue
                     else:
                         # This read begins before the start of the current (non-0) block
                         # and will have already been covered by the block that preceded it
                         if LEFTMOST_1pos < work_block["start"]:
                             continue
 
-                    if curr_read_name not in reads:
-                        reads[curr_read_name] = {
-                            "rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]),
-                            "seq": [],
-                            "quals": [],
-                            "refs_1pos": [],
-                            "read_variants_0pos": [],
-                        }
-
-
                     ## Read ends after the end_pos of interest, so clip it
                     #if RIGHTMOST_1pos > work_block["region_end"]:
                     #    RIGHTMOST_1pos = work_block["region_end"]
 
                     sequence = None
                     qual = None
-                    if not p_read.query_position:
-                        # qpos is None for deletion and reference skips
+                    if p_read.is_del:
                         # TODO Not sure about how to estimate quality of deletion?
-                        sequence = "_" * abs(p_read.indel)
-                        qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * abs(p_read.indel)
+                        sequence = "-" * (abs(p_read.indel) + 1)
+                        qual = p_read.alignment.query_qualities[p_read.query_position_or_next] * (abs(p_read.indel) + 1)
                     elif p_read.indel > 0:
                         sequence = p_read.alignment.query_sequence[p_read.query_position : p_read.query_position + p_read.indel + 1]
                         qual = p_read.alignment.query_qualities[p_read.query_position : p_read.query_position + p_read.indel + 1]
@@ -203,6 +192,14 @@ def __symbol_num(symbol):
                         print("Help!")
                         continue
 
+                    if curr_read_name not in reads:
+                        reads[curr_read_name] = {
+                            "rank": np.sum(vcf_handler["region"][1 : LEFTMOST_1pos]),
+                            "seq": [],
+                            "quals": [],
+                            "refs_1pos": [],
+                            "read_variants_0pos": [],
+                        }
                     reads[curr_read_name]["seq"].append(sequence)
                     reads[curr_read_name]["quals"].append(qual)
                     reads[curr_read_name]["refs_1pos"].append(p_col.reference_pos+1)
@@ -213,9 +210,9 @@ def __symbol_num(symbol):
 
             num_reads = len(reads)
             for qi, qname in enumerate(reads):
-                progress_q.put({"pos": num_reads-qi, "worker_i": worker_i})
+                progress_q.put({"pos": num_reads-(qi+1), "worker_i": worker_i})
 
-                if not len(reads[qname]["seq"]) > 0:
+                if not len(reads[qname]["seq"]) > 1:
                     # Ignore reads without evidence
                     continue
                 slices += 1
@@ -223,9 +220,9 @@ def __symbol_num(symbol):
                 rank = reads[qname]["rank"]
                 support_len = len(reads[qname]["seq"])
 
-                # TODO Still not really sure how to handle indels, our matrix is designed for single symbols... :<
+                # TODO Still not really sure how to handle insertions, our matrix is a bit size-inefficient atm
                 support_seq = "".join([b[0] for b in reads[qname]["seq"]])
-                covered_snps += len(support_seq.replace("N", "").replace("-", ""))
+                covered_snps += len(support_seq.replace("N", "").replace("_", ""))
 
                 # For each position in the supporting sequence (that is, each covered SNP)
                 for i in range(0, support_len):
@@ -276,6 +273,7 @@ def __symbol_num(symbol):
 
     # Queue the wokers
     # TODO Evenly divide, but in future, consider the distn
+    # TODO Also consider in general block0 has more work to do
     window_l = int((end_pos - start_pos) / float(n_threads))
     for window_i, window_pos in enumerate(range(start_pos, end_pos+1, window_l)):
         bam_queue.put({