In [10]:
# sa walker - extract supplementary alignment regions from SA tags

In [23]:
import os
import sys

# args
print(sys.argv)
if "sa_walker" in sys.argv[0] or "stdin" in sys.argv[0]:
    if len(sys.argv) != 6:
        print("usage: " + sys.argv[0] + " <input-cram> <in-bed> <ref-fasta> <edge-threshold> <out-bed>\n")
        sys.exit(-1)
    # commandline invocation
    IN_CRAM = sys.argv[1]
    IN_BED = sys.argv[2]
    REF_FASTA = sys.argv[3]
    EDGE_THRESHOLD = int(sys.argv[4])
    OUT_BED = sys.argv[5]
else:
    IN_CRAM = os.path.expanduser("~/tmp/data/jump_align/chr22.cram")
    IN_BED = os.path.expanduser("~/tmp/data/jump_align/chr22_all.bed")
    REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")
    EDGE_THRESHOLD = 0
    OUT_BED = "/tmp/sa_walker." + str(os.getpid()) + ".bed"

QUANTIZE=20

['/Users/drorkessler/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/Users/drorkessler/Library/Jupyter/runtime/kernel-01c5df6e-55eb-41a2-ad16-e5315716b698.json']


In [24]:
from itertools import groupby

def query_len(cigar_string):
    read_consuming_ops = ("M", "I", "S", "=", "X")
    result = 0
    cig_iter = groupby(cigar_string, lambda chr: chr.isdigit())
    for _, length_digits in cig_iter:
        length = int(''.join(length_digits))
        op = next(next(cig_iter)[1])
        if op in read_consuming_ops:
            result += length
    return result

In [25]:
import networkx as nx
graph = nx.DiGraph()

def process_gap(lchrom, lloc, rchrom, rloc):
    #print("process_gap", [lchrom, lloc, rchrom, rloc])
    if not "_" in lchrom + rchrom:
        lnode = (lchrom, lloc - lloc % QUANTIZE + int(QUANTIZE / 2))
        rnode = (rchrom, rloc - rloc % QUANTIZE + int(QUANTIZE / 2))
        if not graph.has_node(lnode):
            graph.add_node(lnode)
        if not graph.has_node(rnode):
            graph.add_node(rnode)
        if graph.has_edge(lnode, rnode):
            data = graph.get_edge_data(lnode, rnode)
            data["c"] = data["c"] + 1
        else:
            graph.add_edge(lnode, rnode, c=1)


In [26]:

def process_sa(read):

    # current alignment represents the left side of the gap
    lchrom = read.reference_name
    lloc = read.reference_end

    sa_all = read.get_tag("SA").split(";")
    if sa_all[-1] == '':
        sa_all = sa_all[:-1]
    #print("sa_all", sa_all)
    for sa_index in range(len(sa_all)):
        toks = sa_all[sa_index].split(",")

        # establish right side of gap
        #print("sa_index", sa_index, "toks", toks)
        rchrom = toks[0]
        rloc = int(toks[1])

        # register this gap
        process_gap(lchrom, lloc, rchrom, rloc)

        # establish next left side, if there is one
        if sa_index + 1 < len(sa_all):
            lchrom = rchrom
            lloc = rloc + query_len(toks[3])
    

In [27]:
import pysam

with pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA) as samf:
    
    # loop on bed regions
    read_count = 0
    supp_count = 0
    with open(IN_BED) as f:
        for line in f:
            bed_line = line.strip().split()
            chrom, start, end = bed_line[:3]
            start = int(start)
            end = int(end)

            # loop on reads
            for read in samf.fetch(chrom, start, end):
                read_count += 1
                if read.is_supplementary:
                    supp_count += 1
                    process_sa(read)

                if read_count % 1000000 == 0:
                    print("read_count", read_count, "supp_count", supp_count, "read", read.reference_name + ":" + str(read.reference_start))
                    sys.stdout.flush()



read_count 1000000 supp_count 75914 read chr22:18160110
read_count 2000000 supp_count 92395 read chr22:25215724
read_count 3000000 supp_count 104880 read chr22:32152302
read_count 4000000 supp_count 117092 read chr22:39096855
read_count 5000000 supp_count 130211 read chr22:46172271


In [28]:
# collect potential gaps
gaps = []
for edge in graph.edges:
    c = graph.edges[edge[0], edge[1]]["c"]
    if c > 2:
        gaps.append([edge[0], edge[1], c])

# filter on threshold
gaps = [x for x in gaps if x[2] >= EDGE_THRESHOLD]
print("len(gaps)", len(gaps))

# sort of left edge
def sort_key(chrom):
    if chrom == "chrX":
        return 23
    elif chrom == "chrY":
        return 24
    elif chrom == "chrM":
        return 25
    else:
        return int(chrom[3:])
    
gaps = sorted(gaps, key=lambda x: (sort_key(x[0][0]), x[0][1], sort_key(x[1][0]), x[1][1]))
for gap in gaps[:5]:
    print(gap)

# write to output file
print("OUT_BED", OUT_BED)
with open(OUT_BED, "w") as f:
    for gap in gaps:
        f.write("%s\t%d\t%s\t%d\t%d\n" % (gap[0][0], gap[0][1], gap[1][0], gap[1][1], gap[2]))

len(gaps) 3854
[('chr1', 23884670), ('chr1', 90846950), 4]
[('chr1', 167056830), ('chr7', 157299690), 3]
[('chr1', 224015030), ('chr10', 42320530), 3]
[('chr2', 4471670), ('chr17', 12652030), 3]
[('chr2', 89836810), ('chr2', 89836470), 3]
OUT_BED /tmp/sa_walker.18605.bed


In [37]:
import itertools as it

# filter gap to be within size threshold
MIN_GAP_LEN = 100
MAX_GAP_LEN = 10000

# gap filtering
def gap_ok(gap):
    lside = gap[0]
    rside = gap[1]
    if lside[0] != rside[0]:
        return False
    delta = rside[1] - lside[1]
    if delta < MIN_GAP_LEN or delta > MAX_GAP_LEN:
        return False
    return True
fgaps = [gap for gap in gaps if gap_ok(gap)]
print("len(fgaps)", len(fgaps))
fname = OUT_BED.replace(".bed", "_fgaps.bed")
print("fnamne", fname)
with open(fname, "w") as f:
    for gap in fgaps:
        f.write("%s\t%d\t%d\t%d\n" % (gap[0][0], gap[0][1], gap[1][1], gap[2]))
        
# grouping
def group_gap(g):
    gl = list(g[1])
    print("gl", gl)
    end = max([elem[1][1] for elem in gl])
    first = gl[0]
    return [first[0], (first[1][0], end), first[2]]
    
ggaps = [group_gap(g) for g in it.groupby(fgaps, lambda x: x[0][1])]



len(fgaps) 99
fnamne /tmp/sa_walker.18605_fgaps.bed
gl [90386230, <itertools._grouper object at 0x103c0eb30>]


TypeError: 'int' object is not subscriptable

In [35]:
ggaps

[(90386230, <itertools._grouper at 0x1082c3040>),
 (93470610, <itertools._grouper at 0x1082c0d90>),
 (34588450, <itertools._grouper at 0x108252890>),
 (10710930, <itertools._grouper at 0x108251e40>),
 (10717470, <itertools._grouper at 0x1049fc460>),
 (10717890, <itertools._grouper at 0x1049fc610>),
 (10718810, <itertools._grouper at 0x108296cb0>),
 (10721130, <itertools._grouper at 0x1082973d0>),
 (10721150, <itertools._grouper at 0x108297070>),
 (10721250, <itertools._grouper at 0x104136c80>),
 (10721630, <itertools._grouper at 0x104136a10>),
 (10723050, <itertools._grouper at 0x104137040>),
 (10726490, <itertools._grouper at 0x104136f80>),
 (10728170, <itertools._grouper at 0x104137100>),
 (10728190, <itertools._grouper at 0x104137400>),
 (10779210, <itertools._grouper at 0x104136740>),
 (10781350, <itertools._grouper at 0x1041374c0>),
 (10781370, <itertools._grouper at 0x104136920>),
 (11058530, <itertools._grouper at 0x104137340>),
 (11211910, <itertools._grouper at 0x104136e00>),
