In [1]:
# sa walker - extract supplementary alignment regions from SA tags

In [105]:
import os
import sys

# constants
IN_CRAM = os.path.expanduser("~/tmp/data/jump_align/chr22.cram")
IN_BED = "data/chr22_all.bed"
REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")

OUT_PREFIX = "/tmp/sa_walker." + str(os.getpid())

In [106]:
from itertools import groupby

def query_len(cigar_string):
    read_consuming_ops = ("M", "I", "S", "=", "X")
    result = 0
    cig_iter = groupby(cigar_string, lambda chr: chr.isdigit())
    for _, length_digits in cig_iter:
        length = int(''.join(length_digits))
        op = next(next(cig_iter)[1])
        if op in read_consuming_ops:
            result += length
    return result

In [107]:
import networkx as nx
graph = nx.DiGraph()

def process_gap(lchrom, lloc, rchrom, rloc):
    #print("process_gap", [lchrom, lloc, rchrom, rloc])
    lnode = [lchrom, lloc - lloc % 100]
    rnode = [rchrom, rloc - rloc % 100]
    if not graph.has_node(lnode):
        graph.add_node(lnode)
    if not graph.has_node(rnode):
        graph.add_node(rnode)
    if graph.has_edge(lnode, rnode):
        data = graph.get_edge_data(lnode, rnode)
        data["c"] = data["c"] + 1
    else:
        graph.add_edge(lnode, rnode, c=1)


In [108]:

def process_sa(read):

    # current alignment represents the left side of the gap
    lchrom = read.reference_name
    lloc = read.reference_end

    sa_all = read.get_tag("SA").split(";")
    if sa_all[-1] == '':
        sa_all = sa_all[:-1]
    #print("sa_all", sa_all)
    for sa_index in range(len(sa_all)):
        toks = sa_all[sa_index].split(",")

        # establish right side of gap
        #print("sa_index", sa_index, "toks", toks)
        rchrom = toks[0]
        rloc = int(toks[1])

        # register this gap
        process_gap(lchrom, lloc, rchrom, rloc)

        # establish next left side, if there is one
        if sa_index + 1 < len(sa_all):
            lchrom = rchrom
            lloc = rloc + query_len(toks[3])
    

In [112]:
import pysam

with pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA) as samf:
    
    # loop on bed regions
    read_count = 0
    supp_count = 0
    with open(IN_BED) as f:
        for line in f:
            bed_line = line.strip().split()
            chrom, start, end = bed_line[:3]
            start = int(start)
            end = int(end)

            # loop on reads
            for read in samf.fetch(chrom, start, end):
                read_count += 1
                if read.is_supplementary:
                    supp_count += 1
                    process_sa(read)

                if read_count % 1000000 == 0:
                    print("read_count", read_count, "supp_count", supp_count, "read", read.reference_name + ":" + str(read.reference_start))



read_count 1000000 supp_count 75914 read chr22:18160110
read_count 2000000 supp_count 92395 read chr22:25215724
read_count 3000000 supp_count 104880 read chr22:32152302
read_count 4000000 supp_count 117092 read chr22:39096855
read_count 5000000 supp_count 130211 read chr22:46172271


In [113]:
# collect potential gaps

gaps = []
for edge in graph.edges:
    c = graph.edges[edge[0], edge[1]]["c"]
    if c > 2:
        print("edge", edge, "c", c)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



c 12
edge ('chr2216345300', 'chr1790100') c 4
edge ('chr2216345300', 'chr1038528900') c 4
edge ('chr2216345300', 'chr22_KI270737v1_random40400') c 8
edge ('chr2216345300', 'chr2110686500') c 24
edge ('chr2216345300', 'chr14_GL000225v1_random199000') c 8
edge ('chr2216345300', 'chr2210721000') c 4
edge ('chr2216345300', 'chr1042306000') c 4
edge ('chr2216345300', 'chr2031168000') c 4
edge ('chr2216345300', 'chr449098600') c 4
edge ('chr2216345300', 'chr2110673400') c 4
edge ('chr2216345300', 'chr2216359700') c 4
edge ('chr2216345300', 'chr22_KI270737v1_random31800') c 20
edge ('chr2216345300', 'chr2110668100') c 4
edge ('chr2216345300', 'chrY56739800') c 4
edge ('chr2216345300', 'chr22_KI270737v1_random31900') c 4
edge ('chr2216345300', 'chrUn_KI270756v128700') c 4
edge ('chr2216345300', 'chr1038518200') c 4
edge ('chr2216345300', 'chr14_GL000225v1_random192700') c 4
edge ('chr2216345300', 'chr2031241600') c 4
edge ('chr2216345300', 'chr2110658600') c 8
edge ('chr2216345300', 'chr211065