In [1]:
# pair_haps - mate pairs of haplotypes that overlap in their softclipped ends

In [2]:
import os
import sys

# args: the three input paths come from the command line when this file is run
# as a script; otherwise hard-wired development defaults are used
print(sys.argv)
if any(tag in sys.argv[0] for tag in ("pair_haps", "stdin")):
    # commandline invocation - exactly three positional arguments are expected
    if len(sys.argv) != 4:
        print("usage: " + sys.argv[0] + " <input-cram> <in-bed> <ref-fasta>\n")
        sys.exit(-1)
    IN_CRAM, IN_BED, REF_FASTA = sys.argv[1:4]
else:
    # any other launcher (e.g. a notebook kernel): fall back to sample files
    # under the user's home directory
    IN_CRAM, IN_BED, REF_FASTA = (
        os.path.expanduser(path)
        for path in (
            "~/tmp/data/pair_haps/0030945-Z0114_merged_assembly.bam",
            "~/tmp/data/pair_haps/chr4_ex1.bed",
            "~/tmp/ref/Homo_sapiens_assembly38.fasta",
        )
    )



['/Users/drorkessler/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/Users/drorkessler/Library/Jupyter/runtime/kernel-07b28a21-3ea8-47ad-a6ab-d6136c112a3b.json']


In [67]:
# collect breakpoints from read
bp_quant = 500
def quant(n):
    """Snap position ``n`` onto a coarse grid with spacing ``bp_quant``.

    ``n / bp_quant`` is rounded to the nearest integer with Python's built-in
    ``round`` (ties go to the even integer), then half a grid step is added.
    For the default spacing of 500 every result therefore has the form
    ``k * 500 + 250``.

    Returns an ``int``.
    """
    nearest_multiple = round(n / bp_quant)
    return round(bp_quant * (nearest_multiple + 0.5))
    
def collect_breakpoints(read):
    """Collect the quantized soft-clip breakpoints of a single read.

    read: an alignment record exposing ``cigartuples``, ``reference_start``
          and ``reference_end`` (a pysam AlignedSegment in this notebook).

    Returns a tuple ``(lbp, rbp)`` of lists holding ``(position, read)``
    entries:
      * ``lbp`` gets one entry, keyed by ``quant(read.reference_end)``, when
        the LAST CIGAR operation is a soft clip;
      * ``rbp`` gets one entry, keyed by ``quant(read.reference_start)``, when
        the FIRST CIGAR operation is a soft clip.
    Both lists are empty for reads without a soft clip at either end, and for
    reads that carry no CIGAR at all.
    """
    lbp = []
    rbp = []

    cigar = read.cigartuples
    if not cigar:
        # pysam reports cigartuples as None when no alignment is present;
        # indexing it would raise TypeError, and there is no reference
        # position to quantize either, so such reads contribute nothing
        return (lbp, rbp)

    if cigar[-1][0] == pysam.CSOFT_CLIP:
        lbp.append((quant(read.reference_end), read))
    if cigar[0][0] == pysam.CSOFT_CLIP:
        rbp.append((quant(read.reference_start), read))

    return (lbp, rbp)
        
        

In [74]:
# loop on entries from bed file, process one at a time
import pysam
from itertools import groupby

# Scan every BED region of the alignment file and accumulate, over all reads,
# the left (lbp) and right (rbp) breakpoint entries produced by
# collect_breakpoints.
read_count = 0
lbp = []
rbp = []
with pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA) as samf:

    # loop on bed regions
    with open(IN_BED) as f:
        for line_no, line in enumerate(f, start=1):
            bed_line = line.strip().split()

            # blank lines, '#' comments and 'track'/'browser' header lines do
            # not describe a region - skip them instead of failing on unpack
            if not bed_line or bed_line[0].startswith("#") or bed_line[0] in ("track", "browser"):
                continue

            # a data line must carry at least chrom, start and end; report the
            # offending line rather than an opaque unpacking error
            if len(bed_line) < 3:
                raise ValueError(
                    "%s:%d: expected at least 3 BED fields (chrom, start, end), got: %r"
                    % (IN_BED, line_no, line.rstrip("\n"))
                )

            chrom, start, end = bed_line[:3]
            start = int(start)
            end = int(end)

            # loop on reads
            for read in samf.fetch(chrom, start, end):
                read_count += 1
                # progress report once per million reads
                if read_count % 1000000 == 0:
                    print("read_count", read_count, read.reference_name + ":" + str(read.reference_start))
                    sys.stdout.flush()

                bp = collect_breakpoints(read)
                lbp += bp[0]
                rbp += bp[1]

# sort, groupby: order both breakpoint lists by quantized position, then
# collapse each run of equal positions into one (position, [reads]) entry
lbp.sort(key=lambda entry: entry[0])
rbp.sort(key=lambda entry: entry[0])
lbp = [
    (position, [read for _, read in members])
    for position, members in groupby(lbp, key=lambda entry: entry[0])
]
rbp = [
    (position, [read for _, read in members])
    for position, members in groupby(rbp, key=lambda entry: entry[0])
]


In [136]:
from difflib import SequenceMatcher

# next we look for left/right breakpoint pairs which are 1K-10K apart
min_d, max_d = 1_000, 10_000

# thresholds applied to the longest common block of two candidate sequences
max_match_jump = 10     # largest allowed start offset of the block, in either sequence
min_match_size = 10     # shortest block length that is accepted
portion_min = 0.5       # the block must cover at least this fraction of one of the sequences

# debug tracing: when on_debug is set, a pair is printed if the left read name
# is listed in h1_debug and the right read name is listed in h2_debug
# (alternative left-side selection: ["HC_chr4:64571_1001", "HC_chr4:64571_1015"])
on_debug = True
h1_debug = ["HC_chr4:64571_1015"]
h2_debug = ["HC_chr4:67904_1008", "HC_chr4:67904_1006"]

def haps_are_paired(h1, h2):
    """Decide whether a left and a right haplotype record share an overlap.

    h1, h2: tuples laid out as ``(read, qname, cigartuples, before, after)``,
            as built by match_haps. Only ``h1[4]`` and ``h2[3]`` are compared;
            ``h[0].qname`` and ``h[1:]`` are used for debug output only.

    The longest common contiguous block of ``h1[4]`` and ``h2[3]`` is located.
    The pair is accepted when that block
      * is at least ``min_match_size`` long,
      * starts no further than ``max_match_jump`` into either sequence, and
      * covers at least ``portion_min`` of at least one of the two sequences.

    Returns ``(result, jumps)``: ``result`` is a bool; ``jumps`` is
    ``(a, b, size, p1, p2)`` (block start in each sequence, block length and
    the covered fraction of each sequence) on success, otherwise ``None``.
    """
    after1 = h1[4]
    before2 = h2[3]
    result = False
    jumps = None

    # autojunk must be disabled: with the default heuristic, once the second
    # sequence has 200+ items every element occurring in more than ~1% of it
    # is dropped from the index. With a 4-letter nucleotide alphabet that is
    # every base, so genuine overlaps against long sequences would be missed.
    matcher = SequenceMatcher(None, after1, before2, autojunk=False)
    a, b, size = matcher.find_longest_match()
    if size >= min_match_size and a <= max_match_jump and b <= max_match_jump:
        p1 = size / len(after1)
        p2 = size / len(before2)
        if max(p1, p2) >= portion_min:
            result = True
            jumps = (a, b, size, p1, p2)

    # debug: dump both records for the read-name combinations being traced
    if on_debug and ((h1[0].qname in h1_debug and h2[0].qname in h2_debug)):
        print("-----")
        print("h1")
        for h in h1[1:]:
            print(h)
        print("h2")
        for h in h2[1:]:
            print(h)
        print("result", result, "jumps", jumps)

    return (result, jumps)

def match_haps(lhaps, rhaps):
    """Pair up every left haplotype with every right haplotype that matches it.

    lhaps: reads whose clip length is taken from their LAST CIGAR operation.
    rhaps: reads whose clip length is taken from their FIRST CIGAR operation.

    Each read is turned into a record
    ``(read, qname, cigartuples, seq[-clip:], seq[clip:])`` and all left/right
    combinations are passed to haps_are_paired.

    Returns a list of ``(left_read, right_read, left_qname, right_qname, jump)``
    tuples, one per accepted combination, ordered by left then right read.
    """
    def as_record(hap, clip):
        # 4th field: the last `clip` bases; 5th field: everything from index `clip` on
        # NOTE(review): for right haps the soft clip is at the START of the read,
        # yet the 4th field still takes the LAST `clip` bases - confirm this is intended
        return (hap, hap.qname, hap.cigartuples, hap.seq[-clip:], hap.seq[clip:])

    # extract sequences
    h1s = [as_record(hap, hap.cigartuples[-1][1]) for hap in lhaps]
    h2s = [as_record(hap, hap.cigartuples[0][1]) for hap in rhaps]

    # look for matches
    matched = []
    for left in h1s:
        for right in h2s:
            paired, jump = haps_are_paired(left, right)
            if not paired:
                continue
            left_read, right_read = left[0], right[0]
            matched.append((left_read, right_read, left_read.qname, right_read.qname, jump))

    return matched

# this is naive: every left breakpoint position is tried against every right
# breakpoint position
pairs = []
for l, lhaps in lbp:
    for r, rhaps in rbp:
        # only positions separated by min_d..max_d (inclusive) are candidates
        if not (min_d <= r - l <= max_d):
            continue
        pairs.extend(match_haps(lhaps, rhaps))

print("len(pairs)", len(pairs))
for pair in pairs:
    print(pair)


-----
h1
HC_chr4:64571_1015
[(0, 112), (2, 1), (0, 38), (4, 258)]
TGTTTTTTATTATAAAGCTGGGACTACAGGTGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTAGTAGAGACGGGGTTTCACTGAGTTAGCCAGGATGGTCTCGATCTCCTGACCTCGTGATCCGCCCATTTCGGCCTCCCGAAGTGCTGGGATTACAGGTGTGAGCCACTGCACCACGTCAGGATTGCGTCTTATGTTAATGCGAAATATGCATAGCTGTCATCCGAAGTATACCACCATTCCCTTCAGCACTC
TCTCCTGACCTCGTGATCCGCCCATTTCGGCCTCCCGAAGTGCTGGGATTACAGGTGTGAGCCACTGCACCACGTCAGGATTGCGTCTTATGTTAATGCGAAATATGCATAGCTGTCATCCGAAGTATACCACCATTCCCTTCAGCACTC
h2
HC_chr4:67904_1006
[(4, 74), (0, 175)]
CCTCGTGATCCGCCCATTTCGGCCTCCCGAAGTGCTGGGATTACAGGTGTGAGCCACTGCACCACGTCAGGATT
AGCTGGGACTACAGGTGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTAGTAGAGACGGGGTTTCACTGAGTTAGCCAGGATGGTCTCGATCTCCTGACCTCGTGATCCGCCCATTTCGGCCTCCCGAAGTGCTGGGATTACAGGTGTGAGCCACTGCACCACGTCAGGATT
result True jumps (8, 0, 74, 0.49333333333333335, 1.0)
-----
h1
HC_chr4:64571_1015
[(0, 112), (2, 1), (0, 38), (4, 258)]
TGTTTTTTATTATAAAGCTGGGACTACAGGTGCCCGCCACCACGCCCGGCTAATTTTTTGTATTTTAGTAGAGACGGGGTTTCACTGAGTTAGCCAGGATGGTCTCGATC

In [148]:
# group by left and right haps: bucket the matched pairs by the left read name
# (field 2) and, separately, by the right read name (field 3)
lgrp = [
    (name, list(members))
    for name, members in groupby(sorted(pairs, key=lambda p: p[2]), key=lambda p: p[2])
]
rgrp = [
    (name, list(members))
    for name, members in groupby(sorted(pairs, key=lambda p: p[3]), key=lambda p: p[3])
]
lgrp

[('HC_chr4:41229_1002',
  [(<pysam.libcalignedsegment.AlignedSegment at 0x107d80160>,
    <pysam.libcalignedsegment.AlignedSegment at 0x107d82020>,
    'HC_chr4:41229_1002',
    'HC_chr4:42548_1011',
    (2, 6, 10, 0.06622516556291391, 0.625))]),
 ('HC_chr4:41229_1003',
  [(<pysam.libcalignedsegment.AlignedSegment at 0x107d80280>,
    <pysam.libcalignedsegment.AlignedSegment at 0x107d82020>,
    'HC_chr4:41229_1003',
    'HC_chr4:42548_1011',
    (3, 6, 10, 0.06578947368421052, 0.625))]),
 ('HC_chr4:41229_1004',
  [(<pysam.libcalignedsegment.AlignedSegment at 0x107d80580>,
    <pysam.libcalignedsegment.AlignedSegment at 0x107d82020>,
    'HC_chr4:41229_1004',
    'HC_chr4:42548_1011',
    (2, 6, 10, 0.06578947368421052, 0.625))]),
 ('HC_chr4:64571_1001',
  [(<pysam.libcalignedsegment.AlignedSegment at 0x107d8ab00>,
    <pysam.libcalignedsegment.AlignedSegment at 0x107d88b80>,
    'HC_chr4:64571_1001',
    'HC_chr4:67904_1006',
    (9, 0, 74, 0.4900662251655629, 1.0)),
   (<pysam.libcal