In [45]:
# cnv_realign - using detected cnv areas, realign cram file using jump aligner around cnv edges

In [46]:
# argument processing + debug environment files
import sys
import os

# Scoring parameters. They are passed on the jump_align command line below and
# are also reused by the output loop when it rescales jump scores.
MATCH_SCORE = 2
MISMATCH_SCORE = -8
OPEN_SCORE = -18
EXTEND_SCORE = -1
JUMP_SCORE = 0
MODE = "DEL"


def derive_output_paths(out_sam):
    """Return the ``(bam_path, bed_path)`` companions of ``out_sam``.

    Only a trailing ``.sam`` is swapped. A plain ``str.replace`` would also
    rewrite any earlier ``.sam`` in the path (for example a ``.sample``
    directory component) and point the outputs at the wrong location.
    """
    suffix = ".sam"
    stem = out_sam[:-len(suffix)] if out_sam.endswith(suffix) else out_sam
    return stem + ".bam", stem + ".bed"


print(sys.argv)

# Scratch file that carries the reads/references handed to jump_align.
tmp = "/tmp/jump_align_input." + str(os.getpid())
# The literal "-1" is a fixed positional argument; its meaning is defined by
# the jump_align executable and is not visible from this notebook.
JUMP_ALIGN_CMD = ["jump_align/jump_align", str(MATCH_SCORE), str(MISMATCH_SCORE), str(OPEN_SCORE), str(EXTEND_SCORE), "-1", str(JUMP_SCORE), tmp]

if "cnv_realign" in sys.argv[0] or "stdin" in sys.argv[0]:
    # commandline invocation
    if len(sys.argv) < 5:
        print("usage: " + sys.argv[0] + " <input-cram> <range-bed> <ref-fasta> <output-prefix> [<mode>]\n")
        sys.exit(-1)
    IN_CRAM = sys.argv[1]
    CNV_BED = sys.argv[2]
    REF_FASTA = sys.argv[3]
    OUT_SAM = sys.argv[4] + ".sam"
    # ">=" so that the optional mode is still honoured when extra arguments follow
    if len(sys.argv) >= 6:
        MODE = sys.argv[5]
else:
    # Interactive/debug defaults. Alternates that were previously assigned and
    # immediately overwritten are kept here as comments:
    #   IN_CRAM = "data/chr22_1M.cram"
    #   IN_CRAM = os.path.expanduser("~/tmp/data/jump_align/chr22.cram")
    #   CNV_BED = "data/250204/chr1_5.bed"
    IN_CRAM = os.path.expanduser("~/tmp/data/jump_align/chr1.cram")
    CNV_BED = "data/chr1_dup.bed"
    REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")
    OUT_SAM = "/tmp/cnv_realign." + str(os.getpid()) + ".sam"
    MODE = "DUP"


# Read/reference window sizes and filtering thresholds used further down.
FETCH_READ_PADDING = 500
FETCH_REF_PADDING = 0
MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT = 30
MIN_GAP_LEN = 30
MAX_READS_PER_CNV = 2000

JUMP_SCORE_THRESHOLD = 0.85
REF_SCORE_THRESHOLD = 1.0

OUT_BAM, OUT_BED = derive_output_paths(OUT_SAM)

# Debug switches consulted by the processing cells.
debug = True
debug_break_after_one_region = False
debug_break_after_debug_seen = False
debug_output_debug_only = False

CNV_BED

['/Users/drorkessler/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/Users/drorkessler/Library/Jupyter/runtime/kernel-1021f323-a398-4daa-8a27-ba38ca190e3c.json']


'data/chr1_dup.bed'

In [47]:
# open files
import pysam
# Alignment input used by process_cnv (via reads_file.fetch); REF_FASTA is
# supplied as the reference for it.
# NOTE(review): the mode string is "rb" although IN_CRAM is named as a CRAM -
# presumably the format is auto-detected on read; confirm against pysam docs.
reads_file = pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA)
# Reference sequence access used by process_cnv (via fasta_file.fetch).
# Neither handle is closed anywhere in the visible notebook.
fasta_file = pysam.FastaFile(REF_FASTA)

In [48]:
import subprocess

def run_process(command, input_path):
    """Run ``command`` to completion and return ``(stdout, returncode)``.

    Parameters:
        command: argument list handed to ``subprocess.run`` (no shell involved).
        input_path: accepted for interface compatibility only; it is not used
            here, callers already embed the path in ``command``.

    Returns:
        ``(stdout_text, returncode)`` once the process has finished, or
        ``(None, code)`` when the process could not be started at all (for
        example because the executable is missing). ``code`` is then ``-1``
        unless the raised exception carries its own ``returncode``.
    """
    print("command", command)

    try:
        # The child gets an immediately-closed stdin (EOF), exactly what the
        # previous PIPE + communicate() without input produced.
        completed = subprocess.run(command,
                                   stdin=subprocess.DEVNULL,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   text=True)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError is what a failed launch actually raises; the previously
        # caught CalledProcessError is only raised when check=True is used.
        print(f"Error executing command: {e}")
        return None, getattr(e, "returncode", -1)

    if completed.returncode != 0:
        # surface the child's diagnostics instead of silently dropping them
        print("command failed with status", completed.returncode)
        print(completed.stderr)

    return completed.stdout, completed.returncode

In [49]:
import tempfile
import random

random.seed(0)  # fixed seed so that read subsampling is repeatable between runs

# process a single cnv
def process_cnv(chrom, start, end, mode):
    """Realign the reads around both edges of one CNV with jump_align.

    Parameters:
        chrom, start, end: the CNV interval; reads are fetched within
            FETCH_READ_PADDING of ``start`` and of ``end``.
        mode: when "DUP", both reference windows are widened to the same
            combined extent; any other value keeps one window per edge.

    Returns:
        ``(rheader, realignments)`` where ``rheader`` is the tab-split first
        line of the jump_align output and each realignment is
        ``[read, ref1_start, ref2_start, fields, in_ref1, in_ref2]``.

    Raises:
        RuntimeError: if run_process reports that jump_align produced no output
            object at all (stdout is None).
    """
    if debug: print("process_cnv: ", locals())

    # get all reads that cross the two cnv edges
    reads = dict()
    reads_in_ref = [set(), set()]
    refs = []
    refs_extents = []
    for ref_id, loc in enumerate([start, end]):
        fetch_start = max(0, loc - FETCH_READ_PADDING)
        fetch_end = loc + FETCH_READ_PADDING
        rmin, rmax = fetch_start, fetch_end
        for read in reads_file.fetch(chrom, fetch_start, fetch_end):
            reads[read.qname] = read
            rmin = min(rmin, read.reference_start)
            # reference_end may be None (no aligned span); such a read must not
            # take part in the max() or it raises TypeError
            if read.reference_end is not None:
                rmax = max(rmax, read.reference_end)
            reads_in_ref[ref_id].add(read.qname)
        refs_extents.append([rmin, rmax])

    # extend references before and after
    refs_extents[0][0] = max(0, refs_extents[0][0] - FETCH_REF_PADDING)
    refs_extents[1][1] = refs_extents[1][1] + FETCH_REF_PADDING
    if mode == "DUP":
        # make references be the same in repeat mode
        refs_extents[0][0] = min(refs_extents[0][0], refs_extents[1][0])
        refs_extents[1][0] = refs_extents[0][0]
        refs_extents[0][1] = max(refs_extents[0][1], refs_extents[1][1])
        refs_extents[1][1] = refs_extents[0][1]
    if debug: print("found", len(reads), "reads")
    if debug: print("refs_extents", refs_extents)

    # get references
    for rmin, rmax in refs_extents:
        refs.append([rmin, fasta_file.fetch(chrom, rmin, rmax)])

    # create input file for jump aligner
    ref_emitted = False
    reads_in_order = []
    subsample_ratio = 1.0
    if len(reads) > MAX_READS_PER_CNV:
        subsample_ratio = MAX_READS_PER_CNV / len(reads)
        print("subsample_ratio", subsample_ratio)
    with open(tmp, 'w') as f:
        for read in reads.values():
            if subsample_ratio < 1.0 and random.random() > subsample_ratio:
                continue
            # a record stored without bases cannot be written or realigned
            if read.seq is None:
                continue
            reads_in_order.append(read)
            if not ref_emitted:
                # only the first line carries the two references; later lines use "="
                line = read.qname + "\t" + read.seq + "\t" + refs[0][1] + "\t" + refs[1][1] + "\n"
                ref_emitted = True
            else:
                line = read.qname + "\t" + read.seq + "\t=\n"
            f.write(line)
    if debug: print("tmp", tmp)

    # run jump_align
    stdout, returncode = run_process(JUMP_ALIGN_CMD, tmp)
    if stdout is None:
        raise RuntimeError("jump_align could not be run (status %s)" % returncode)
    if returncode != 0:
        print("warning: jump_align exited with status", returncode)

    # first output line is the header; the following lines pair up, in order,
    # with the reads written above (zip drops any surplus trailing line)
    output_lines = stdout.split("\n")
    rheader = output_lines[0].split("\t")
    realignments = []
    for alignment, read in zip(output_lines[1:], reads_in_order):
        a = alignment.split("\t")
        in1 = read.qname in reads_in_ref[0]
        in2 = read.qname in reads_in_ref[1]
        realignments.append([read, refs[0][0], refs[1][0], a, in1, in2])
    return (rheader, realignments)


In [55]:
def sized_substr(s, l):
    """Return every window ``s[i:i+l]`` for ``i`` from 0 through ``len(s) - l``.

    The result is empty when ``l`` is larger than ``len(s)``.
    """
    windows = []
    last_start = len(s) - l
    start = 0
    while start <= last_start:
        windows.append(s[start:start + l])
        start += 1
    return windows

def sized_dict(s, l):
    """Count how often each length-``l`` window of ``s`` occurs.

    Returns a dict mapping window -> count, keyed in order of first appearance.
    """
    counts = {}
    for window in sized_substr(s, l):
        counts[window] = counts.get(window, 0) + 1
    return counts

def sized_list(s, l):
    """Return ``(window, count)`` pairs for the length-``l`` windows of ``s``.

    Pairs are ordered by descending count; windows with equal counts keep the
    order in which sized_dict produced them (the sort is stable).
    """
    # the former second `return sl` after this statement could never execute
    return sorted(sized_dict(s, l).items(), key=lambda pair: pair[1], reverse=True)

def top_sized(s):
    """Return the most frequent window of ``s`` for each size in ``range(2, len(s) - 2)``.

    Each element is the first ``(window, count)`` pair reported by sized_list
    for that size; the result is empty when the range is empty.
    """
    winners = []
    for size in range(2, len(s) - 2):
        ranked = sized_list(s, size)
        winners.append(ranked[0])
    return winners


In [61]:
import statistics

# open output sam file
print("OUT_SAM", OUT_SAM)

debug_qname = "036742_1-Z0027-0519663978"
debug_seen = False

# JT tag values written below:
#   0      original read, unchanged
#   1 / 2  read realigned to reference window 1 / 2 (qname suffix _REF1/_REF2)
#   3      jump alignment that passed every filter (qname suffix _JUMP)
#   other  jump alignment that failed filters; sum of
#          50 (a component shorter than MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT),
#          100 (score ratio below JUMP_SCORE_THRESHOLD),
#          200 (not better than the best single-reference score),
#          400 (reference gap shorter than MIN_GAP_LEN)

# loop on bed file, write output bed file
# (the SAM handle is part of the with-statement so it is closed even on error)
with pysam.AlignmentFile(OUT_SAM, "w", header=reads_file.header) as sam_file, \
        open(OUT_BED, "w") as out_bed, \
        open(CNV_BED) as f:
    for line in f:
        # blank lines and BED header/comment lines carry no interval
        if not line.strip() or line.startswith(("#", "track", "browser")):
            continue
        bed_line = line.strip().split()
        bed_chrom, bed_start, bed_end = bed_line[:3]
        bed_start = int(bed_start)
        bed_end = int(bed_end)
        bed_span = bed_end - bed_start
        rheader, realignments = process_cnv(bed_chrom, bed_start, bed_end, MODE)
        jump_read_written = 0
        jump_read_lowscore = 0
        jump_read_lessthanref = 0
        jdelsize = []
        jjumpland = []
        for realignment in realignments:
            in_ref = [False, False]
            read, ref1_start, ref2_start, ainfo, in_ref[0], in_ref[1] = realignment
            if read.qname == debug_qname:
                print("ref_start1/2", ref1_start, ref2_start)
                print("in_ref", in_ref)
                for e in zip(rheader, ainfo):
                    print("%s: %s" % e)
                debug_seen = True

            # write original read
            tags = read.tags
            read.tags = tags + [('JT', 0)]

            if not debug_output_debug_only or read.qname == debug_qname:
                sam_file.write(read)

            # decode alignment info
            qname1, score, jumpInsertSize, jumpRange, \
                jbegin1, jcigar1, jreadlen1, jreflen1, \
                jbegin2, jcigar2, jreadlen2, jreflen2, \
                score1, begin1, cigar1, readlen1, reflen1, \
                score2, begin2, cigar2, readlen2, reflen2 = ainfo

            # build read aligned to references
            qname = read.qname
            reference_start = read.reference_start
            if qname == debug_qname: print("reference_start", reference_start)
            ref_score = -1000000
            if qname != qname1:
                # was `printf`, which is not defined in Python
                print("qname mismatch!", qname, qname1)
            for i in [1, 2]:
                if not in_ref[i-1]:
                    continue
                read.qname = qname + "_REF" + str(i)
                read.tags = tags + [('JT', i)]
                if i == 1:
                    read.cigarstring = cigar1
                    read.reference_start = ref1_start + int(begin1)
                    read.tags += [('JS', int(score1))]
                    ref_score = max(ref_score, int(score1))
                else:
                    read.cigarstring = cigar2
                    read.reference_start = ref2_start + int(begin2)
                    read.tags += [('JS', int(score2))]
                    ref_score = max(ref_score, int(score2))
                if not debug_output_debug_only or qname == debug_qname:
                    sam_file.write(read)

            # build read aligned to both references with a jump in the middle
            if len(jcigar1) and len(jcigar2):

                jt = 0
                if int(jreadlen1) < MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT \
                    or int(jreadlen2) < MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT \
                    or int(jreflen1) < MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT \
                    or int(jreflen2) < MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT:
                    jt += 50

                # adjust jump score to include jumpInsertSize
                jump_score = int(score)
                if int(jumpInsertSize):
                    jump_score += (OPEN_SCORE + int(jumpInsertSize) * EXTEND_SCORE)

                # calculate ratio of jump score to ideal score
                ideal_score = len(read.seq) * MATCH_SCORE
                ideal_ratio = int(jump_score) / ideal_score
                #print(qname, "ideal_score", ideal_score, "ideal_ratio", ideal_ratio, "jump_score", jump_score)
                if ideal_ratio < JUMP_SCORE_THRESHOLD:
                    jt += 100
                    jump_read_lowscore += 1
                if jump_score <= (ref_score * REF_SCORE_THRESHOLD):
                    jt += 200
                    jump_read_lessthanref += 1
                end_on_ref1 = ref1_start + int(jbegin1) + int(jreflen1)
                start_on_ref2 = ref2_start + int(jbegin2)
                # relative landing position inside the interval; a zero-length
                # interval has no meaningful fraction, so report 0.0 for it
                land = (start_on_ref2 - bed_start) / bed_span if bed_span else 0.0
                # reference distance between the two aligned components
                delta = start_on_ref2 - end_on_ref1
                read.qname = qname + "_JUMP"

                new_cigar = jcigar1
                if MODE == "DEL":
                    if int(jumpInsertSize):
                        new_cigar += jumpInsertSize + "I"
                    if delta > 0:
                        new_cigar += str(delta) + "D"
                elif MODE == "DUP":
                    # read bases covered by neither jump component
                    insert_len = len(read.seq) - int(jreadlen1) - int(jreadlen2)
                    if insert_len > 0:
                        new_cigar += str(insert_len) + "I"
                        s1 = read.seq[:int(jreadlen1)]
                        s2 = read.seq[int(jreadlen1):-int(jreadlen2)]
                        s3 = read.seq[-int(jreadlen2):]
                        if qname == debug_qname:
                            print("s1", s1)
                            print("s2", s2)
                            print("s3", s3)
                            # NOTE(review): this used `iseq`, which is not defined
                            # anywhere in the visible notebook; s2 (the inserted
                            # segment) is presumably what was meant - confirm.
                            ts = top_sized(s2)
                            print("top_sized", ts[:20])
                        tags += [('J1', s1), ('J2', s2), ('J3', s3)]
                    if delta > 0:
                        new_cigar += str(delta) + "D"

                if delta < MIN_GAP_LEN:
                    jt += 400
                new_cigar += jcigar2
                read.cigarstring = new_cigar
                read.reference_start = ref1_start + int(jbegin1)
                if jt == 0:
                    jt = 3
                read.tags = tags + [('JT', jt)]
                read.tags += [('JS', int(jump_score))]
                jdelsize.append(delta)
                jjumpland.append(land)
                if not debug_output_debug_only or qname == debug_qname:
                    sam_file.write(read)
                    if jt == 3:
                        jump_read_written += 1
                if qname == debug_qname: print("read.reference_start", read.reference_start)
                if qname == debug_qname:
                    print("jump new_cigar", new_cigar)

        if debug: print("jump_read_written", jump_read_written, "jump_read_lowscore", jump_read_lowscore, "jump_read_lessthanref", jump_read_lessthanref)
        if len(jdelsize):
            stats = [str(min(jdelsize)), str(max(jdelsize)), str(statistics.mean(jdelsize)), "%.3f" % min(jjumpland), "%.3f" % max(jjumpland), "%.3f" % statistics.mean(jjumpland)]
        else:
            stats = ["0" for x in range(6)]
        # strip only the line terminator: slicing off the last character would
        # eat real data when the final BED line has no trailing newline
        out_bed.write(line.rstrip("\r\n") + ("\t%d\t%d\t%d\t%s\n" % (jump_read_written, jump_read_lowscore, jump_read_lessthanref, "\t".join(stats))))

        if debug_break_after_one_region:
            break
        if debug_break_after_debug_seen and debug_seen:
            break

# convert to sorted bam
# Argument lists (no shell) keep paths containing spaces or shell
# metacharacters intact; stdout is redirected to OUT_BAM as ">" did before.
cmd = ["samtools", "sort", OUT_SAM]
print("cmd", " ".join(cmd), ">" + OUT_BAM)
with open(OUT_BAM, "wb") as bam_out:
    sort_status = subprocess.run(cmd, stdout=bam_out).returncode
if sort_status == 0:
    cmd = ["samtools", "index", OUT_BAM]
    print("cmd", " ".join(cmd))
    index_status = subprocess.run(cmd).returncode
else:
    # indexing a BAM that failed to sort cannot succeed
    index_status = sort_status
    print("samtools sort failed with status", sort_status)
index_status

OUT_SAM /tmp/cnv_realign.73611.sam
process_cnv:  {'chrom': 'chr1', 'start': 4333017, 'end': 4334096, 'mode': 'DUP'}
found 216 reads
refs_extents [[4332192, 4334907], [4332192, 4334907]]
tmp /tmp/jump_align_input.73611
command ['jump_align/jump_align', '2', '-8', '-18', '-1', '-1', '0', '/tmp/jump_align_input.73611']
ref_start1/2 4332192 4332192
in_ref [False, True]
readName: 036742_1-Z0027-0519663978
score: 431
jumpInsertSize: 65
jumpRange: 0
jbegin1: 146
japath1: 4=1X1=1X5=
jreadlen1: 12
jreflen1: 12
jbegin2: 1383
japath2: 3=1X146=1X105=
jreadlen2: 256
jreflen2: 256
score1: 403
begin1: 1383
apath1: 3=78I1D146=1X105=
readlen1: 333
reflen1: 256
score2: 403
begin2: 1383
apath2: 3=78I1D146=1X105=
readlen2: 333
reflen2: 256
reference_start 4333579
s1 CCACCATCTGCA
s2 AGCAAGGAAGGGAGCCCTCATGAGGAATGAGTCAGCTGGAAAGCTGGAACCTTGATCTTGGACTT
s3 CCAACCTCCAGAACTGTTAGAACGTAAATGTCTGTTGTTGGAGGGGCCCAGGAAGGAGGAAAGGAGGGAGAATGTGGTCTCTGGAAGCTGAGAGAAGAGGGGATTTTAGGAAAGTGTTGGAGAAAAACGTGTTGAATGGTGCTGAGAGGTAAAAAGTC

0