In [1]:
# cnv_realign - using detected cnv areas, realign cram file using jump aligner around cnv edges

In [7]:
# argument processing + debug environment files
import sys
import os
print(sys.argv)

# Input file handed to jump_align; the process id keeps concurrent runs apart.
tmp = "/tmp/jump_align_input." + str(os.getpid())
JUMP_ALIGN_CMD = ("jump_align/jump_align 2 -8 -18 -1 -1 0 " + tmp).split(" ")

if "cnv_realign" in sys.argv[0] or "stdin" in sys.argv[0]:
    # commandline invocation
    if len(sys.argv) < 5:
        # fail with a usage message instead of a bare IndexError
        raise SystemExit("usage: " + sys.argv[0] + " <in.cram> <cnv.bed> <ref.fasta> <out_prefix>")
    IN_CRAM = sys.argv[1]
    CNV_BED = sys.argv[2]
    REF_FASTA = sys.argv[3]
    OUT_SAM = sys.argv[4] + ".sam"
else:
    # interactive (notebook) invocation: fall back to the debug data set
    IN_CRAM = "data/chr22_1M.cram"
    CNV_BED = "data/chr22.bed"
    REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")
    OUT_SAM = "/tmp/chr22_cnv_realign." + str(os.getpid()) + ".sam"


# debug overrides
# IN_CRAM = os.path.expanduser("~/tmp/data/jump_align/chr22.cram")
# CNV_BED = "data/chr22_ex3.bed"


# padding (in bases) around each cnv edge when fetching reads
FETCH_READ_PADDING = 500
# extra reference padding before the first edge window and after the second
FETCH_REF_PADDING = 1500
# minimal read length each side of a jump must cover for a jump read to be written
MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT = 30

# Derive sibling output paths by swapping only the trailing ".sam" extension.
# (str.replace would also rewrite a ".sam" occurring earlier in the path,
# e.g. inside a directory name such as "run.samples/".)
OUT_BAM = os.path.splitext(OUT_SAM)[0] + ".bam"
OUT_BED = os.path.splitext(OUT_SAM)[0] + ".bed"


# debug switches
debug = True
debug_break_after_one_region = False
debug_break_after_debug_seen = False
debug_output_debug_only = False


CNV_BED

['/Users/drorkessler/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/Users/drorkessler/Library/Jupyter/runtime/kernel-fe640cc7-2b68-4969-a440-20cc6afefe96.json']


'data/chr22.bed'

In [8]:
# open files
# Both handles are module-level and are used by the cells below; they are not
# explicitly closed anywhere in the code shown here.
import pysam
# Alignment input (path in IN_CRAM); REF_FASTA is passed as the reference file.
# NOTE(review): the mode is "rb" although the variable name suggests a CRAM
# file - confirm this is intended.
reads_file = pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA)
# Reference genome, used to fetch the sequence around each cnv edge.
fasta_file = pysam.FastaFile(REF_FASTA)

In [12]:
import subprocess

def run_process(command, input_path):
    """Run an external command and capture its standard output.

    Args:
        command: argv-style list (program followed by its arguments); it is
            executed directly, without a shell.
        input_path: path of the input file for the command. It is not used
            here (the caller in this file already embeds the path in
            ``command``); the parameter is kept so existing calls keep working.

    Returns:
        A ``(stdout, returncode)`` tuple. ``stdout`` is the captured text
        output. When the command cannot be started at all (missing binary,
        no permission, ...), ``stdout`` is ``None`` and ``returncode`` is the
        OS error number (or 1 if none is available).
    """
    print("command", command)

    try:
        process = subprocess.Popen(command,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   text=True)
        # communicate() closes stdin and drains both pipes, avoiding deadlocks
        stdout, stderr = process.communicate()
    except OSError as e:
        # Popen reports launch failures as OSError (e.g. FileNotFoundError);
        # it never raises CalledProcessError.
        print(f"Error executing command: {e}")
        return None, e.errno or 1

    if process.returncode != 0 and stderr:
        # surface the tool's own diagnostics instead of dropping them silently
        print("stderr", stderr)

    return stdout, process.returncode

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(...)? (2173560176.py, line 22)

In [13]:
import tempfile

# process a single cnv
def process_cnv(chrom, start, end, type, tfp):
    """Realign the reads around both edges of one cnv with the jump aligner.

    Args:
        chrom: contig name of the cnv.
        start: cnv start position (string or int; converted with ``int``).
        end: cnv end position (string or int; converted with ``int``).
        type: cnv type column of the bed line (only echoed in debug output).
        tfp: fifth bed column (only echoed in debug output).

    Returns:
        A list with one entry per realigned read:
        ``[read, ref1_start, ref2_start, fields, in_ref1, in_ref2]`` where
        ``ref1_start``/``ref2_start`` are the start coordinates of the two
        reference windows, ``fields`` is the tab-split jump_align output line
        for the read, and ``in_ref1``/``in_ref2`` tell whether the read was
        fetched around the first/second cnv edge.

    Raises:
        RuntimeError: if ``run_process`` reports that jump_align could not be
            started (no output at all).
    """
    if debug: print("process_cnv: ", locals())

    # get all reads that cross the two cnv edges
    reads = dict()
    reads_in_ref = [set(), set()]
    refs_extents = []
    for ref_id, loc in enumerate([int(start), int(end)]):
        # region starts cannot be negative, so clamp windows of cnvs that sit
        # closer than FETCH_READ_PADDING to the beginning of the contig
        fetch_start = max(0, loc - FETCH_READ_PADDING)
        fetch_end = loc + FETCH_READ_PADDING
        rmin, rmax = fetch_start, fetch_end
        for read in reads_file.fetch(chrom, fetch_start, fetch_end):
            reads[read.qname] = read
            rmin = min(rmin, read.reference_start)
            # reference_end is None for reads without an alignment
            # (e.g. unmapped mates placed at their mate's position)
            if read.reference_end is not None:
                rmax = max(rmax, read.reference_end)
            reads_in_ref[ref_id].add(read.qname)
        refs_extents.append([rmin, rmax])
    # extend references before and after
    refs_extents[0][0] = max(0, refs_extents[0][0] - FETCH_REF_PADDING)
    refs_extents[1][1] = refs_extents[1][1] + FETCH_REF_PADDING
    if debug: print("found", len(reads), "reads")
    if debug: print("refs_extents", refs_extents)

    # get references: one [window_start, sequence] pair per cnv edge
    refs = [[rmin, fasta_file.fetch(chrom, rmin, rmax)] for rmin, rmax in refs_extents]

    # create input file for jump aligner: the first line carries the read and
    # both reference sequences, every following line carries a read and "="
    # (presumably "same references as before" - confirm against jump_align)
    reads_in_order = list(reads.values())
    with open(tmp, 'w') as f:
        for index, read in enumerate(reads_in_order):
            if index == 0:
                f.write(read.seq + "\t" + refs[0][1] + "\t" + refs[1][1] + "\n")
            else:
                f.write(read.seq + "\t=\n")
    if debug: print("tmp", tmp)

    # run jump_align
    stdout, returncode = run_process(JUMP_ALIGN_CMD, tmp)
    if stdout is None:
        raise RuntimeError("jump_align could not be run (return code " + str(returncode) + ")")

    # the first output line is a header; the following lines are paired with
    # the reads in the order they were written to the input file
    realignments = []
    for alignment, read in zip(stdout.split("\n")[1:], reads_in_order):
        in1 = read.qname in reads_in_ref[0]
        in2 = read.qname in reads_in_ref[1]
        realignments.append([read, refs[0][0], refs[1][0], alignment.split("\t"), in1, in2])
    return realignments


In [14]:
# open output sam file
print("OUT_SAM", OUT_SAM)

debug_qname = "036742_2-Z0027-0612964141"
debug_seen = False

# loop on bed file, write output bed file
# One `with` owns the output SAM, the output BED and the input BED, so the
# outputs are flushed and closed even if processing a region raises.
with pysam.AlignmentFile(OUT_SAM, "w", header=reads_file.header) as sam_file, \
        open(OUT_BED, "w") as out_bed, \
        open(CNV_BED) as f:
    for line in f:
        bed_line = line.strip().split()
        if not bed_line or bed_line[0].startswith("#"):
            # blank or comment line: there is no region to realign
            continue
        realignments = process_cnv(*bed_line)
        jump_read_written = 0
        jump_read_best = 0
        for realignment in realignments:
            read, ref1_start, ref2_start, ainfo, *in_ref = realignment
            if read.qname == debug_qname:
                print("ref_start1/2", ref1_start, ref2_start)
                print("ainfo", ainfo)
                print("in_ref", in_ref)
                debug_seen = True

            # write original read (JT=0 marks the untouched alignment)
            tags = read.tags
            read.tags = tags + [('JT', 0)]

            if not debug_output_debug_only or read.qname == debug_qname:
                sam_file.write(read)

            # decode alignment info: the jump alignment fields first, then
            # the plain alignment against each of the two references
            score, jumpInsertSize, jumpRange, \
                jbegin1, jcigar1, jreadlen1, jreflen1, \
                jbegin2, jcigar2, jreadlen2, jreflen2, \
                score1, begin1, cigar1, readlen1, reflen1, \
                score2, begin2, cigar2, readlen2, reflen2 = ainfo

            # build read aligned to references (JT=1 / JT=2), only for the
            # edge(s) the read was actually fetched around
            qname = read.qname
            for i in [1, 2]:
                if not in_ref[i - 1]:
                    continue
                read.qname = qname + "_REF" + str(i)
                read.tags = tags + [('JT', i)]
                if i == 1:
                    read.cigarstring = cigar1
                    read.reference_start = ref1_start + int(begin1)
                    read.tags += [('JS', int(score1))]
                else:
                    read.cigarstring = cigar2
                    read.reference_start = ref2_start + int(begin2)
                    read.tags += [('JS', int(score2))]
                if not debug_output_debug_only or qname == debug_qname:
                    sam_file.write(read)

            # build read aligned to both references with a jump in the middle
            # (JT=3); both sides must cover enough of the read
            if len(jcigar1) and len(jcigar2) \
                    and int(jreadlen1) >= MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT \
                    and int(jreadlen2) >= MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT:
                end_on_ref1 = ref1_start + int(jbegin1) + int(jreflen1)
                start_on_ref2 = ref2_start + int(jbegin2)
                # reference bases skipped between the two aligned parts
                delta = start_on_ref2 - end_on_ref1
                read.qname = qname + "_JUMP"
                new_cigar = jcigar1
                if int(jumpInsertSize):
                    # bases inserted at the jump are emitted as an insertion
                    new_cigar += jumpInsertSize + "I"
                if delta > 0:
                    new_cigar += str(delta) + "D"
                if delta >= 0:
                    # a negative delta (second part starts before the first
                    # one ends) cannot be expressed in one cigar: skip it
                    new_cigar += jcigar2
                    read.cigarstring = new_cigar
                    read.reference_start = ref1_start + int(jbegin1)
                    read.tags = tags + [('JT', 3)]
                    read.tags += [('JS', int(score))]
                    if not debug_output_debug_only or qname == debug_qname:
                        sam_file.write(read)
                        jump_read_written += 1
                        if int(score) > int(score1) and int(score) > int(score2):
                            jump_read_best += 1
                    if qname == debug_qname:
                        print("jump new_cigar", new_cigar)

        if debug: print("jump_read_written", jump_read_written, "jump_read_best", jump_read_best)
        # strip only the line terminator: slicing off the last character
        # would eat real data when the final bed line has no newline
        out_bed.write(line.rstrip("\n") + "\t" + str(jump_read_written) + "\t" + str(jump_read_best) + "\n")

        if debug_break_after_one_region:
            break
        if debug_break_after_debug_seen and debug_seen:
            break

# convert to sorted bam
# The commands are passed as argument lists (no shell), so output paths that
# contain spaces or shell metacharacters are handled safely. check=True stops
# on a samtools failure instead of indexing a missing or truncated bam.
for cmd in (["samtools", "sort", "-o", OUT_BAM, OUT_SAM],
            ["samtools", "index", OUT_BAM]):
    print("cmd", " ".join(cmd))
    subprocess.run(cmd, check=True)

OUT_SAM /tmp/chr22_cnv_realign.95266.sam
process_cnv:  {'chrom': 'chr22', 'start': '22220500', 'end': '22221500', 'type': 'CN6', 'tfp': 'FP'}
found 336 reads
refs_extents [[22218179, 22221303], [22220666, 22223826]]
tmp /tmp/jump_align_input.95266
command ['docker', 'run', '-i', 'jump_align_dev', '/bin/bash', '-c', 'jump_align 2 -8 -18 -1 -1 0 /tmp/jump_align_input.95266']
jump_read_written 0 jump_read_best 0
process_cnv:  {'chrom': 'chr22', 'start': '22628000', 'end': '22629000', 'type': 'CN3', 'tfp': 'FP'}
found 380 reads
refs_extents [[22625686, 22628815], [22628182, 22631298]]
tmp /tmp/jump_align_input.95266
command ['docker', 'run', '-i', 'jump_align_dev', '/bin/bash', '-c', 'jump_align 2 -8 -18 -1 -1 0 /tmp/jump_align_input.95266']
jump_read_written 0 jump_read_best 0
process_cnv:  {'chrom': 'chr22', 'start': '22630000', 'end': '22631000', 'type': 'CN3', 'tfp': 'FP'}
found 323 reads
refs_extents [[22627695, 22630829], [22630191, 22633314]]
tmp /tmp/jump_align_input.95266
command 

0