In [13]:
# cnv_realign - using detected cnv areas, realign cram file using jump aligner around cnv edges

In [14]:
# argument processing + debug environment files
import sys
import os
def derive_bam_path(sam_path):
    """Return the BAM path that corresponds to the SAM output path ``sam_path``.

    Only a trailing ".sam" is swapped for ".bam"; a ".sam" substring elsewhere
    in the path (e.g. a directory called "x.sample") is left alone.  When the
    path does not end in ".sam", ".bam" is appended so the result never equals
    the input path (the sort step would otherwise overwrite its own input).
    """
    suffix = ".sam"
    stem = sam_path[:-len(suffix)] if sam_path.endswith(suffix) else sam_path
    return stem + ".bam"


if "cnv_realign" in sys.argv[0]:
    # commandline invocation
    if len(sys.argv) < 5:
        sys.exit("usage: cnv_realign <in.cram> <cnv.bed> <ref.fasta> <out.sam>")
    IN_CRAM = sys.argv[1]
    CNV_BED = sys.argv[2]
    REF_FASTA = sys.argv[3]
    OUT_SAM = sys.argv[4]
else:
    # interactive (notebook) run: fall back to the bundled debug data set
    IN_CRAM = "data/chr22_1M.cram"
    CNV_BED = "data/chr22.bed"
    REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")
    OUT_SAM = "/tmp/chr22_cnv_realign." + str(os.getpid()) + ".sam"

# padding (bp) applied on each side of a cnv edge when fetching reads
FETCH_READ_PADDING = 500
# extra padding (bp) added before the first and after the second reference window
FETCH_REF_PADDING = 500
# minimal read length each side of a jump must cover for the jump record to be written
MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT = 30

OUT_BAM = derive_bam_path(OUT_SAM)


# jump aligner invocation; the read/reference lines are fed on stdin
JUMP_ALIGN_CMD = ["docker", "run", "-i", "jump_align_dev", "/bin/bash", "-c", "jump_align 10 -25 -50 -10 -5 0"]

# debug switches
debug = True
debug_break_after_one_region = False
debug_break_after_debug_seen = False
debug_output_debug_only = False



CNV_BED

'data/chr22.bed'

In [15]:
# open files
import pysam
# Pass the reference explicitly so CRAM decoding uses the same FASTA that the
# realignment windows are cut from, rather than whatever reference htslib
# would otherwise locate on its own.
reads_file = pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA)
fasta_file = pysam.FastaFile(REF_FASTA)

In [16]:
import subprocess

def run_process(command, input_path):
    """Run ``command`` with the contents of ``input_path`` fed to its stdin.

    Args:
        command: argument list handed to ``subprocess.Popen``.
        input_path: path of a text file whose whole content becomes the
            child's standard input.

    Returns:
        A ``(stdout, returncode)`` tuple; ``stdout`` is the child's captured
        standard output as text.  The captured standard error is printed.

    Raises:
        OSError: if ``input_path`` cannot be read or the command cannot be
            started.
    """
    with open(input_path, 'r') as file:
        input_data = file.read()

    process = subprocess.Popen(command,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               text=True)

    # Hand the input to communicate() instead of writing to stdin directly:
    # a direct write blocks once the child stops reading because its own
    # stdout/stderr pipe is full, and nobody drains those pipes until
    # communicate() runs -> deadlock on large inputs.
    stdout, stderr = process.communicate(input=input_data)
    print(stderr)

    return stdout, process.returncode

In [17]:
import tempfile

# process a single cnv
# process a single cnv
def process_cnv(chrom, start, end, type, tfp):
    """Realign the reads around both edges of one CNV with the jump aligner.

    Reads overlapping a window of ``FETCH_READ_PADDING`` bp around ``start``
    and around ``end`` are collected (keyed by query name, so a later read
    with the same name replaces an earlier one).  Two reference windows, one
    per edge, are widened to cover every fetched read and extended by
    ``FETCH_REF_PADDING`` on the outer sides.  Reads and references are
    written to a temporary file and piped through ``JUMP_ALIGN_CMD``.

    Args:
        chrom: contig name.
        start, end: CNV edge coordinates (anything ``int()`` accepts).
        type, tfp: further BED columns; not used here beyond the debug print.

    Returns:
        A list with one entry per aligned read:
        ``[read, ref1_start, ref2_start, fields, in_ref1, in_ref2]`` where
        ``fields`` is the tab-split aligner output line and ``in_ref1`` /
        ``in_ref2`` tell in which edge window the read was fetched.

    Raises:
        RuntimeError: if the aligner produced no output at all.
    """
    if debug: print("process_cnv: ", locals())

    # get all reads that cross the two cnv edges
    reads = dict()
    reads_in_ref = [set(), set()]
    refs_extents = []
    for ref_id, loc in enumerate([int(start), int(end)]):
        # clamp at 0: a negative start is not a valid region near the contig start
        fetch_start = max(0, loc - FETCH_READ_PADDING)
        fetch_end = loc + FETCH_READ_PADDING
        rmin, rmax = fetch_start, fetch_end
        for read in reads_file.fetch(chrom, fetch_start, fetch_end):
            reads[read.qname] = read
            rmin = min(rmin, read.reference_start)
            # reference_end is None for reads without an alignment (e.g. an
            # unmapped read placed at its mate's position)
            if read.reference_end is not None:
                rmax = max(rmax, read.reference_end)
            reads_in_ref[ref_id].add(read.qname)
        refs_extents.append([max(0, rmin), rmax])
    # extend references before and after
    refs_extents[0][0] = max(0, refs_extents[0][0] - FETCH_REF_PADDING)
    refs_extents[1][1] = refs_extents[1][1] + FETCH_REF_PADDING
    if debug: print("found", len(reads), "reads")
    if debug: print("refs_extents", refs_extents)

    # nothing to align: do not launch the aligner on an empty input
    if not reads:
        return []

    # get references
    refs = [[rmin, fasta_file.fetch(chrom, rmin, rmax)] for rmin, rmax in refs_extents]
    #if debug: print("refs", refs)

    # create input file for jump aligner: the first line carries both
    # references, later lines carry "=" in their place (presumably "same
    # references as before" -- confirm against the jump_align input format)
    reads_in_order = list(reads.values())
    fd, tmp = tempfile.mkstemp(prefix="jump_align_input.", suffix="." + str(os.getpid()))
    try:
        with os.fdopen(fd, 'w') as f:
            for index, read in enumerate(reads_in_order):
                if index == 0:
                    line = read.seq + "\t" + refs[0][1] + "\t" + refs[1][1] + "\n"
                else:
                    line = read.seq + "\t=\n"
                f.write(line)
        if debug: print("tmp", tmp)

        # run jump_align
        stdout = run_process(JUMP_ALIGN_CMD, tmp)[0]
    finally:
        # keep the input around for inspection only when debugging
        if not debug:
            os.remove(tmp)

    if stdout is None:
        raise RuntimeError("jump_align produced no output for " + str(chrom)
                           + ":" + str(start) + "-" + str(end))

    # the first output line is a header; the following lines pair up with the
    # reads in the order they were written
    realignments = []
    for alignment, read in zip(stdout.split("\n")[1:], reads_in_order):
        a = alignment.split("\t")
        in1 = read.qname in reads_in_ref[0]
        in2 = read.qname in reads_in_ref[1]
        realignments.append([read, refs[0][0], refs[1][0], a, in1, in2])
    return realignments


In [18]:
# open output sam file
print("OUT_SAM", OUT_SAM)
sam_file = pysam.AlignmentFile(OUT_SAM, "w", header=reads_file.header)

# query name of one read to trace through the pipeline (debug aid)
debug_qname = "036742_2-Z0027-1180875374"
debug_seen = False

# For every read returned by process_cnv up to four records are written,
# told apart by the JT tag:
#   JT=0  the original read, untouched
#   JT=1  the read aligned to the first reference window  (qname + "_REF1")
#   JT=2  the read aligned to the second reference window (qname + "_REF2")
#   JT=3  the read aligned across both windows with a jump (qname + "_JUMP")
# JS carries the aligner score of records 1-3.
try:
    # loop on bed file
    with open(CNV_BED) as f:
        for line in f:
            bed_line = line.strip().split()
            # blank lines and BED comment/header lines carry no cnv
            if not bed_line or bed_line[0].startswith("#") or bed_line[0] in ("track", "browser"):
                continue
            realignments = process_cnv(*bed_line)
            for realignment in realignments:
                in_ref = [False, False]
                read, ref1_start, ref2_start, ainfo, in_ref[0], in_ref[1] = realignment
                if read.qname == debug_qname:
                    print("ref_start1/2", ref1_start, ref2_start)
                    print("ainfo", ainfo)
                    print("in_ref", in_ref)
                    debug_seen = True

                # write original read
                tags = read.tags
                read.tags = tags + [('JT', 0)]

                if not debug_output_debug_only or read.qname == debug_qname:
                    sam_file.write(read)

                # decode alignment info
                score, jumpInsertSize, jumpRange, \
                    jbegin1, jcigar1, jreadlen1, jreflen1, \
                    jbegin2, jcigar2, jreadlen2, jreflen2, \
                    score1, begin1, cigar1, readlen1, reflen1, \
                    score2, begin2, cigar2, readlen2, reflen2 = ainfo

                # build read aligned to references (only for the windows the
                # read was actually fetched from); the same read object is
                # re-used and overwritten for every record
                qname = read.qname
                for i in [1,2]:
                    if not in_ref[i-1]:
                        continue
                    read.qname = qname + "_REF" + str(i)
                    read.tags = tags + [('JT', i)]
                    if i == 1:
                        read.cigarstring = cigar1
                        read.reference_start = ref1_start + int(begin1)
                        read.tags += [('JS', int(score1))]
                    else:
                        read.cigarstring = cigar2
                        read.reference_start = ref2_start + int(begin2)
                        read.tags += [('JS', int(score2))]
                    if not debug_output_debug_only or qname == debug_qname:
                        sam_file.write(read)

                # build read aligned to both references with a jump in the middle
                if len(jcigar1) and len(jcigar2) \
                        and int(jreadlen1) >= MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT \
                        and int(jreadlen2) >= MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT:
                    end_on_ref1 = ref1_start + int(jbegin1) + int(jreflen1)
                    start_on_ref2 = ref2_start + int(jbegin2)
                    # reference distance skipped by the jump, emitted as a deletion
                    delta = start_on_ref2 - end_on_ref1
                    read.qname = qname + "_JUMP"
                    new_cigar = jcigar1
                    if int(jumpInsertSize):
                        new_cigar += jumpInsertSize + "I"
                    # NOTE(review): a non-positive delta (second part starting
                    # at or before the end of the first) adds no operator here
                    # -- confirm that is the intended representation
                    if delta > 0:
                        new_cigar += str(delta) + "D"
                    new_cigar += jcigar2
                    read.cigarstring = new_cigar
                    read.reference_start = ref1_start + int(jbegin1)
                    read.tags = tags + [('JT', 3)]
                    read.tags += [('JS', int(score))]
                    if not debug_output_debug_only or qname == debug_qname:
                        sam_file.write(read)
                    if qname == debug_qname:
                        print("jump new_cigar", new_cigar)

            if debug_break_after_one_region:
                break
            if debug_break_after_debug_seen and debug_seen:
                break
finally:
    # always flush/close the sam, also when a region fails half way
    sam_file.close()

# convert to sorted bam; argument lists (no shell) keep paths with spaces or
# shell metacharacters intact, and check=True stops on a failed step instead
# of indexing a missing/broken bam
cmd = ["samtools", "sort", "-o", OUT_BAM, OUT_SAM]
print("cmd", " ".join(cmd))
subprocess.run(cmd, check=True)
cmd = ["samtools", "index", OUT_BAM]
print("cmd", " ".join(cmd))
subprocess.run(cmd, check=True)

OUT_SAM /tmp/chr22_cnv_realign.6507.sam
process_cnv:  {'chrom': 'chr22', 'start': '22220500', 'end': '22221500', 'type': 'CN6', 'tfp': 'FP'}
found 336 reads
refs_extents [[22219179, 22221303], [22220666, 22222826]]
tmp /tmp/jump_align_input.6507

ref_start1/2 22219179 22220666
ainfo ['100', '8', '0', '0', '178S1=1X4=1X1=2X3=', '191', '13', '2', '108=', '108', '108', '-680', '1373', '1=14I3=3I1D2=1X4=2I4=2I4=3X1=1X1=3D4=1I4=21I2=3X2=1X3=1X2=1X3=1X1=2I1=1X1=3X2=7I2=2X1=1X1=1X1=2X2=3X2=11I3=1X1=4I4=6I3=2X1=1X2=19I2D2=1X3=3D1=1X1=1X108=', '307', '224', '80', '0', '197S1=1X108=', '307', '110']
in_ref [True, False]
jump new_cigar 178S1=1X4=1X1=2X3=8I1476D108=
process_cnv:  {'chrom': 'chr22', 'start': '22628000', 'end': '22629000', 'type': 'CN3', 'tfp': 'FP'}
found 380 reads
refs_extents [[22626686, 22628815], [22628182, 22630298]]
tmp /tmp/jump_align_input.6507

process_cnv:  {'chrom': 'chr22', 'start': '22630000', 'end': '22631000', 'type': 'CN3', 'tfp': 'FP'}
found 323 reads
refs_extents [

0