In [215]:
# cnv_realign - using detected cnv areas, realign cram file using jump aligner around cnv edges

In [216]:
# argument processing + debug environment files
import sys
import os


def sam_to_bam_path(sam_path):
    """Return the BAM path that corresponds to `sam_path`.

    Only a trailing ".sam" extension is swapped for ".bam"; a ".sam" that
    appears elsewhere in the path (e.g. in a directory name) is left alone.
    Any other name simply gets ".bam" appended, so the result is never equal
    to the input path and sorting can never overwrite its own input.
    """
    if sam_path.endswith(".sam"):
        return sam_path[:-len(".sam")] + ".bam"
    return sam_path + ".bam"


if "cnv_realign" in sys.argv[0]:
    # commandline invocation
    if len(sys.argv) < 5:
        sys.exit("usage: cnv_realign <in.cram> <cnv.bed> <ref.fasta> <out.sam>")
    IN_CRAM = sys.argv[1]
    CNV_BED = sys.argv[2]
    REF_FASTA = sys.argv[3]
    OUT_SAM = sys.argv[4]
else:
    # interactive (notebook) run: fall back to the debug data set
    IN_CRAM = "data/chr22_1M.cram"
    CNV_BED = "data/chr22.bed"
    REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")
    OUT_SAM = "/tmp/chr22_cnv_realign." + str(os.getpid()) + ".sam"

# when True, progress information is printed while processing
debug = True
# number of bases fetched on each side of a cnv edge, for reads and for reference
FETCH_READ_PADDING = 500
FETCH_REF_PADDING = 500

OUT_BAM = sam_to_bam_path(OUT_SAM)


# jump aligner invocation; its input is fed on stdin (docker run -i)
JUMP_ALIGN_CMD = ["docker", "run", "-i", "jump_align_dev", "/bin/bash", "-c", "jump_align 10 -25 -50 -10 -5 0"]

CNV_BED

'data/chr22.bed'

In [217]:
# open files
import pysam
# Decode the CRAM against the configured FASTA instead of relying on
# REF_PATH / the header UR entry, so the run does not depend on the environment.
reads_file = pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA)
# Same FASTA, used to extract the reference windows around each cnv edge.
fasta_file = pysam.FastaFile(REF_FASTA)

In [218]:
import subprocess

def run_process(command, input_path):
  """Run `command`, feeding it the contents of `input_path` on stdin.

  Args:
    command: argument list passed to subprocess.Popen.
    input_path: path of a text file whose content is sent to the child's stdin.

  Returns:
    (stdout, returncode): the child's captured standard output as text and its
    exit status. The child's standard error is printed.

  Raises:
    OSError: if the input file cannot be read or the command cannot be started.
  """
  with open(input_path, 'r') as file:
    input_data = file.read()

  # Popen as a context manager closes the pipes and reaps the child on any exit path.
  with subprocess.Popen(command,
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True) as process:
    # Hand the input to communicate() rather than writing to stdin first:
    # it feeds stdin while draining stdout/stderr, so a child that produces
    # output before it has consumed all of its input cannot deadlock us.
    stdout, stderr = process.communicate(input=input_data)

  returncode = process.returncode
  print(stderr)

  return stdout, returncode

In [219]:
import tempfile

# process a single cnv
def process_cnv(chrom, start, end, type):
    """Realign the reads around both edges of one cnv with the jump aligner.

    Args:
        chrom: contig name of the cnv.
        start, end: cnv edge coordinates (strings or ints; converted with int()).
        type: cnv type column from the bed file; only shown in the debug print.

    Returns:
        A list with one entry per realigned read:
        [read, ref1_start, ref2_start, fields], where ref1_start/ref2_start are
        the reference coordinates at which the two reference windows begin and
        fields is the tab-split jump_align output line paired with that read.
        Empty when no reads were found.
    """
    if debug: print("process_cnv: ", locals())

    # get all reads that cross the two cnv edges
    # (keyed by name, so records sharing a name collapse to the last one seen)
    reads = dict()
    refs = []
    for loc in [int(start), int(end)]:
        # clamp at 0 so a cnv close to the contig start does not request a negative coordinate
        ref_start = max(0, loc - FETCH_REF_PADDING)
        ref = fasta_file.fetch(chrom, ref_start, loc + FETCH_REF_PADDING)
        refs.append([ref_start, ref])
        read_start = max(0, loc - FETCH_READ_PADDING)
        for read in reads_file.fetch(chrom, read_start, loc + FETCH_READ_PADDING):
            reads[read.qname] = read
    if debug: print("found", len(reads), "reads")

    reads_in_order = list(reads.values())
    if not reads_in_order:
        # nothing to align - do not launch the aligner on an empty input
        return []

    # create input file for jump aligner: the first line carries the read and
    # both reference windows, the following lines carry a read and "="
    # (presumably "reuse the previous references" - confirm against jump_align)
    with tempfile.NamedTemporaryFile(mode="w", prefix="jump_align_input.", delete=False) as f:
        tmp = f.name
        for index, read in enumerate(reads_in_order):
            if index == 0:
                line = read.seq + "\t" + refs[0][1] + "\t" + refs[1][1] + "\n"
            else:
                line = read.seq + "\t=\n"
            f.write(line)
    if debug: print("tmp", tmp)

    # run jump_align
    try:
        stdout, returncode = run_process(JUMP_ALIGN_CMD, tmp)
    finally:
        # keep the input around for inspection only when debugging
        if not debug:
            os.remove(tmp)
    if returncode != 0:
        print("warning: jump_align exited with status", returncode)

    # the first output line is a header; every following line is paired with
    # the read written at the same position of the input file
    output_lines = (stdout or "").split("\n")
    realignments = []
    for alignment, read in zip(output_lines[1:], reads_in_order):
        realignments.append([read, refs[0][0], refs[1][0], alignment.split("\t")])
    return realignments


In [224]:
# open output sam file
print("OUT_SAM", OUT_SAM)

# read name for which extra alignment details are printed
debug_qname = "036742_1-Z0027-3443492434"

# The context managers close the sam and bed files even when a cnv fails midway.
with pysam.AlignmentFile(OUT_SAM, "w", header=reads_file.header) as sam_file, \
        open(CNV_BED) as f:
    # loop on bed file
    for line in f:
        bed_line = line.strip().split()
        # skip blank lines and bed comment/header lines
        if not bed_line or bed_line[0].startswith("#") or bed_line[0] in ("track", "browser"):
            continue
        # only chrom, start, end and type are used; extra columns are ignored
        realignments = process_cnv(*bed_line[:4])
        for realignment in realignments:
            read, ref1_start, ref2_start, ainfo = realignment
            qname = read.query_name
            if qname == debug_qname:
                print("ref_start1/2", ref1_start, ref2_start)
                print("ainfo", ainfo)

            # The same read object is mutated and written several times; each
            # write happens before the next mutation. The JT tag tells the
            # records apart: 0 = original, 1/2 = aligned to reference window
            # 1/2, 3 = jump alignment spanning both windows.

            # write original read
            read.set_tag('JT', 0)
            sam_file.write(read)

            # decode alignment info
            score, jumpInsertSize, jumpRange, \
                jbegin1, jcigar1, jreadlen1, jreflen1, \
                jbegin2, jcigar2, jreadlen2, jreflen2, \
                score1, begin1, cigar1, readlen1, reflen1, \
                score2, begin2, cigar2, readlen2, reflen2 = ainfo

            # build read aligned to references
            per_reference = [
                (1, cigar1, ref1_start + int(begin1)),
                (2, cigar2, ref2_start + int(begin2)),
            ]
            for i, cigar, position in per_reference:
                read.query_name = qname + "_REF" + str(i)
                read.cigarstring = cigar
                read.reference_start = position
                read.set_tag('JT', i)
                sam_file.write(read)

            # build read aligned to both references with a jump in the middle
            end_on_ref1 = ref1_start + int(jbegin1) + int(jreflen1)
            start_on_ref2 = ref2_start + int(jbegin2)
            delta = start_on_ref2 - end_on_ref1
            if delta < 0:
                # the second part starts before the first one ends (e.g. the two
                # windows overlap); this cannot be expressed as a deletion
                if debug: print("skipping jump record for", qname, "- negative jump", delta)
                continue
            new_cigar = jcigar1
            if int(jumpInsertSize):
                new_cigar += jumpInsertSize + "I"
            if delta > 0:
                # the jump itself is represented as a deletion between the two parts
                new_cigar += str(delta) + "D"
            new_cigar += jcigar2
            read.query_name = qname + "_JUMP"
            read.cigarstring = new_cigar
            read.reference_start = ref1_start + int(jbegin1)
            read.set_tag('JT', 3)
            sam_file.write(read)

# convert to sorted bam
# Argument lists (no shell) keep paths with spaces or metacharacters intact;
# check=True stops here if samtools fails instead of indexing a broken file.
cmd = ["samtools", "sort", "-o", OUT_BAM, OUT_SAM]
print("cmd", " ".join(cmd))
subprocess.run(cmd, check=True)
cmd = ["samtools", "index", OUT_BAM]
print("cmd", " ".join(cmd))
# last expression of the cell: shows the exit status of samtools index
subprocess.run(cmd, check=True).returncode

OUT_SAM /tmp/chr22_cnv_realign.78608.sam
process_cnv:  {'chrom': 'chr22', 'start': '10510000', 'end': '10512000', 'type': 'CN1'}
found 42 reads
tmp /tmp/jump_align_input.78608

ref_start1/2 10509500 10511500
ainfo ['420', '3', '0', '846', '9=1X85=1D2=1X55=', '153', '154', '993', '4=2I1D2=186S', '194', '7', '415', '846', '9=1X85=1D2=1X55=197S', '350', '154', '-1735', '999', '1=349S', '350', '1']
process_cnv:  {'chrom': 'chr22', 'start': '10513000', 'end': '10514000', 'type': 'CN0'}
found 59 reads
tmp /tmp/jump_align_input.78608

process_cnv:  {'chrom': 'chr22', 'start': '10514500', 'end': '10517000', 'type': 'CN1'}
found 88 reads
tmp /tmp/jump_align_input.78608

process_cnv:  {'chrom': 'chr22', 'start': '10517500', 'end': '10518500', 'type': 'CN0'}
found 125 reads
tmp /tmp/jump_align_input.78608

process_cnv:  {'chrom': 'chr22', 'start': '10522500', 'end': '10524500', 'type': 'CN0'}
found 158 reads
tmp /tmp/jump_align_input.78608

process_cnv:  {'chrom': 'chr22', 'start': '10526000', 'e

0