In [1]:
# cnv_realign - using detected cnv areas, realign cram file using jump aligner around cnv edges

In [11]:
# argument processing + debug environment files
import sys
import os

# ---- configuration -----------------------------------------------------------
# Realignment mode. The visible code only special-cases "DUP" (see process_cnv).
MODE = "DUP"

# Scoring parameters passed on the command line of the external jump aligner.
MATCH_SCORE = 2
MISMATCH_SCORE = -8
OPEN_SCORE = -18
EXTEND_SCORE = -1
JUMP_SCORE = 0

# accept_read(): minimum (soft-clip mismatches + NM) for a read to be kept;
# a value <= 0 disables that filter.
MIN_MISMATCHES = 30
# is_substential_softclipped(): minimum soft-clip length at either read end.
SOFTCLIP_THRESHOLD = 30
# process_cnv(): half-width of the window fetched around each CNV edge.
FETCH_READ_PADDING = 500
# process_cnv(): extra reference bases added before the first / after the second window.
FETCH_REF_PADDING = 0
# NOTE(review): the next two constants are not referenced in the visible cells.
MIN_SEQ_LEN_JUMP_ALIGN_COMPONENT = 30
MIN_GAP_LEN = 30
# process_cnv(): above this many collected reads, reads are randomly subsampled.
MAX_READS_PER_CNV = 4000

# Set to True in the development branch below.
GCLOUD_AUTH = False        # fetch a gcloud access token before opening the input
INPUT_NAME_FULL = False    # append the CNV coordinates to the aligner input file name

print(sys.argv)
# Per-process path of the jump aligner input file; also the last aligner argument.
tmp = "/tmp/jump_align_input." + str(os.getpid())
# NOTE(review): the meaning of the literal "-1" argument is defined by para_jalign — confirm.
JUMP_ALIGN_CMD = ["para_jalign", str(MATCH_SCORE), str(MISMATCH_SCORE), str(OPEN_SCORE), str(EXTEND_SCORE), "-1", str(JUMP_SCORE), tmp]

if "dup_cnv_realign" in sys.argv[0] or "stdin" in sys.argv[0]:
    if len(sys.argv) < 5:
        print("usage: " + sys.argv[0] + " <input-cram> <range-bed> <ref-fasta> <output-prefix> [<mode>] [<min-mismatches>]\n")
        sys.exit(-1)
    # commandline invocation
    IN_CRAM = sys.argv[1]
    CNV_BED = sys.argv[2]
    REF_FASTA = sys.argv[3]
    OUT_SAM = sys.argv[4] + ".sam"
    if len(sys.argv) >= 6:
        MODE = sys.argv[5]
    if len(sys.argv) >= 7:
        MIN_MISMATCHES = int(sys.argv[6])
else:
    # notebook / development invocation: hard-coded debug inputs
    REF_FASTA = os.path.expanduser("~/tmp/ref/Homo_sapiens_assembly38.fasta")
    OUT_SAM = "/tmp/cnv_realign." + str(os.getpid()) + ".sam"

    # DUP development
    IN_CRAM = "gs://ug-cromwell-tests/structural_variant/030945-NA24385-Z0114-CAACATACATCAGAT.cram"
    CNV_BED = os.path.expanduser("~/tmp/data/jump_align/251020/dups.bed")

    # in development the aligner binary is run from the local jump_align/ directory
    JUMP_ALIGN_CMD = ["jump_align/" + JUMP_ALIGN_CMD[0]] + JUMP_ALIGN_CMD[1:]

    GCLOUD_AUTH = True
    INPUT_NAME_FULL = True


# Output BED path: same as OUT_SAM with ".sam" replaced by ".bed".
OUT_BED = OUT_SAM.replace(".sam", ".bed")

CNV_BED

['/Users/drorkessler/miniconda3/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/Users/drorkessler/Library/Jupyter/runtime/kernel-71178e2b-e0de-4d42-9c55-bcaa02a6849f.json']


'/Users/drorkessler/tmp/data/jump_align/251020/dups.bed'

In [12]:
import os
import subprocess

if GCLOUD_AUTH:

    # Ask the gcloud CLI for an access token; check_output raises
    # CalledProcessError if the command exits with a non-zero status.
    token = subprocess.check_output(
        ["gcloud", "auth", "print-access-token"], text=True
    ).strip()

    # Export the token to the current process environment.
    # NOTE(review): presumably read by the library that opens the gs:// input — confirm.
    os.environ["GCS_OAUTH_TOKEN"] = token

    # Confirm without echoing any part of the credential: notebook outputs are
    # saved with the file and are easily shared by accident.
    print("GCS_OAUTH_TOKEN set (%d characters)" % len(token))

GCS_OAUTH_TOKEN set to: ya29.a0ATi...


In [13]:
# open files
import pysam
# Alignment input, opened in "rb" mode with REF_FASTA supplied as the reference file.
# NOTE(review): neither handle opened here is closed in the visible cells.
reads_file = pysam.AlignmentFile(IN_CRAM, "rb", reference_filename=REF_FASTA)
# Indexed reference accessor; used below to fetch reference bases by coordinate.
fasta_file = pysam.FastaFile(REF_FASTA)

In [14]:
import subprocess

def run_process(command, input_path):
  """
  Run an external command to completion and capture its output.

  `command` is the argv list to execute; it is echoed to stdout before running.
  `input_path` is accepted for interface compatibility and is not used here
  (the caller already places the input path inside `command`).

  Returns a `(stdout, returncode)` tuple, where stdout is decoded text.
  stderr is captured and discarded. A non-zero exit status is NOT an error
  here: it is simply reported through `returncode`. Failure to launch the
  command (e.g. executable not found) propagates as OSError.
  """
  print("command", " ".join(command))

  # The child needs no input, so give it an immediate EOF on stdin instead of
  # an open pipe that is never written to.
  completed = subprocess.run(command,
                             stdin=subprocess.DEVNULL,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             text=True)

  return completed.stdout, completed.returncode

In [15]:
import re

def count_md_mismatches(read):
    """
    Count the number of mismatches in a pysam.AlignedSegment read using the MD tag.
    Indels are not counted.

    Returns None when the read carries no MD tag.
    """
    try:
        md_tag = read.get_tag("MD")
    except KeyError:
        return None

    # In the MD tag a deletion is written as '^' followed by the deleted
    # reference bases. Remove those runs first, otherwise every deleted base
    # would be counted as a mismatch.
    md_without_deletions = re.sub(r"\^[A-Z]+", "", md_tag)

    # Every remaining letter is a mismatched reference base
    return len(re.findall(r"[A-Z]", md_without_deletions))

def count_nm_mismatches(read):
    """
    Return the value of the read's NM tag as an int (the tag counts indels
    as well as mismatches), or None if the read has no NM tag.
    `read` is a pysam.AlignedSegment
    """
    try:
        return int(read.get_tag("NM"))
    except KeyError:
        # tag not present on this read
        return None

def count_softclip_mismatches(read, reference):
    """
    Count mismatches in soft-clipped regions (both left and right) of a read.
    `read` is a pysam.AlignedSegment
    `reference` is a pysam.FastaFile

    The clipped bases are compared with the reference bases directly adjacent
    to the aligned part of the read. Clipped bases that extend past the
    fetched reference (e.g. before position 0) are not counted.

    Returns 0 for unmapped reads and for reads without a CIGAR or a sequence.
    """
    if read.is_unmapped:
        return 0

    seq = read.query_sequence
    cigartuples = read.cigartuples
    if not seq or not cigartuples:
        # nothing to compare
        return 0

    # CIGAR operations
    SOFT_CLIP = 4
    HARD_CLIP = 5

    def _edge_softclip_len(ops):
        # Length of the soft clip at the edge where `ops` starts. A hard clip
        # may sit outside the soft clip (e.g. 5H10S50M); hard-clipped bases
        # are not part of the read sequence, so they are skipped.
        for op, length in ops:
            if op == HARD_CLIP:
                continue
            return length if op == SOFT_CLIP else 0
        return 0

    def _count_diffs(ref_bases, query_bases):
        # case-insensitive, position-by-position comparison
        return sum(1 for rb, qb in zip(ref_bases, query_bases) if rb.upper() != qb.upper())

    ref_name = read.reference_name
    start = read.reference_start
    end = read.reference_end
    mismatches = 0

    # Left soft clip
    left_len = _edge_softclip_len(cigartuples)
    if left_len > 0 and start > 0:
        clipped_bases = seq[:left_len]
        ref_bases = reference.fetch(ref_name, max(0, start - left_len), start)
        # Near the contig start fewer reference bases than clipped bases are
        # available. The bases that do exist are the ones adjacent to the
        # alignment, so compare them with the END of the clipped sequence.
        clipped_bases = clipped_bases[max(0, len(clipped_bases) - len(ref_bases)):]
        mismatches += _count_diffs(ref_bases, clipped_bases)

    # Right soft clip
    right_len = _edge_softclip_len(reversed(cigartuples))
    if right_len > 0:
        clipped_bases = seq[-right_len:]
        ref_bases = reference.fetch(ref_name, end, end + right_len)
        # A short fetch at the contig end is already aligned at its left edge.
        mismatches += _count_diffs(ref_bases, clipped_bases)

    return mismatches

# Module-level lists for MD / NM / soft-clip values.
# NOTE(review): nothing in the visible cells appends to or reads these lists —
# possibly leftovers from exploration; confirm before removing.
md_values = []
nm_values = []
sc_values = []

def accept_read(read):
    """
    Decide whether a read differs enough from the reference to be realigned.

    A read is accepted when its soft-clip mismatches plus its NM tag value
    reach MIN_MISMATCHES. When MIN_MISMATCHES <= 0 every read is accepted.
    `read` is a pysam.AlignedSegment
    """
    if MIN_MISMATCHES <= 0:
        return True
    sc = count_softclip_mismatches(read, fasta_file)
    nm = count_nm_mismatches(read)
    if nm is None:
        # no NM tag on this read: only the soft-clip mismatches count
        nm = 0
    return (sc + nm) >= MIN_MISMATCHES


In [16]:
import tempfile
import random

# Fixed seed: process_cnv draws from random.random() when it subsamples reads,
# so the same input in the same order yields the same subsample.
random.seed(0)

# is read soft clipped
def is_softclipped(read):
    """
    Return True if the first or the last CIGAR operation of the read is a soft clip.
    Reads without a CIGAR (cigartuples is None or empty) are reported as not clipped.
    `read` is a pysam.AlignedSegment
    """
    cigar = read.cigartuples
    if not cigar:
        return False
    return cigar[0][0] == pysam.CSOFT_CLIP or cigar[-1][0] == pysam.CSOFT_CLIP

def is_substential_softclipped(read):
    """
    Return True if the read starts or ends with a soft clip of at least
    SOFTCLIP_THRESHOLD bases.
    Reads without a CIGAR (cigartuples is None or empty) are reported as not clipped.
    `read` is a pysam.AlignedSegment
    """
    cigar = read.cigartuples
    if not cigar:
        return False
    first_op, first_len = cigar[0]
    last_op, last_len = cigar[-1]
    # each side: operation is a soft clip AND it is long enough
    return (first_op == pysam.CSOFT_CLIP and first_len >= SOFTCLIP_THRESHOLD) \
                or (last_op == pysam.CSOFT_CLIP and last_len >= SOFTCLIP_THRESHOLD)

# process a single cnv
def process_cnv(chrom, start, end, mode):
    """
    Collect the reads around both edges of a CNV and run the jump aligner on them.

    chrom, start, end -- CNV coordinates; reads are fetched from a window of
                         +/- FETCH_READ_PADDING around `start` and around `end`.
    mode              -- when "DUP", a read is collected only if it is
                         substantially soft clipped or passes accept_read().

    Returns a tuple (rheader, realignments, nlines):
      rheader      -- tab-split fields of the first line of the aligner output
      realignments -- one entry per read sent to the aligner, in input order:
                      [read, ref1_start, ref2_start, fields, in1, in2] where
                      `fields` is the tab-split aligner output line and
                      in1/in2 tell whether the read name was seen in the
                      first/second window
      nlines       -- number of reads written to the aligner input file

    Side effects: writes the aligner input file and rewrites the last element
    of the module-level JUMP_ALIGN_CMD to point at it.
    """

    # get all reads that cross the two cnv edges
    reads = dict()
    reads_in_ref = [set(), set()]
    refs = []
    refs_extents = []
    ref_id = 0
    for loc in [start, end]:
        window_start = max(0, loc - FETCH_READ_PADDING)
        window_end = loc + FETCH_READ_PADDING
        # reference extent for this edge; widened below to cover every kept read
        rmin = window_start
        rmax = window_end
        for read in reads_file.fetch(chrom, window_start, window_end):
            # Reads without an alignment end (unmapped reads placed at their
            # mate's position) cannot extend the reference window and have no
            # CIGAR to inspect; skip them instead of failing on None below.
            if read.is_unmapped or read.reference_end is None:
                continue
            if mode == "DUP" and (not is_substential_softclipped(read) and not accept_read(read)):
                continue
            # note: supplementary alignments are not filtered here
            # reads are keyed by name: a later record with the same name replaces an earlier one
            reads[read.qname] = read
            rmin = min(rmin, read.reference_start)
            rmax = max(rmax, read.reference_end)
            reads_in_ref[ref_id].add(read.qname)
        refs_extents.append([rmin, rmax])
        ref_id += 1

    # extend references before and after
    refs_extents[0][0] = max(0, refs_extents[0][0] - FETCH_REF_PADDING)
    refs_extents[1][1] = refs_extents[1][1] + FETCH_REF_PADDING

    # get references
    for extents in refs_extents:
        rmin, rmax = extents
        ref = fasta_file.fetch(chrom, rmin, rmax)
        refs.append([rmin, ref])

    # create input file for jump aligner
    ref_emitted = False
    reads_in_order = []
    subsample_ratio = 1.0
    if len(reads) > MAX_READS_PER_CNV:
        subsample_ratio = MAX_READS_PER_CNV / len(reads)
        print("subsample_ratio", subsample_ratio)
    nlines = 0
    jalign_input = tmp
    if INPUT_NAME_FULL:
        jalign_input += "_" + chrom + ":" + str(start) + "-" + str(end)
    with open(jalign_input, 'w') as f:
        for read in reads.values():
            # the random draw happens before the accept_read() filter
            if subsample_ratio < 1.0:
                if random.random() > subsample_ratio:
                    continue
            if not accept_read(read):
                continue
            reads_in_order.append(read)
            if not ref_emitted:
                # first line carries both reference sequences: second edge first, then first edge
                line = read.qname + "\t" + read.seq + "\t" + refs[1][1] + "\t" + refs[0][1] + "\n"
                ref_emitted = True
            else:
                # following lines write "=" in place of the reference sequences
                line = read.qname + "\t" + read.seq + "\t=\n"
            f.write(line)
            nlines += 1

    # run jump_align
    JUMP_ALIGN_CMD[-1] = jalign_input
    alignments = run_process(JUMP_ALIGN_CMD, jalign_input)
    header_seen = False
    realignments = []
    rheader = []
    # the first output line is a header (paired with None); every following
    # line is paired with the read written at the same position of the input
    for alignment, read in zip(alignments[0].split("\n"), [None, *reads_in_order]):
        if not header_seen:
            rheader = alignment.split("\t")
            header_seen = True
        else:
            a = alignment.split("\t")
            in1 = read.qname in reads_in_ref[0]
            in2 = read.qname in reads_in_ref[1]
            realignments.append([read, refs[0][0], refs[1][0], a, in1, in2])
    return (rheader, realignments, nlines)


In [17]:
import statistics

# contig name -> contig length, taken from the reference fasta
chrom_sizes = dict(zip(fasta_file.references, fasta_file.lengths))

# loop on bed file, write output bed file
# Each processed input line is copied to OUT_BED with one extra column: the
# number of reads for which the aligner reported the jump alignment as better.
nlines_total = 0
with open(OUT_BED, "w") as out_bed:
    with open(CNV_BED) as f:
        for line in f:
            if line.startswith("#"):
                continue
            bed_line = line.strip().split()
            if not bed_line:
                # blank line: nothing to process
                continue
            bed_chrom, bed_start, bed_end = bed_line[:3]
            bed_start = int(bed_start)
            bed_end = int(bed_end)
            #check for valid cnv
            # a CNV whose padded end falls beyond the contig is skipped (no output line)
            if bed_end + FETCH_READ_PADDING > chrom_sizes[bed_chrom]:
                continue
            rheader, realignments, nlines = process_cnv(bed_chrom, bed_start, bed_end, MODE)
            nlines_total += nlines
            jump_better = 0

            for realignment in realignments:
                in_ref = [False, False]
                read, ref1_start, ref2_start, ainfo, in_ref[0], in_ref[1] = realignment

                # decode alignment info
                qname, better, jscore, score1, score2, jgain, size1, size2 = ainfo
                better = int(better)
                size1 = int(size1)
                size2 = int(size2)
                # ignore alignments where either reported size is below 20
                if size1 < 20 or size2 < 20:
                    continue
                if better:
                    jump_better += 1

            # Strip only the newline: slicing off the last character would eat
            # a real character when the final line has no trailing newline.
            out_bed.write(line.rstrip("\n") + ("\t%d\n" % (jump_better)))


print("nlines_total", nlines_total)
OUT_BED

command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr1:206052502-206058501
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr1:209904501-209920001
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr2:97194502-97204501
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr2:97208001-97234001
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr2:122526502-122537001
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr5:21478502-21497001
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr5:175931002-175943001
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr8:57203502-57215501
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr9:133060501-133081001
command jump_align/para_jalign 2 -8 -18 -1 -1 0 /tmp/jump_align_input.91238_chr11:18920502-18944001

'/tmp/cnv_realign.91238.bed'