In [8]:
from tools.transcripts import *
from tools.psl import *
from tools.nameConversions import *

In [2]:
# Load the Bonobo gene predictions from the transMap work directory; later cells index this as txs[tx_id].
bonobo_gp_path = "/public/groups/cgl/cat/primates_evan/v2/work/transMap/Bonobo.filtered.gp"
txs = get_gene_pred_dict(bonobo_gp_path)

In [35]:
# Load the Bonobo transMap mRNA alignments (PSL file); later cells iterate this with alns.items().
bonobo_psl_path = "/public/groups/cgl/cat/primates_evan/v2/work/transcript_alignment/Bonobo.transMap.mRNA.psl"
alns = get_alignment_dict(bonobo_psl_path)

In [36]:
def find_indels(tx, psl, aln_mode):
    """
    Walks the psl alignment looking for gaps between consecutive alignment blocks. Reports every such gap in
    chromosome coordinates, naming it by class and direction. The class is one of Coding, CodingMult3 or NonCoding,
    the direction is Insertion or Deletion (for example CodingMult3Insertion or NonCodingDeletion).

    Insertion/Deletion is relative to the target genome, for example:

    CodingInsertion:
    ref: ATGC--ATGC
    tgt: ATGCGGATGC

    CodingDeletion:
    ref: ATGCGGATGC
    tgt: ATGC--ATGC

    Only gaps between blocks are evaluated. Each gap is measured from the end of the previous block, so unaligned
    sequence in front of the first block (an alignment whose first start is not 0) is never reported as an indel.

    :param tx: GenePredTranscript object representing the target transcript
    :param psl: PslRow object describing the alignment between ref_tx and tx.
    :param aln_mode: One of ('CDS', 'mRNA'). Determines if we aligned CDS or mRNA.
    :return: list with one entry per gap; each entry is [tx.name] followed by the fields returned by tx.get_bed()
    """
    def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand):
        """convert alignment coordinates to target chromosome coordinates, inverting if negative strand"""
        left_chrom_pos = coordinate_fn(left_pos)
        assert left_chrom_pos is not None
        right_chrom_pos = coordinate_fn(right_pos)
        if right_chrom_pos is None:
            # right_pos itself did not map: retry one position earlier and shift the left edge by one base
            right_chrom_pos = coordinate_fn(right_pos - 1)
            if strand == '-':
                left_chrom_pos += 1
            else:
                left_chrom_pos -= 1
        assert right_chrom_pos is not None
        if strand == '-':
            # on the negative strand the converted positions come out in descending order
            left_chrom_pos, right_chrom_pos = right_chrom_pos, left_chrom_pos
        assert right_chrom_pos >= left_chrom_pos
        return left_chrom_pos, right_chrom_pos

    def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type):
        """Converts either an insertion or a deletion into a output transcript"""
        left_chrom_pos, right_chrom_pos = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn,
                                                                            tx.strand)
        # the helper above asserts both positions are set, so this guard only matters when assertions are disabled
        if left_chrom_pos is None or right_chrom_pos is None:
            assert aln_mode == 'CDS'
            return None

        # a gap counts as coding only when it lies strictly inside (thick_start, thick_stop)
        if left_chrom_pos > tx.thick_start and right_chrom_pos < tx.thick_stop:
            indel_type = 'CodingMult3' if offset % 3 == 0 else 'Coding'
        else:
            indel_type = 'NonCoding'

        # the gap length is carried in the rgb field of the record
        new_bed = tx.get_bed(new_start=left_chrom_pos, new_stop=right_chrom_pos, rgb=offset,
                             name=''.join([indel_type, gap_type]))
        return [tx.name] + new_bed

    # depending on mode, we convert the coordinates from either CDS or mRNA
    if aln_mode == 'CDS':
        coordinate_fn = tx.cds_coordinate_to_chromosome
    else:
        coordinate_fn = tx.mrna_coordinate_to_chromosome

    # r holds the output
    r = []

    # Pair every block i (size, query start, target start) with the starts of block i + 1. Taking q_pos/t_pos
    # from block i itself, instead of starting them at 0, keeps the first gap correct when the alignment does
    # not begin at position 0.
    previous_blocks = zip(psl.block_sizes, psl.q_starts, psl.t_starts)
    next_starts = zip(psl.q_starts[1:], psl.t_starts[1:])
    for (block_size, q_pos, t_pos), (q_start, t_start) in zip(previous_blocks, next_starts):
        q_offset = q_start - block_size - q_pos
        t_offset = t_start - block_size - t_pos
        assert (q_offset >= 0 and t_offset >= 0)
        if q_offset != 0:  # query insertion -> insertion in target sequence
            left_pos = q_start - q_offset
            right_pos = q_start
            row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion')
            if row is not None:
                r.append(row)
        if t_offset != 0:  # target insertion -> insertion in reference sequence
            if tx.strand == '+':
                left_pos = right_pos = q_start
            else:
                # NOTE(review): only the deletion branch mirrors the position on the negative strand; the
                # insertion branch passes q_start through unchanged. Confirm this asymmetry is intended.
                left_pos = right_pos = psl.q_size - q_start
            row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, 'Deletion')
            if row is not None:
                r.append(row)
    return r


In [37]:
# Run find_indels over every alignment and keep one result list per alignment
# (the list is empty when an alignment has no gaps).
# NOTE: aug_id, aln, tx_id and tx are ordinary loop variables, so they stay in the kernel namespace after
# the loop; the debugging cell further down reads `aln` and `tx` from there.
indels = []
for aug_id, aln in alns.items():
    # presumably converts the alignment id into the matching key of txs -- confirm in tools.nameConversions
    tx_id = remove_augustus_alignment_number(aug_id)
    tx = txs[tx_id]
    indels.append(find_indels(tx, aln, "mRNA"))

In [30]:
# Debugging setup: the rest of this cell repeats the body of find_indels at cell level so that its
# intermediate variables can be inspected afterwards.
# `aln` is not defined in this cell; it is whatever the loop over alns.items() last left in the kernel.
psl = aln
aln_mode = "mRNA"

def convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, strand):
    """
    Translate an alignment interval into chromosome coordinates.

    Both ends are mapped through ``coordinate_fn``. If the right end does not map (``coordinate_fn`` returns
    None), the position one before it is used instead and the mapped left end is moved by one base
    (+1 on the '-' strand, -1 otherwise). On the '-' strand the two mapped ends are swapped before returning.

    :param left_pos: left alignment position; must map to a chromosome position
    :param right_pos: right alignment position
    :param coordinate_fn: callable taking an alignment position and returning a chromosome position or None
    :param strand: '-' selects negative-strand handling; any other value is treated as positive
    :return: tuple (low, high) of chromosome positions with high >= low
    :raises AssertionError: if an end cannot be mapped or the resulting interval is inverted
    """
    start = coordinate_fn(left_pos)
    assert start is not None
    end = coordinate_fn(right_pos)
    if end is None:
        end = coordinate_fn(right_pos - 1)
        start += 1 if strand == '-' else -1
    assert end is not None
    low, high = (end, start) if strand == '-' else (start, end)
    assert high >= low
    return low, high

def parse_indel(left_pos, right_pos, coordinate_fn, tx, offset, gap_type):
    """
    Build the output row for a single alignment gap.

    The gap is mapped to chromosome coordinates, classified as Coding / CodingMult3 / NonCoding and turned into
    a record via ``tx.get_bed``. Reads the cell-level variable ``aln_mode``.

    :param left_pos: left alignment position of the gap
    :param right_pos: right alignment position of the gap
    :param coordinate_fn: callable mapping an alignment position to a chromosome position (or None)
    :param tx: transcript object providing strand, thick_start, thick_stop, name and get_bed()
    :param offset: gap length; decides CodingMult3 vs Coding and is passed to get_bed as rgb
    :param gap_type: suffix appended to the class name ('Insertion' or 'Deletion' in this notebook)
    :return: [tx.name] followed by the fields from tx.get_bed(), or None if a position could not be mapped
    """
    chrom_start, chrom_stop = convert_coordinates_to_chromosome(left_pos, right_pos, coordinate_fn, tx.strand)
    if chrom_start is None or chrom_stop is None:
        assert aln_mode == 'CDS'
        return None

    # coding means strictly inside the (thick_start, thick_stop) interval
    within_cds = chrom_start > tx.thick_start and chrom_stop < tx.thick_stop
    if not within_cds:
        prefix = 'NonCoding'
    elif offset % 3 == 0:
        prefix = 'CodingMult3'
    else:
        prefix = 'Coding'

    bed_fields = tx.get_bed(new_start=chrom_start, new_stop=chrom_stop, rgb=offset, name=prefix + gap_type)
    return [tx.name] + bed_fields

# depending on mode, we convert the coordinates from either CDS or mRNA
if aln_mode == 'CDS':
    coordinate_fn = tx.cds_coordinate_to_chromosome
else:
    coordinate_fn = tx.mrna_coordinate_to_chromosome

# r holds the output
r = []

# Pair every block i (size, query start, target start) with the starts of block i + 1. q_pos/t_pos are the
# starts of block i itself rather than counters that begin at 0, so an alignment that does not start at
# position 0 no longer yields a bogus gap on the first iteration.
previous_blocks = zip(psl.block_sizes, psl.q_starts, psl.t_starts)
next_starts = zip(psl.q_starts[1:], psl.t_starts[1:])
for (block_size, q_pos, t_pos), (q_start, t_start) in zip(previous_blocks, next_starts):
    q_offset = q_start - block_size - q_pos
    t_offset = t_start - block_size - t_pos
    assert (q_offset >= 0 and t_offset >= 0)
    if q_offset != 0:  # query insertion -> insertion in target sequence
        left_pos = q_start - q_offset
        right_pos = q_start
        row = parse_indel(left_pos, right_pos, coordinate_fn, tx, q_offset, 'Insertion')
        if row is not None:
            r.append(row)
    if t_offset != 0:  # target insertion -> insertion in reference sequence
        if tx.strand == '+':
            left_pos = right_pos = q_start
        else:
            # Without this branch a negative-strand deletion reuses left_pos/right_pos from an earlier gap
            # (or fails with NameError when there was none). Same formula as in find_indels.
            # NOTE(review): only deletions mirror the position on the negative strand -- confirm intent.
            left_pos = right_pos = psl.q_size - q_start
        row = parse_indel(left_pos, right_pos, coordinate_fn, tx, t_offset, 'Deletion')
        if row is not None:
            r.append(row)

AssertionError: 

In [31]:
# Show the alignment positions that were being converted when the cell above stopped with the AssertionError.
left_pos, right_pos

(10993, 10993)

In [32]:
# Compare the query and target sizes recorded on the PSL row with the CDS size of the transcript.
psl.q_size, psl.t_size, tx.cds_size

(11084, 10969, 9632)

In [33]:
# Dump the individual PSL fields of the alignment that is being debugged.
psl.psl_string()

['9779',
 '67',
 '0',
 '0',
 '3',
 '1238',
 '6',
 '39',
 '+',
 'augTM-ENST00000011700.10-0',
 '11084',
 '1231',
 '11084',
 'ENST00000011700.10',
 '10969',
 '0',
 '9885',
 '130',
 '12,1,10,1,94,1,294,1,107,1,13,1,283,1,117,1,371,1,163,1,228,1,190,1,129,1,226,1,72,1,609,1,61,1,13,2,218,1,122,1,71,1,380,1,206,1,436,1,225,1,242,1,35,1,62,1,188,1,245,1,923,1,35,1,431,1,218,1,35,1,80,1,1235,1,771,1,316,1,137,1,4,1,52,1,9,1,50,2,2,3,1,3,5,2,3,2,1,1,1,1,6,1,1,3,1,1,2,1,1,1,9,1,1,1,2,2,2,1,5,1,1,1,1,2,1,2,2,1,1,2',
 '1231,1243,1244,1254,1255,1349,1350,1644,1645,1752,1753,1766,1767,2050,2051,2168,2169,2540,2541,2704,2705,2933,2934,3124,3125,3254,3255,3481,3482,3554,3555,4164,4165,4226,4227,4240,4242,4460,4461,4583,4584,4655,4656,5036,5037,5243,5244,5680,5681,5906,5907,6149,6150,6185,6186,6248,6249,6437,6438,6683,6684,7607,7608,7643,7644,8075,8076,8294,8295,8330,8331,8411,8412,9647,9648,10419,10420,10736,10737,10874,10875,10879,10880,10932,10933,10942,10943,10993,10995,10997,11000,11001,11004,110

In [44]:
# Print every indel of the first 20 alignments as chrom:start-stop
# (row[0] is the transcript name, row[1:4] are chromosome, start and stop).
for alignment_rows in indels[:20]:
    for row in alignment_rows:
        chrom, start, stop = row[1], row[2], row[3]
        print(f"{chrom}:{start}-{stop}")

chr12:9026662-9026663
chr12:9026467-9026469
chr12:9034597-9034597
chr11:59673481-59673481
chr12:2924539-2924540
chr12:2924684-2924685
chr12:2925506-2925507
chr2a:72191457-72191458
chr2a:72175309-72175309
chr2a:72175519-72175519
chr2a:37276581-37276581
chr6:141271130-141271130
chr6:141271367-141271369
chr16:70777200-70777200
chr4:5793801-5793801
chr4:5795593-5795593
chr4:5797122-5797122
chr4:5797439-5797439
chr3:50116574-50116575
chr3:50116619-50116619
chr7:109631556-109631557
chr7:109632182-109632183
chr7:109632254-109632255
chr7:109632502-109632502
chr7:109632710-109632711
chr7:84121227-84121227
chr7:84124328-84124328
chr7:84125504-84125504
chr7:84125586-84125586
chr7:84126882-84126882
chr11:108676849-108676858
chr11:108676762-108676767
chr1:23561287-23561287
chr1:23560233-23560239
chr1:23616512-23616512
chr1:23673878-23673883
chr1:23673948-23673949
chr1:23674194-23674194
chr1:23674580-23674580
chr1:23675006-23675009
chr1:23675794-23675795
chr1:23675910-23675910
chr7:142387198-1423871

In [45]:
# Show the complete result rows for the first 20 alignments.
indels[:20]

[[],
 [['ENST00000000412.8-0',
   'chr12',
   '9026662',
   '9026663',
   'NonCodingInsertion',
   '0',
   '-',
   '0',
   '0',
   '1',
   '1',
   '1',
   '0'],
  ['ENST00000000412.8-0',
   'chr12',
   '9026467',
   '9026469',
   'NonCodingInsertion',
   '0',
   '-',
   '0',
   '0',
   '2',
   '1',
   '2',
   '0'],
  ['ENST00000000412.8-0',
   'chr12',
   '9034597',
   '9034597',
   'NonCodingDeletion',
   '0',
   '-',
   '9034597',
   '9034597',
   '1',
   '1',
   '0',
   '0']],
 [['ENST00000000442.11-0',
   'chr11',
   '59673481',
   '59673481',
   'NonCodingDeletion',
   '0',
   '+',
   '59673481',
   '59673481',
   '1',
   '1',
   '0',
   '0']],
 [['ENST00000001008.6-0',
   'chr12',
   '2924539',
   '2924540',
   'NonCodingInsertion',
   '0',
   '+',
   '0',
   '0',
   '1',
   '1',
   '1',
   '0'],
  ['ENST00000001008.6-0',
   'chr12',
   '2924684',
   '2924685',
   'NonCodingInsertion',
   '0',
   '+',
   '0',
   '0',
   '1',
   '1',
   '1',
   '0'],
  ['ENST00000001008.6-0',
   '