In [7]:
import filter_junctions
import intron_comparison
import intron_sequences

## RNA-seq data were aligned to the *Euglena longa* genome with two programs, HISAT2 and STAR. The introns were then extracted using RegTools.

### First let's have a look at basic statistics:

In [51]:
# Aligner names; each one is substituted into the data/ file names below.
# `programs` is also read by the following cell, so the name must stay.
programs = ['hisat', 'star']

for program in programs:
    print('Program: ', program)
    # Junction file for this aligner and the path handed to
    # junctions_to_introns as its second argument.
    file_in, file_out = (
        f'data/junctions_{program}.bed',
        f'data/good_junctions_{program}.bed',
    )
    filter_junctions.junctions_to_introns(file_in, file_out)
    # Report statistics for the file referenced by file_out.
    filter_junctions.intron_stats(file_out)
    print('\n')

Program:  hisat
Number of introns: 197019
Mean support: 59.74485709500099
Median support: 9


Program:  star
Number of introns: 218640
Mean support: 44.03127515550677
Median support: 6.0




### Because of errors in the data and in the alignments, multiple introns overlap. At every position we want to keep only the one intron that has the strongest support.

In [34]:
# Runs the best-intron selection once per aligner.
# NOTE: `programs` comes from the earlier cell that defines the aligner list.
for program in programs:
    print('Program: ', program)
    # Cutoff of 1: first we look at all the best introns.
    # NOTE(review): presumably a minimum-support threshold - confirm in
    # filter_junctions.choose_best_introns.
    cutoff = 1

    file_in, file_out = (
        f'data/good_junctions_{program}.bed',
        f'data/best_introns_{program}{cutoff}.bed',
    )
    all_i, best_i = filter_junctions.choose_best_introns(file_in, file_out, cutoff)

    # Statistics of the selected set, then the comparison of the two values
    # returned by choose_best_introns.
    filter_junctions.intron_stats(file_out)
    filter_junctions.compare_best_introns(all_i, best_i)

Program:  hisat
Number of introns: 121603
Mean support: 87.48627912140326
Median support: 17
Mean share of the best intron:  0.8219934856545911
Share of best introns in all:  0.48993422067625914
Mean support of the best intron:  96.79754611317155


Program:  star
Number of introns: 121693
Mean support: 72.18693762172023
Median support: 16
Mean share of the best intron:  0.7372594956871039
Share of best introns in all:  0.4522015066378948
Mean support of the best intron:  79.10888876106267




### All the stats for HISAT are better, so this is the alignment chosen for the following analyses.

### After trying different cutoffs, a threshold of 50 was chosen, leaving quite a big set of comparatively reliable introns.

In [35]:
# Re-run the best-intron selection for the HISAT junctions only, with the
# threshold chosen after trying different cutoffs.
# NOTE(review): `cutoff` is presumably a minimum-support threshold - confirm
# in filter_junctions.choose_best_introns.
cutoff = 50
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
file_in = 'data/good_junctions_hisat.bed'
file_out = f'data/best_introns_hisat{cutoff}.bed'

all_i, best_i = filter_junctions.choose_best_introns(file_in, file_out, cutoff)
# Statistics for the file referenced by file_out.
filter_junctions.intron_stats(file_out)

# Compare the two values returned by choose_best_introns.
filter_junctions.compare_best_introns(all_i, best_i)

Number of introns: 26247
Mean support: 349.5671505314893
Median support: 97
Mean share of the best intron:  0.9131872756561444
Share of best introns in all:  0.6623805780914107
Mean support of the best intron:  448.4654246199566




### Introns were also extracted from transcripts, so we compare the two sets and keep only the introns that are identical in both.

In [49]:
# Extract introns from the transcript annotation (GTF) into a BED file and
# load that file with intron_dict.
intron_comparison.extract_introns_from_gtf(
    'data/longa_stringtie_strand_informed.gtf',
    'data/other_introns.bed',
)
other_introns = intron_comparison.intron_dict('data/other_introns.bed')

# Compare the cutoff-50 HISAT introns against the transcript-derived ones;
# file_out is passed to compare_introns as its third argument.
file = 'data/best_introns_hisat50.bed'
file_out = 'data/introns_hisat_50_cross_checked.bed'
exact_match, one_side_match, diff_lengths = intron_comparison.compare_introns(
    file, other_introns, file_out
)

print('exact match: ', len(exact_match))
print('one side matching: ', len(one_side_match))

# Occurrences of each value 0..9 in diff_lengths, as (value, count) pairs.
print('length difference distribution:')
length_histogram = list(zip(range(10), map(diff_lengths.count, range(10))))
print(length_histogram)

All my introns:  26247
n of pairs:  31975
no match:  2627
exact match:  22036
one side matching:  1149
length difference distribution:
[(0, 0), (1, 291), (2, 81), (3, 113), (4, 29), (5, 12), (6, 9), (7, 4), (8, 4), (9, 2)]


### Preparing .bed files for extracting the sequences with an additional margin (intron sequences + a bit of the surrounding exons).

In [50]:
# Produce one BED file per margin from the cross-checked introns; the margin
# value appears both in the output file name and as the third argument.
cross_checked_bed = 'data/introns_hisat_50_cross_checked.bed'
for margin in (0, 3):
    filter_junctions.introns_for_seq(
        cross_checked_bed,
        f'data/best_introns_hisat_50+{margin}.bed',
        margin,
    )

### After extracting the intron sequences, they are classified as: conventional if they have the canonical AG|GT junctions, confirmed nonconventional if they can form a secondary structure in a specific position, or unconfirmed nonconventional otherwise.

In [65]:
# FASTA file with the intron sequences.
introns_seq_file = 'data/good_introns50+3.fasta'

# NOTE(review): the second argument (3) presumably matches the +3 margin in
# the file name - confirm in intron_sequences.file_to_seq_introns.
introns = intron_sequences.file_to_seq_introns(introns_seq_file, 3)

# Three groups: passes check_conventional, passes check_unconventional,
# and everything else.
conv, non_conv, rest = [], [], []
for intron in introns:
    # movable_boundary() is called before either check, as in the original.
    intron.movable_boundary()
    if intron.check_conventional():
        bucket = conv
    elif intron.check_unconventional():
        bucket = non_conv
    else:
        bucket = rest
    bucket.append(intron)

# Print a heading and the sequence statistics for each group in turn.
for heading, group in (
    ('conv stats: ', conv),
    ('\nnonconv stats: ', non_conv),
    ('\nrest: ', rest),
):
    print(heading)
    # Assign to `_` so the return value is not echoed as cell output.
    _ = intron_sequences.seq_statistics(group)

conv stats: 
Number of introns:  12815
Mean length:  878.709090909091
Mean gc:  0.4931897550119602
Tetranucleotides: 
[('TTTT', 146648), ('GGGG', 145918), ('AAAA', 143549), ('CCCC', 142800), ('TGTG', 135543), ('CACA', 135406), ('TTTG', 108378), ('CAAA', 108045), ('ACAC', 104849), ('GTGT', 104524)]

nonconv stats: 
Number of introns:  7602
Mean length:  643.9623783214944
Mean gc:  0.5143102063917632
Tetranucleotides: 
[('CCCC', 67815), ('GGGG', 63082), ('TTTT', 62311), ('TGTG', 57242), ('CACA', 55869), ('AAAA', 55338), ('TTTG', 46370), ('CAAA', 43809), ('GTGT', 43648), ('ACAC', 42479)]

rest: 
Number of introns:  5172
Mean length:  654.6084686774942
Mean gc:  0.5116076688333633
Tetranucleotides: 
[('GGGG', 47196), ('AAAA', 43535), ('CCCC', 42241), ('TGTG', 39809), ('TTTT', 38026), ('CACA', 37854), ('CAAA', 32501), ('TTTG', 30655), ('GTGT', 30050), ('ACAC', 28744)]
