# Query some results about the manually annotated test-set

Also contains some additional results (somewhat redundant with Parameterestimation.ipynb, but conceptually the basis thereof)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Complementary base for every nucleotide; the ambiguity code "N" maps to itself
complement = {
    "A": "T",
    "T": "A",
    "G": "C",
    "C": "G",
    "N": "N",
}
# Base ordering that fixes the codon numbering (TTT = 0, TTC = 1, ..., GGG = 63)
base_order = "TCAG"
# All 64 DNA codons, enumerated in base_order with the first position varying slowest
codon_list = [
    first + second + third
    for first in base_order
    for second in base_order
    for third in base_order
]

# The same codons in RNA spelling (every T replaced by U)
rna_codon_list = [codon.replace("T", "U") for codon in codon_list]

def codon2index(codon):
    """Return the position (0..63) of a DNA codon in the base_order enumeration.

    Only the first three characters of *codon* are read; they are treated as
    the digits of a base-4 number whose digit values come from base_order.
    Raises IndexError for a codon shorter than three characters and
    ValueError for a character that is not in base_order.
    """
    index = 0
    for position in range(3):
        # Horner scheme: shift the digits seen so far and add the next one
        index = index * 4 + base_order.index(codon[position])
    return index


def rev_comp(sequence: str):
    """Return the reverse complement of *sequence*.

    Every character is looked up in the module-level ``complement`` table, so
    a character missing from that table (e.g. a lower-case base) raises
    KeyError.
    """
    # join over the reversed string instead of growing a string with += per base
    return "".join(complement[base] for base in reversed(sequence))

class Feature:
    """A single annotated interval on a scaffold.

    Attributes:
        feature: the feature type as a string (the rest of this file checks
            for "CDS", "intron" and "three_prime_UTR").
        start, end: interval bounds, stored exactly as passed in.
        strand: strand flag, stored exactly as passed in.
    """

    def __init__(self, feature: str, start: int, end: int, strand: bool):
        self.feature = feature
        self.strand = strand
        self.start, self.end = start, end
    
class Gene:
    """A run of features (CDS, introns and one 3'UTR) belonging to one gene.

    ``strand`` is True for the forward strand. Features are kept in the order
    in which they are appended; every method assumes ascending scaffold
    coordinates, so on the reverse strand the 3'UTR is the FIRST feature and
    the 5'-most CDS is the LAST one. Coordinates are used as 1-based,
    end-inclusive positions.
    """

    def __init__(self, strand: bool):
        self.strand = strand
        # Ordering is trusted to the caller: is_valid() checks the feature
        # pattern and strands, but not the coordinates themselves.
        self.features = []

    def append(self, feature: "Feature"):
        """Add *feature* at the end of the feature list."""
        self.features.append(feature)

    def __len__(self):
        """Return the number of features in this gene."""
        return len(self.features)

    def __str__(self):
        """Render the features as ``[start type end>`` (forward) or ``<start type end]`` (reverse)."""
        return " ".join(["%s%s %s %s%s" % ("[" if f.strand else "<",
                                           f.start, f.feature, f.end,
                                           ">" if f.strand else "]")
                         for f in self.features])

    def get_range(self):
        """Return (start of the first feature, end of the last feature)."""
        return self.features[0].start, self.features[-1].end

    def get_contiguous_cds(self, sequence: str):
        """Return the concatenated CDS of this gene in coding orientation.

        *sequence* is the nucleotide sequence of the scaffold. All "CDS"
        features are cut out and joined in list order; for a reverse-strand
        gene the joined string is reverse-complemented.
        """
        cds = "".join([sequence[(f.start-1):f.end] for f in self.features if f.feature == "CDS"])
        if self.strand:
            return cds
        return rev_comp(cds)

    def codon_usage(self, sequence: str):
        """Count codons in the CDS body and, separately, in its last 21 nt.

        Returns ``(general, general_n, stop)``: ``general`` and ``stop`` are
        length-64 count vectors indexed by codon2index, ``general_n`` is the
        number of codons counted in ``general``. Also prints the terminal
        region together with its TGA count.
        """
        cds = self.get_contiguous_cds(sequence)
        general = np.zeros(64)
        stop = np.zeros(64)
        # Clamp at 0: for a CDS shorter than 21 nt a negative start would
        # produce empty slices and crash in codon2index.
        tail_start = max(0, len(cds) - 21)
        general_n = 0
        for i in range(0, tail_start, 3):
            general[codon2index(cds[i:i+3])] += 1
            general_n += 1

        for i in range(tail_start, len(cds), 3):
            stop[codon2index(cds[i:i+3])] += 1

        print("term: %s : %s*UGA" % (cds[tail_start:], stop[codon2index("TGA")]))
        return general, general_n, stop

    def count_UGA(self, sequence: str):
        """Count TGA triplets in the CDS (in and out of frame) and in the 3'UTR.

        Returns ``(inframe, outframe_in_cds, utr)``. Requires the gene to be
        valid, because the 3'UTR is taken to be the last feature (forward
        strand) or the first feature (reverse strand).
        """
        inframe = 0
        outframe_in_cds = 0
        cds = self.get_contiguous_cds(sequence)
        # NOTE(review): the bound len(cds) - 3 leaves out the triplet formed by
        # the last three CDS bases; kept as in the original -- confirm intent.
        for i in range(len(cds)-3):
            # The triplets must come from the spliced, oriented CDS and not
            # from the raw scaffold, whose index 0 is unrelated to the frame.
            if cds[i:i+3] == "TGA":
                if i % 3 == 0:
                    inframe += 1
                else:
                    outframe_in_cds += 1
        if self.strand:
            utr = sequence[(self.features[-1].start-1):self.features[-1].end].count("TGA")
        else:
            # The scaffold is not reverse-complemented here, so count TCA,
            # the reverse complement of TGA.
            utr = sequence[(self.features[0].start-1):self.features[0].end].count("TCA")

        return inframe, outframe_in_cds, utr

    def get_exon_intron_utr_stats(self):
        """Return (list of CDS lengths, list of intron lengths, 3'UTR length)."""
        exon_lengths = []
        intron_lengths = []
        utr_index = -1 if self.strand else 0
        utr_length = self.features[utr_index].end - self.features[utr_index].start + 1 # +1 bc. gff is inclusive-end
        for f in self.features:
            if f.feature == "CDS":
                exon_lengths += [f.end - f.start + 1]
            elif f.feature == "intron":
                intron_lengths += [f.end - f.start + 1]

        return exon_lengths, intron_lengths, utr_length

    def is_valid(self):
        """Check the CDS-(intron-CDS)*-UTR structure; print the reason and return False on failure."""
        if len(self.features) == 0:
            print("! The gene is empty")
            return False

        utr_index = -1 if self.strand else 0
        first_cds_index = 0 if self.strand else -1

        # a Gene must end in a 3'UTR
        if self.features[utr_index].feature != "three_prime_UTR":
            print("! 3' Terminal feature is not a UTR, but a %s" % self.features[utr_index].feature)
            return False
        # a Gene must start with a CDS
        if self.features[first_cds_index].feature != "CDS":
            print("! 5' terminal feature is not a CDS, but a %s" % self.features[first_cds_index].feature)
            return False

        # Gene must start with CDS, end with UTR (len >= 2 so far), and can have [intron, CDS]-pairs in between
        # (no two cds adjacent, nor two introns)
        if len(self.features) % 2 != 0:
            print("! uneven number of features")
            return False

        # Walk the features in pairs; the even length checked above guarantees
        # that index + 1 is always a valid position.
        num_utrs = 0
        for index in range(0, len(self.features), 2):
            first = self.features[index]
            second = self.features[index+1]
            if first.feature == "three_prime_UTR":
                num_utrs += 1
            if second.feature == "three_prime_UTR":
                num_utrs += 1

            if first.strand != self.strand or second.strand != self.strand:
                print("! Strand disagreement within the gene")
                return False

            # Forward strand: each pair is (CDS, intron|UTR);
            # reverse strand: each pair is (intron|UTR, CDS).
            if self.strand:
                coding, spacer = first, second
            else:
                coding, spacer = second, first
            if not (coding.feature == "CDS"
                    and spacer.feature in ["intron", "three_prime_UTR"]):
                print("! CDS-intron-CDS-UTR-pattern violated")
                return False

        if num_utrs != 1:
            print("! Too many utrs: %s" % num_utrs)
            return False

        return True

class Annotation:
    """All features annotated on one named sequence region, plus the genes built from them.

    ``features`` is kept ordered by add(); ``genes`` is filled by
    compile_genes().
    """

    def __init__(self, name: str, start: int, end: int):
        self.name = name
        self.start = start
        self.end = end
        self.features = [] # The list of features; could also separate this by strand
        self.genes = []

    def add(self, feature: "Feature"):
        """Insert *feature* before the first stored feature that starts after its end.

        If no stored feature starts after ``feature.end``, the feature is
        appended at the end of the list.
        """
        for index, existing in enumerate(self.features):
            if existing.start > feature.end:
                self.features.insert(index, feature)
                return
        self.features.append(feature)

    def compile_genes(self) -> bool:
        """Group directly adjacent features into genes and validate them.

        A new gene starts whenever a feature does not begin exactly one
        position after the previous feature's end. Rebuilds ``self.genes``
        and returns True when every gene passed is_valid(). An annotation
        without features yields no genes and counts as valid.
        """
        self.genes = []
        if not self.features:
            # Nothing to group; the original indexed features[0] and crashed here
            return True

        all_valid = True
        last_end = self.features[0].start - 1
        current_gene = Gene(self.features[0].strand)
        for f in self.features:
            if f.start != last_end + 1: # there is a break
                self.genes.append(current_gene)
                all_valid = all_valid and current_gene.is_valid()
                current_gene = Gene(f.strand)

            current_gene.append(f)
            last_end = f.end

        if len(current_gene) > 0:
            self.genes.append(current_gene)
            all_valid = all_valid and current_gene.is_valid()

        return all_valid

    def __str__(self):
        """Return the name followed by one ``[start, end]type+/-`` line per feature."""
        return self.name + ":\n\t" + ("\n\t".join(["[%s, %s]%s%s" % (f.start, f.end,
                                                                     f.feature,
                                                                     "+" if f.strand else "-")
                                               for f in self.features]))
        

### Read in an annotations-file:

In [None]:
def _compile_and_report(annotation):
    """Compile the genes of *annotation*, print the outcome and the genes, return the validity."""
    correct = annotation.compile_genes()
    print(correct)
    print("Genes:\n\t" + "\n\t".join([str(g) for g in annotation.genes]))
    return correct


annotations = []
c = 0  # number of ##sequence-region records read
f = 0  # number of records containing at least one invalid gene
current = None
with open("september-final-benchmarks/training.gff") as annotation_file:
    # Iterate over the file directly: a blank line no longer ends the parse,
    # and a last line without a newline keeps its final character.
    # The first line is no longer discarded unread; a "#" header line there
    # is ignored by the branches below anyway.
    for raw_line in annotation_file:
        line = raw_line.rstrip("\n")
        content = line.split("\t")
        if line.startswith("##sequence-region"):
            # A new region starts: finish and store the previous one first
            print(current)
            if current is not None:
                if not _compile_and_report(current):
                    f += 1
                annotations.append(current)
            print("-  " * 36)

            current = Annotation(content[1], int(content[2]), int(content[3]))
            c += 1
        elif len(line) > 0 and not line.startswith("#"):
            if current is None:
                print("! feature line before any ##sequence-region, skipped:", line)
                continue
            current.add(Feature(content[2], int(content[3]), int(content[4]), content[6] == "+"))

# The last region has no following ##sequence-region line to trigger its
# compilation; handle it here and count it in f like all the others.
if current is not None:
    if not _compile_and_report(current):
        f += 1
    annotations.append(current)
print(c, "=", len(annotations), "of which", f, "were faulty")

### Read in the sequences from scaffolds and query number of inframe-TGA:

In [None]:
def _fasta_records(handle):
    """Yield (header, sequence) for every record of an open FASTA file.

    The header is the text after ">" without the line break; all following
    lines up to the next header are concatenated. The final record is
    yielded when the file ends.
    """
    header = None
    chunks = []
    for line in handle:
        line = line.rstrip("\n")
        if line.startswith(">"):
            if header is not None:
                yield header, "".join(chunks)
            header = line[1:]
            chunks = []
        else:
            chunks.append(line)
    if header is not None:
        yield header, "".join(chunks)


annotation_dict = {a.name: a for a in annotations}
# Built-in sum keeps this an int even when there are no annotations
# (np.sum of an empty list is a float, which np.zeros rejects as a shape)
num_of_genes = sum(len(a.genes) for a in annotations)

# One row per gene: inframe TGA, out-of-frame TGA in CDS, TGA in 3'UTR
counts = np.zeros((num_of_genes, 3))
codon_usage_general = np.zeros(64)
codon_usage_stop = np.zeros(64)
codon_number = 0
stop_codon_number = 0

start_regions = []
introns_by_length = {}

current_header = None
current_sequence = ""
gene_index = 0
with open("september-final-benchmarks/annotated_nodes.basic.train+test.fasta") as scaffolds:
    # Records are analysed as complete units, so the last record of the file
    # is included as well (it was previously dropped because analysis only
    # ran when the NEXT header line was encountered).
    for current_header, current_sequence in _fasta_records(scaffolds):
        if current_header not in annotation_dict:
            continue
        print(current_header)
        ann = annotation_dict[current_header]
        for gene in ann.genes:
            counts[gene_index, :] = np.array(gene.count_UGA(current_sequence))
            print("\t", counts[gene_index,:])
            if counts[gene_index,2] == 0:
                print("\t\t\t\t!>!>!>missing STOP")
            cds_codon_counts, cds_num_of_codons, stop_region = gene.codon_usage(current_sequence)
            codon_number += cds_num_of_codons
            # codon_usage treats the last 21 nt (7 codons) as the stop region
            stop_codon_number += 7
            codon_usage_general += cds_codon_counts
            codon_usage_stop += stop_region

            gene_index += 1

            # Collect intron sequences, grouped by their length:
            for intron_feature in gene.features:
                if intron_feature.feature != "intron":
                    continue
                curr_intron = current_sequence[(intron_feature.start-1):intron_feature.end]
                curr_intron_length = intron_feature.end - intron_feature.start + 1
                if not intron_feature.strand:
                    curr_intron = rev_comp(curr_intron)
                introns_by_length.setdefault(curr_intron_length, []).append(curr_intron)

                print("intron: ", curr_intron)

            # Collect start-region: 24 nt upstream, the 3 nt start codon and
            # 9 nt downstream, clipped at the scaffold boundaries
            g_start, g_end = gene.get_range()
            if gene.strand:
                region_from = max(0, g_start-1-24)
                region_to = min(len(current_sequence), g_start+2+9)
                start_regions += [current_sequence[region_from:region_to]]
            else:
                # Reverse strand: the gene starts at its highest coordinate
                region_from = max(0, g_end-3-9)
                region_to = min(len(current_sequence), g_end+24)
                start_regions += [rev_comp(current_sequence[region_from:region_to])]

# Start region: Looking for motifs

This was done _before_ the final version of the parameter estimation was implemented; it is the data basis for the resulting design decision.

In [None]:
# Layout of a start region: 24 nt upstream, the 3 nt start codon, 9 nt downstream
region_length = 24+3+9
print("."*24 + "ATG" + "-"*9)
start_region_bases = np.zeros((region_length, 4))
# print("\n".join(start_regions))
# Regions clipped at a scaffold boundary are shorter and cannot be aligned
full_regions = [r for r in start_regions if len(r) == region_length]
for r in full_regions:
    # print(r)
    for i in range(region_length):
        start_region_bases[i, base_order.index(r[i])] += 1

# Normalise by the number of regions actually tallied (instead of a
# hard-coded 153), so that every row sums to 1 for any input data.
if full_regions:
    print(start_region_bases / len(full_regions))
else:
    print(start_region_bases)

# Intron sequence analysis:

Same as for the start region: this analysis served to guide the model design. It is therefore limited to the training set, which was used exclusively for this purpose: the test set did not yet exist at the time, and it should not be used, not even indirectly, for the design of the model.

In [None]:
# Intron lengths that occur, in ascending order
ls = sorted(introns_by_length)

motif_length = 10
intron_heads = []
intron_tails = []

for length in ls:
    # print(length)
    for intron in introns_by_length[length]:
        # First and last motif_length bases of every intron
        intron_heads.append(intron[:motif_length])
        intron_tails.append(intron[-motif_length:])
        # Relative frequency of each base within this intron
        composition = ":\t%sT %sC %sA %sG" % tuple(intron.count(base) / length for base in "TCAG")
        print("\t" + intron + composition)

# Rows: bases in base_order; columns: position within the motif
base_freq_head = np.zeros((4, motif_length))
base_freq_tail = np.zeros((4, motif_length))

n = len(intron_heads)

for head, tail in zip(intron_heads, intron_tails):
    for j in range(motif_length):
        base_freq_head[base_order.index(head[j]), j] += 1
        base_freq_tail[base_order.index(tail[j]), j] += 1

print()
print("  " + "          ".join(base_order))
# Transposed for printing: one row per motif position, one column per base
print((base_freq_head/n).T)
print("   ...")
print((base_freq_tail/n).T)

In [None]:
# Joint histogram of the three per-gene counts, one series per column of counts
uga_labels = ["inframe", "outframe_in_cds", "utr"]
plt.hist(counts, label=uga_labels)
plt.legend()
plt.show()

In [None]:
def positional_base_usage(fs):
    """Aggregate per-codon values into a (codon position x base) table.

    *fs* is indexed by codon2index. Returns a 3x4 array in which entry
    [p, b] is the sum of fs over all codons carrying base base_order[b] at
    codon position p.
    """
    positional_fs = np.zeros((3, 4))
    for codon in codon_list:
        weight = fs[codon2index(codon)]
        for position, base in enumerate(codon):
            positional_fs[position, base_order.index(base)] += weight
    return positional_fs

# Relative codon frequencies in the CDS body and in the terminal (stop) region
general_freq = codon_usage_general / codon_number
stop_freq = codon_usage_stop / stop_codon_number
print(positional_base_usage(general_freq))
print(positional_base_usage(stop_freq))

# Two bars per codon, side by side
w = 0.5
codon_positions = np.arange(64)
plt.bar(codon_positions - w/2, general_freq, width=w)
plt.bar(codon_positions + w/2, stop_freq, width=w)
plt.xticks(range(64), rna_codon_list, rotation=90)
plt.show()

In [None]:
names = ["inframe UGA", "outframe UGA in CDS", "UGA in 3'UTR"]
bins = [40, 40, 8]
means = np.mean(counts, axis=0)
medians = np.median(counts, axis=0)

# One histogram panel per column of counts; dashed line = mean, red line = median
for column, (title, n_bins) in enumerate(zip(names, bins)):
    plt.subplot(1, 3, column + 1)
    plt.hist(counts[:, column], bins=n_bins, edgecolor="k", color="w", linewidth=0.5, align='left')
    plt.axvline(means[column], linestyle="dashed", color="k", linewidth=0.5)
    plt.axvline(medians[column], color="r", linewidth=0.5)
    plt.title(title)
    plt.xlabel("#UGA")
    plt.ylabel("#genes")

# Textual summary; a blank line separates the groups after the first one
for column, title in enumerate(names):
    lead = "\n" if column else ""
    print(lead + "mean #" + title + ":", np.mean(counts[:, column]))
    print("median #" + title + ":", np.median(counts[:, column]))

# Keep the counts under a dedicated name for the CSV export further down
uga_counts = counts

plt.tight_layout(h_pad=-30)
plt.show()

In [None]:
# Length statistics over all genes: intron, exon and 3'UTR lengths, introns per gene
intron_lengths = []
exon_lengths = []
utr_lengths = []
intron_counts = []
for ann in annotations:
    for gene in ann.genes:
        el, il, ul = gene.get_exon_intron_utr_stats()
        exon_lengths.extend(el)
        intron_lengths.extend(il)
        # Report genes with an unusually long intron or 3'UTR
        has_long_intron = any(length > 30 for length in il)
        if has_long_intron or ul > 100:
            print(ann.name, il, ul)
            # Can see that the two largest ones (58, 85) are from 615,
            # and are not very trusted (coverage-signal is ambiguous)
        utr_lengths.append(ul)
        intron_counts.append(len(il))

datasets = [exon_lengths, utr_lengths, intron_counts, intron_lengths]
names = ["exon lengths", "3'UTR lengths", "intron counts", "intron lengths"]
bins = [30, 30, 5, 10]

# 2x2 grid of histograms; dashed line = mean, red line = median
for panel, (label, data, n_bins) in enumerate(zip(names, datasets, bins), start=1):
    plt.subplot(2, 2, panel)
    plt.hist(data, bins=n_bins, edgecolor="k", color="w", linewidth=0.5, align='left')

    mean = np.mean(data)
    median = np.median(data)
    print("\n%s:\tmean=%s\tmedian=%s" % (label, mean, median))
    plt.axvline(mean, linestyle="dashed", color="k", linewidth=0.5)
    plt.axvline(median, color="r", linewidth=0.5)
    plt.title(label)

plt.tight_layout()
plt.show()

In [None]:
# Number and share of genes that have no intron at all
no_introns = intron_counts.count(0)
total_genes = len(intron_counts)
print(no_introns, "of", total_genes, "genes have no intron, that's", 100*no_introns/total_genes, "%")

In [None]:
# Occurrences of every intron length below 30, printed as comma-separated relative frequencies
intron_length_distr = np.bincount(intron_lengths)[:30]
relative_frequencies = intron_length_distr / np.sum(intron_length_distr)
print(",".join(map(str, relative_frequencies)))

## Writing into csv for plotting in LaTe$\chi$

In [None]:
# Export histograms as CSV: one "lower bin edge, count" row per bin, followed
# by a closing row holding the last bin edge with count 0
bins = [40, 40, 30, 5]
datasets = [uga_counts.T[0], uga_counts.T[1], exon_lengths, intron_counts]
filenames = ["UGA_inframe", "UGA_outframe", "exonlengths", "intron_counts"]
for filename, data, n_bins in zip(filenames, datasets, bins):
    with open("out/annotated_" + filename + ".csv", "w") as out:
        hist, bin_edges = np.histogram(data, bins=n_bins)
        # Width of the first bin
        bar_width = bin_edges[1] - bin_edges[0]
        print(filename, len(hist), bar_width * 2)
        rows = ["xlower,y"]
        rows.extend("%s,%s" % (lower, count) for count, lower in zip(hist, bin_edges[:-1]))
        rows.append("%s,0" % bin_edges[-1])
        result = "\n".join(rows)
        print(result)
        out.write(result)
    print()