# Various Utilities
This notebook collects the relevant parts of the separate utility scriptlets used during the thesis, e.g. for processing GFF files.

In [None]:
import numpy as np
import os

## Clean GFF-file
The annotation as given by Geneious from the RNA-mapping also includes (poorly) predicted Junctions, which are of no interest here. Thus, gff-files can be cleaned with the following code:

In [None]:
outfilename = "out/scaffolds_with_testset_september13.cleaned.gff"

# Start the cleaned file from scratch with a fixed two-line GFF header.
with open(outfilename, "w") as out:
    out.write("##gff-version 3\n##source-version geneious 2020.1.2\n")

# `current` buffers one "##sequence-region" line plus all retained annotation
# rows that follow it. The buffer is only written if at least one
# non-Junction row was seen for that region.
current = ""
had_rsoi = False # had regions of interest?
with open("scaffolds_with_testset_september13.gff") as gff, open(outfilename, "a") as out:
    for row, line in enumerate(gff):
        if row < 2: # header (replaced by the fixed header written above)
            continue
        if line.startswith("##sequence-region"):
            # A new region starts: flush the previous one if it had content.
            if had_rsoi:
                print("<-", current[17:40])
                out.write(current)
            current = line
            had_rsoi = False
        elif "Junction" in line:
            # Predicted junctions are dropped entirely.
            continue
        else:
            had_rsoi = True
            # Keep every column except the last one, leaving it empty.
            current += "\t".join(line.split("\t")[:-1]) + "\t\n"

    # Flush the final region. Without this the last sequence-region of the
    # input never reaches the output, because flushing otherwise only happens
    # when the *next* "##sequence-region" line is encountered.
    if had_rsoi:
        print("<-", current[17:40])
        out.write(current)

## Convert Augustus-output into convenient GFF-format
Removes some unnecessary elements; used to preprocess the Augustus output for the performance evaluation.

In [None]:
# Benchmark runs to convert: the plain model plus every flank-window variant
# of the "base" and "codon" models.
window_variants = ("u12d12", "u18d18", "u18d24", "u24d18", "u24d24")
augustus_cases = ["basic"]
augustus_cases.extend("base-" + variant for variant in window_variants)
augustus_cases.extend("codon-" + variant for variant in window_variants)

for case in augustus_cases:
    infile = "september-final-benchmarks/lab-augustus-{}-on-test.out".format(case)
    outfile = infile[:-len("out")] + "cleaned.gff"

    retain_database = False

    # Create/truncate the output before the input is opened.
    with open(outfile, "w") as out:
        pass

    with open(infile) as augustus, open(outfile, "a") as out:
        row = 0
        for line in augustus:
            # Comment lines are copied through unchanged.
            if line.startswith("#"):
                out.write(line)
                row += 1
                continue

            # Anything that is neither a comment nor a NODE_ row is only echoed.
            if not line.startswith("NODE_"):
                print("\t", row, ":", line[:-1])
                row += 1
                continue

            # The first column ends in "_<start>-<end>"; strip that suffix and
            # keep <start> as the offset for the coordinate columns.
            fields = line.split("\t")
            cut = fields[0].rfind("_")
            offset, window_end = fields[0][cut + 1:].split("-")
            fields[0] = fields[0][:cut]
            print(fields[0], offset, window_end)

            if fields[1] == "database" and not retain_database:
                print("\t ignoring database-entry")
                continue  # skipped rows do not advance the row counter

            # Shift the start/end columns by the offset taken from the name.
            shift = int(offset) - 1
            fields[3] = str(int(fields[3]) + shift)
            fields[4] = str(int(fields[4]) + shift)
            shifted_line = "\t".join(fields)
            print(shifted_line)
            out.write(shifted_line)
            row += 1

## Convert blastx-output into annotation

This allows the annotation-gff file to be loaded into Geneious as its own track and to be used conveniently there. This further allows some visual exploration of the homology-results

In [None]:
class Hit:
    """Record of a single alignment parsed from a blast report.

    Only `name`, `frame`, `score` and `evalue` can be passed in; the
    positional and bookkeeping attributes start out as placeholders and are
    meant to be filled in by the parser.
    """

    def __init__(self, name, frame=0, score=0, evalue=0):
        # Values supplied by the caller.
        self.name = name
        self.frame = frame
        self.score = score
        self.evalue = evalue
        # Query coordinates; start stays None until the first match is seen.
        self.start_index = None
        self.end_index = -1 # last match to query (this is already w.r.t. nucleotides)
        # Line numbers of the hit within the report (-1 = not set yet).
        self.startline = -1
        self.endline = -1
        # Accumulated aligned sequence.
        self.sequence = ""

    def __str__(self):
        """Return a one-line summary: name, line span, statistics, coordinates."""
        line_span = "ll.%s-%s" % (self.startline, self.endline)
        statistics = "Score=%s, E=%s, Frame=%s" % (self.score, self.evalue, self.frame)
        coordinates = "%s--%s" % (self.start_index, self.end_index)
        return "%s (%s): %s: %s" % (self.name, line_span, statistics, coordinates)
    
    
    
# # 1 # Convert blast (default out-format: extensive sequence-information contained) to gff
seq_name = "NODE_103_length_148054_cov_1.718544"
outfile = "out/blastx_q.NODE_103_wrt_ciliates_org.gff"


def _hit_to_gff_line(hit, seqid):
    """Format one completed Hit as a single tab-separated GFF line.

    Start/end are ordered so that start <= end; the strand is derived from
    the sign of the frame. The returned string ends with a newline.
    NOTE(review): column 8 carries abs(frame); GFF3 defines this column as
    phase (0-2) - confirm that consumers of this file accept frame values.
    """
    gff_start = min(hit.start_index, hit.end_index)
    gff_end = max(hit.start_index, hit.end_index)
    return "\t".join([seqid, "blastx", "conserved",
                      str(gff_start), str(gff_end),
                      str(hit.score), "+" if hit.frame >= 0 else "-",
                      str(abs(hit.frame)),
                      "matched=%s; starts=%s; ends=%s\n" % (hit.name,
                                                            hit.sequence[:10],
                                                            hit.sequence[-10:])])


# Create/truncate the output file before parsing starts.
with open(outfile, "w") as out:
    out.write("")

current_hit = None      # hit currently being assembled from the report
current_hitname = ""    # most recent ">" subject header
hits = []

with open("blastx_q.NODE_103_wrt_ciliates_org.out") as blastx, open(outfile, "a") as out:
    for row, line in enumerate(blastx):
        if line.strip() == "":
            continue

        if line.startswith(">"):
            current_hitname = line[1:].replace("\t", " ")[:-1]

        if line.startswith(" Score = "):
            # A new alignment begins: emit the one assembled so far.
            if current_hit is not None:
                hits.append(current_hit)
                print(current_hit)
                out.write(_hit_to_gff_line(current_hit, seq_name))

            current_hit = Hit(current_hitname)
            current_hit.startline = row
            content = [e.strip() for e in line.split(",")]
            current_hit.score = float(content[0].split(" ")[2])
            current_hit.evalue = float(content[1].split(" ")[2])

        if line.startswith("Query "):
            current_hit.endline = row + 3
            # Last token is the end coordinate; the aligned sequence is the
            # third-from-last token when splitting on single spaces.
            current_hit.end_index = int(line.split(" ")[-1])
            current_hit.sequence += line.split(" ")[-3]
            if current_hit.start_index is None:
                current_hit.start_index = int(line.split("  ")[1])

        if line.startswith(" Frame = "):
            current_hit.frame = int(line[8:])

    # Emit the final alignment. Hits are otherwise only flushed when the next
    # " Score = " line is seen, so the last hit of the report would be lost.
    if current_hit is not None:
        hits.append(current_hit)
        print(current_hit)
        out.write(_hit_to_gff_line(current_hit, seq_name))

print(len(hits))


# # 2 # Transforming output format 6 into GFF (just some reordering/relabeling)
with open("ciliates/blastx_q.transcriptome_wrt_conserved_tetrahymena.out") as blastx, \
     open("out/blastx_q.transcriptome_wrt_tetra.gff", "w") as out:
    for line in blastx:
        # Strip only the newline: slicing with [:-1] would cut the last digit
        # of the bitscore if the final line has no trailing newline.
        record = line.rstrip("\n")
        if not record:
            continue  # tolerate blank lines instead of failing on the unpack
        # Exactly twelve tab-separated columns are expected; anything else
        # raises ValueError here.
        qseqid, sseqid, _, length, _, _, qstart, qend, sstart, send, evalue, bitscore = record.split("\t")
        # Here, all hits happen to be on +strand
        out.write("\t".join([qseqid, "blastx", "match", qstart, qend, bitscore,
                             "+", ".", 'matches="%s"' % sseqid]))
        out.write("\n")

## Selecting regions for samtools view

This also collects where the specified transcript matched.

In [None]:
which = "TRINITY_DN714_c0_g2_i1" # Here select the transcript, whose hits are desired: This was used for manual annotation
which_hit = []
tolerance = 300 # <- This is the point where the flanking-NCS-size was decided
regions_per_file = 18000 # only for the alternative of starting a new file every N rows; the loop below splits at `breakpoints`
breakpoints = [8000, 16000, 36000] # row numbers at which a new output file is started
n = 1 # running index of the current output file

bed_name_template = 'out/blastn_q.polyA_wrt_LmagMAC_dragen_%s.bed'

with open('blastn_q.polyA_wrt_LmagMAC_draft_genome.out') as blast_result:
    output = open(bed_name_template % n, "w")
    # try/finally guarantees that the currently open output file is closed
    # even if a malformed row raises half-way through the input.
    try:
        for row, line in enumerate(blast_result):
            if row % 10000 == 0:
                print(row)
            if row in breakpoints:
                n += 1
                output.close()
                output = open(bed_name_template % n, "w")

            # Strip only the newline so that a final line without one does
            # not lose the last character of its last column.
            content = line.rstrip("\n").split("\t")
            contig = content[1]
            # Columns 8/9 may come in either order; normalise to start <= end.
            first, second = int(content[8]), int(content[9])
            start = min(first, second)
            end = max(first, second)
            # The contig length is read from the fourth "_"-separated token
            # of the contig name and used to clamp the right flank.
            length = int(contig.split("_")[3])
            # NOTE(review): start is written without a 1-based -> 0-based
            # shift; confirm whether BED consumers expect start - 1 here.
            output.write("%s\t%s\t%s\n" % (contig, max(0, start - tolerance), min(length, end + tolerance)))

            if content[0] == which:
                # NOTE(review): column 3 is labelled "%id" here - confirm this
                # matches the column layout of the blast output being read.
                which_hit.append("%s q[%s, %s] s[%s, %s]\t %%id=%s, #gap=%s, e=%s, score=%s" %
                                 (content[1], content[6], content[7], content[8], content[9],
                                  content[3], content[5], content[10], content[11]))
    finally:
        output.close()

print(which, "\n")
print("\n".join(which_hit))

## Splitting the training-annotation into test and training

In [None]:
node_whitelist = [1, 6, 7, 10, 12, 18, 19, 20, 21, 22, 23, 25,
           27, 29, 31, 38, 39, 42, 44, 54, 64, 65, 69, 70,
           71, 75, 78, 80, 84, 90, 91, 103, 104, 114, 127,
           132, 134, 135, 145, 150, 153, 155, 159, 160,
           164, 176, 177, 181, 202, 206, 207, 210, 217,
           222, 227, 244, 246, 250, 259, 262, 264, 265,
           267, 271, 273, 278, 279, 296, 297, 301, 322, 
           324, 336, 339, 344, 345, 358, 364, 376, 379, 
           386, 389, 410, 426, 427, 450, # 455, #<- this one contains NNN!
           469, 476, 479, 485, 487, 502, 508, 518, 521, 
           547, 555, 582, 585, 586, # 615, # <- this one has the over-long introns with little support from read-coverage
           624, 626, 632, 641, 662, 
           663, 712, 721, 725, 726, 748, 762, 774, 810, 
           847, 889, 893, 896, 918, 926, 998, 1067, 1083, 
           1087, 1148, 1178, 1295, 1401, 1404, 1482, 1721, 
           1864, 2222, 3449, 4094, 4127, 4354, 9798, 13726]
print(len(node_whitelist), "contigs were have useable annotation")

node_whitelist = [i for i in node_whitelist if i > 50] 
# reduce set slightly to make computations faster (throw out longest contigs)
print("->", len(node_whitelist))

# split into test and train:
# A fixed seed makes the split reproducible under "Restart & Run All".
# The value itself is arbitrary. No seed was set in this cell before, so a
# split drawn earlier without a seed cannot be recovered by this value.
SPLIT_SEED = 20200913
test_size = 65
train_size = len(node_whitelist) - test_size
# A dedicated RandomState leaves numpy's global random state untouched.
training_whitelist = np.random.RandomState(SPLIT_SEED).choice(node_whitelist, train_size, replace=False)
# Membership is tested against a set of plain ints rather than the array.
_training_ids = set(training_whitelist.tolist())
test_whitelist = [i for i in node_whitelist if i not in _training_ids]

# Sanity checks: the two lists must partition the whitelist.
assert len(test_whitelist) == test_size, "unexpected test-set size"
assert len(training_whitelist) + len(test_whitelist) == len(node_whitelist)
assert _training_ids.isdisjoint(test_whitelist)

print("#training =", len(training_whitelist), "#testing =", len(test_whitelist))
print("training:\n", training_whitelist)
print("testing:\n", test_whitelist)

In [None]:
infile_name = "scaffolds.fasta"
outfile_name = "out/intermediate_testing.fasta"

# Create/truncate the FASTA output before the input is opened.
with open(outfile_name, "w") as out:
    pass

# Copy every FASTA record whose header's second "_"-separated token is a
# node number contained in test_whitelist; all other records are skipped.
keep_record = False
with open(infile_name) as fasta_in, open(outfile_name, "a") as fasta_out:
    for fasta_line in fasta_in:
        if fasta_line.startswith(">"):
            # this previously was: in node_whitelist
            keep_record = int(fasta_line.split("_")[1]) in test_whitelist
        if keep_record:
            fasta_out.write(fasta_line)


def _belongs_to_training(gff_line):
    """Decide whether one line of the full annotation is kept for training."""
    if gff_line.startswith("##sequen"):
        # Sequence-region directive: the node number sits in the 2nd tab field.
        return int(gff_line.split("\t")[1].split("_")[1]) in training_whitelist
    if gff_line.startswith("N"):
        # Annotation row: the node number sits in the 1st tab field.
        return int(gff_line.split("\t")[0].split("_")[1]) in training_whitelist
    # Remaining "##" directives are kept unless they start with "##seq".
    return gff_line.startswith("##") and not gff_line.startswith("##seq")


with open("scaffolds.only_training_dataset_before_september.cleaned.gff") as full_annotation, \
    open("out/intermediate_training.gff", "w") as outfile:
    # Collect first, write once: nothing is written if a line fails to parse.
    kept_lines = [gff_line for gff_line in full_annotation if _belongs_to_training(gff_line)]
    outfile.write("".join(kept_lines))