# Annotate LTR retrotransposons using exisiting consensus sequences

#### Example data: annotate RLC_BdisC022 on chromosome 5 of *B. distachyon*
#### From Material & Methods: 
"Because a consensus library for the different TE families of *B. distachyon* is available on the TREP database (http://botserv2.uzh.ch/kelldata/trep-db/index.html), we used these sequences as a starting point to annotate LTR-RTs. For the sake of consistency, we used the same approach to annotate TEs in the new assembly and to re-annotate them in the reference genome. The LTR sequence of each of the 21 Copia and 19 Gypsy consensus sequences was blasted against the assemblies. Hits which covered at least 80 percent of the LTR were retained and sorted according to their position on the chromosome. We then traversed the sorted hits and compared adjacent LTR pairs. A hit pair was denoted intact if the two hits belonged to the same family, were on the same strand, and the distance between them corresponded to the distance expected from the consensus sequence, with an error margin of 20 percent to account for indels. Otherwise the hit was denoted a single LTR. A single LTR was classified as solo LTR if it was flanked by identical 4-mers, being evidence for a target site duplication (TSD), and lacked internal TE sequence in its 500 bp flanking regions. For the comparison of intact and solo elements, we only included intact elements satisfying the same stringent criteria, in this case requiring TSDs and the presence of internal TE sequence 500 bp up- or downstream of the LTRs."

In [1]:
#%% Define paths to input files, set parameters

# path to genome
genome = '/home/valentin-grenet/Bureau/Données/Resources_yann/GCF_009389715.1_palm_55x_up_171113_PBpolish2nd_filt_p_genomic.fna'
# path to TE consensus sequences
consensus_seqs = '/home/valentin-grenet/Bureau/Données/Resources_yann/Final_library.fasta'
# path to TE masking sequences
masking_seqs = '/home/valentin-grenet/Bureau/Données/LTR_cdhit_alignment/LTR_masking.fasta'
# path to TE masking statistics
masking_statistics = '/home/valentin-grenet/Bureau/Données/LTR_cdhit_alignment/Repeat_LTR_cdhit.out'

# only retain blast hits which cover at least min_ltr_cov of the LTRs
min_ltr_cov = 0.80
# imprecision rate allowed
imprec = 0.2
write_files = True

In [2]:
#%% Some classes and functions used for the annotation
import os
import subprocess

from Bio import SeqIO                           # Used to have a uniform interface for input and output sequence file formats
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord             # SeqRecord will be the object of the sequence file during the analysis in code


#%% some functions and classes

class LTR_hit:
    
    # def __init__(self, hit):                    # Initial function to create an object of class LTR_hit
        
    #     coords = list(map(int, hit[7:9]))       # Added list to allow several iterations
    #     strt, end = min(coords), max(coords)
        
    #     self.hit = {'aln_len' : int(hit[0]),        # alignment length
    #                 'pid' : float(hit[1]),          # Identity percentage
    #                 'eval' : float(hit[10]),        # e-value
                    
    #                 'TE' : hit[2].split(b'.')[0],   # TE name
    #                 'TE_strt' : int(hit[3]),        # TE start
    #                 'TE_end' : int(hit[4]),         # TE end
                    
    #                 'contig' : hit[5],
    #                 'c_strnd' : hit[6],             # strand sense
    #                 'c_strt' : strt,                # start on plus strand
    #                 'c_end' : end,                  # end on plus strand
    #                 'c_seq' : Seq(hit[9])           # TE sequence
    #                 }

    def __init__(self, hit, seq):                    # Initial function to create an object of class LTR_hit
            
            coords = list(map(int, hit[11:13]))       # Added list to allow several iterations
            strt, end = min(coords), max(coords)
            
            # print(int(hit[6])-int(hit[5]))
            # print(len(seq.seq))
            
            self.hit = {'aln_len' : int(hit[6])-int(hit[5])+1,        # alignment length
                        'pid' : 100-float(hit[1])-float(hit[2])-float(hit[3]),          # Identity percentage
                        #'eval' : float(hit[10]),        # e-value
                        'score' : int(hit[0]),
                        
                        'TE' : "%s:%s" % (hit[9],hit[10]),   # TE name
                        'TE_strt' : strt,        # TE start
                        'TE_end' : end,         # TE end
                        
                        'contig' : hit[4],
                        'c_strnd' : hit[8],             # strand sense
                        'c_strt' : int(hit[5]),                # start on plus strand
                        'c_end' : int(hit[6]),                  # end on plus strand
                        'c_seq' : seq.seq           # TE sequence
                        }

    def add_flanking(self, contigs, n=8):
        
        chrmsm = self.hit['contig']
        strt = self.hit['c_strt']
        end = self.hit['c_end']
        
        self.five_prime = contigs[str(chrmsm)][strt-(n+1) : strt-1]           # added str(...) to decode bytes
        
        self.three_prime = contigs[str(chrmsm)][end : end+n]
        
        if self.hit['c_strnd'] == '-':
            self.five_prime = self.five_prime.reverse_complement()
            self.three_prime = self.three_prime.reverse_complement()
            

def makeListHits(file_headers):
    # sequences = []
    annotations = []
    # for line in open(file_sequences, "r"):
    #     if line[0]!=">":
    #         sequences.append(line[:-1])
    # print(sequences)
    count=0
    for line in open(file_headers, "r"):
        if line[0]=="#":
            continue
        headers = []
        elements = line.split(" ")
        for i in range(0,len(elements)):
            if elements[i] != "":
                headers.append(elements[i])
        if headers[8] == "C":
            headers[8] = "-"
        if "(" in headers[11]:
            headers.pop(11)
        elif "(" in headers[13]:
            headers.pop(13)
        # headers.append(sequences[i])
        annotations.append(headers)
        # count+=1
    return(annotations)

def blastn(query, subject, outfile=False):
    '''Execute a blastn query and return the results'''
    cmd = ['blastn',
           '-query', query,
           '-subject', subject,
           '-outfmt', '6 length pident qseqid qstart qend sseqid sstrand sstart send sseq evalue'
           ]

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    output = proc.stdout.read()
    blast_out = output.splitlines()
    
    if outfile:
        with open(outfile, 'w') as f:
            for line in blast_out:
                f.write(line + '\n')
    
    return [line.split(b'\t') for line in output.splitlines()]          # added a b in line.split to turn in bytes object


def is_tsd(five_seq, three_seq, overlap=4):

    tsd_max_len = len(five_seq)
    
    four_mers_l = []
    four_mers_r = []
    
    if five_seq and three_seq:
     
        for i in range(tsd_max_len):
            if i+4 < tsd_max_len+1:
                four_mers_l.append(five_seq[i:i+4])
                four_mers_r.append(three_seq[i:i+4])
                
    if any(x in four_mers_r for x in four_mers_l):
        return '1'
    else:
        return '0'

    
def check_LTR_context(ltr_h, contigs, te_info, n=500, seq_cov=0.1):
    """ Parameters:
        - n: number of TE internal base pairs considered
        - seq_cov: how much of these base pairs must be covered?
    """
    
    chrmsm = str(ltr_h.hit['contig'])     # added str cause of encoding    
    
    coords = [ltr_h.hit['c_strt'], ltr_h.hit['c_end']]      # contig coordinates
    strt, end = min(coords), max(coords)
    te = str(ltr_h.hit['TE']).replace(":","#")
    
    # attenzione, strand confusioni!
    if ltr_h.hit['c_strnd'] == '+':      # LTR context = 500 nt sequence before and after the contig coordinates
        left_seq = contigs[chrmsm][strt-n:strt]
        right_seq = contigs[chrmsm][end:end+n]
    else:
        left_seq = contigs[chrmsm][end:end+n].reverse_complement()
        right_seq = contigs[chrmsm][strt-n:strt].reverse_complement()
    
    probe_left = SeqRecord(left_seq, id='probe_left', name='probe_left', description='')
    probe_right = SeqRecord(right_seq, id='probe_right', name='probe_right', description='')

    # regions expected to be covered on consensus
    basename = te.split('#')[0] 
    right_exp = (te_info[te]['ltr_len'], te_info[te]['ltr_len'] + n)
    left_exp = (te_info[te]['full_len'] - te_info[te]['ltr_len'] - n,  te_info[te]['full_len'] - te_info[te]['ltr_len'])
    SeqIO.write([probe_left, probe_right], 'tmp.fasta', 'fasta')
    blastout = blastn('tmp.fasta', basename + '/' + basename + '.fasta')
    os.remove('tmp.fasta')
    best = 0

    if blastout:
        for hit in blastout:
            strt = min(map(int, hit[7:9]))
            end = max(map(int, hit[7:9]))
            
            if hit[2] == b'probe_left':
                ol = is_overlapping(left_exp, (strt, end))               
                if not ol:
                    out = 'solo_LTR'
                elif ol > best:
                    best = ol
                    out = 'three_prime_LTR' if ol > seq_cov else 'NA'
                    
    
            elif hit[2] == b'probe_right':
                ol = is_overlapping(right_exp, (strt, end))
                if not ol:
                    out = 'solo_LTR'
                elif ol > best:
                    best = ol
                    out = 'five_prime_LTR' if ol > seq_cov else 'NA'
    else:
        out = 'solo_LTR'
                    
    return out
            
            
def is_overlapping(a,b):
    a_range = set(range(a[0], a[1]))
    b_range = set(range(b[0], b[1]))

    intersection = a_range & b_range
    union = a_range | b_range

    if len(intersection) == 0:
        return False
    else:
        return len(intersection) / float(len(union))

    
def assign_TE_cat(a, b, te_info, imprec, min_ltr_cov=0.8):
    
    """ Determine TE insertion category:
    
    - full length
    - solo LTR
    - single LTR
    - tsd yes or no
    
    1 in the output means TSD is present
    """
    
    te = str(a.hit['TE']).replace(":","#")                # added str cause of bit format, and here we got the LTR name
    if a.hit['aln_len'] < len(ltrs[te]) * min_ltr_cov:     # if not enough alignment between genome and consensus compared to selfblasted length, then it is truncated
        return 'trunc', 0, 0
    
    aa_tsd = is_tsd(a.five_prime, a.three_prime)           # test of TSD presence
    a_cntxt = check_LTR_context(a, contigs, te_info)
    
    if not b or te != b.hit['TE']: 
        return 'single', aa_tsd, a_cntxt
      
    else:
        b_cntxt = check_LTR_context(b, contigs, te_info)
        internal = te_info[te]['full_len'] -(2*te_info[te]['ltr_len'])
        interval = b.hit['c_strt'] - a.hit['c_end']
        ab_tsd = is_tsd(a.five_prime, b.three_prime)
        cntxt = a_cntxt + '-' + b_cntxt
        
        if (1-imprec)*internal < interval < (1+imprec)*internal and a.hit['c_strnd'] == b.hit['c_strnd']:
            return 'paired', ab_tsd, cntxt
        
        else:
            return  'single', aa_tsd, a_cntxt

In [3]:
#%% Load sequence data and get the LTR sequence for each element by blasting themselves on themselves
contigs = {seq_record.id: seq_record.seq for seq_record in SeqIO.parse(genome, "fasta")}
print(contigs)        # dictionnaire des séquences du génome à partir du fichier fasta
# SeqIO.parse sert justement à séparer les id de séquences (les parties avec ">") et les seq de séquences
tes = {seq_record.id: seq_record for seq_record in SeqIO.parse(consensus_seqs, "fasta")}            # dictionnaire des séquences consensus des TEs
masks = {seq_record.id: seq_record for seq_record in SeqIO.parse(masking_seqs, "fasta")}
annotations = makeListHits(masking_statistics)
print(annotations[0])
ltrs = {}

os.chdir('/home/valentin-grenet/Bureau/Données/LTR_cdhit_alignment')
os.mkdir('tmp')
os.chdir('tmp')

for seq in tes:
    basename = seq.split('#')[0]                                                                                     # Pour chaque séquence consensus
    print(seq)
    SeqIO.write(tes[seq], basename + '.fasta', 'fasta')                                                  # fichier fasta pour chaque séquence
    
    selfblast = blastn(basename + '.fasta', basename + '.fasta')[1:]                                          # selfblast sur la séquence consensus
    selfblast.sort(key=lambda x: (int(x[7])))
    
    ltrs[seq] = SeqRecord(Seq(selfblast[0][-2]), id=seq, name=seq, description='')                  # obtenir la séquence pour laquelle on a eu le meilleur blast                 
    os.remove(basename + '.fasta')

os.chdir('..')
os.rmdir('tmp')


{'chr1': Seq('tgatcacaaggtctgtaaactgcaaaggtccatctatggacaaaagcaagcatc...ATT'), 'chr2': Seq('CTTTGTTgtaatttttctaaattatatCACGCCTGCAATGCTTACTTAAAAGCC...gtc'), 'chr3': Seq('CTCATCACGCCAGGCCAAACCACCAACGTTCTCCTCCAAACCAAGCCCAGCCTC...tag'), 'chr4': Seq('CCACTGCATGCTCCTGCATGAGTTTCATGAATGGTTTGTTGAGCCATGTCGTCA...agt'), 'chr5': Seq('gtatctccttacgggacttcttcgccggtggggccccgctcggaagagctcgtt...TTT'), 'chr6': Seq('TGTAACCAATGCACTATAACCGCCATCAAAAATATTATCTGGGTAGTAGTGTAA...TAT'), 'chr7': Seq('CGTGATAGTTTCAGATCCCCAAGTACGTACCTGGACCATAAAATCTTTCATCAC...CTG'), 'chr8': Seq('TCTTCCGCCACCTCTGCCACTGCCGATAAGCTCCCTTCAGCCTTGATACAAAGA...TGT'), 'chr9': Seq('gacatgacatccaaccgggcgtccaatcggctcgtgagcccatcagtaatagtc...ttt'), 'chr10': Seq('ATTTAGCttaagaaaaaccaattcagtTATTTCAACTACATCGAATGATCTTTC...att'), 'chr11': Seq('AATTATTAGGAGTGTAGGAAATCAAATTAAAAGTTGAAACATTGTATCATTTCT...att'), 'chr12': Seq('ATACTTTTCATGTTTGTGGCCAAGAGGTGATTCTGGTGATCTCGGAGGGCACGA...cct'), 'chr13': Seq('GGTGGCAACACTCTAAATGGAGAAAACATAGAGAAAGGAAGATAGGGTTTTGGG...T

In [4]:
#%% Blast LTRs and parse hits in the reference genome from the consensus sequences of LTRs (mapping)

blast_d = {}
te_info = {}
overlaps = []
stats = {}      # Added
i = 0

for te in tes:           # for each masked result, we are going to create a ltr_hit object
    
    """ Blast LTR of each TE family against the assembly, 
    create LTR_hit object and extract sequences flanking the hit
    """
    
    print(tes[te])
    print('Annotating ' + te)           # Annotating RLC_BdisC022
    basename = te.split('#')[0]   
    
    os.mkdir(basename)
    os.chdir(basename)

    SeqIO.write(tes[te], basename + '.fasta', 'fasta')
    SeqIO.write(ltrs[te], basename + '.LTR.fasta', 'fasta')
    full_len = len(tes[te])                                                 # length of LTR consensus sequence
    ltr_len = len(ltrs[te])                                                 # length of LTR blasted sequence
    te_info[te] = {'full_len' : full_len, 'ltr_len' : ltr_len}

    os.chdir('..')
    # """ Blast LTR against targets, create dictionary with blast output with chromosomes as keys
    # """
    # stats[te] = {'retained' : 0 , 'trunc' : 0, 'overlap' : 0}
    # blast_out = blastn(te + '.LTR.fasta', genome)            # blast LTR sequence against annotated
    
for seq in masks:           # for each blast result
    hit = annotations[i]
    if i%1000 == 0:
        print(i)
        # Only consider hits covering at least 80% of the LTR
        # aln_len = int(hit[0])
#        if aln_len < min_ltr_cov*ltr_len:
#            stats[te]['trunc'] +=1
#            continue
                    
    chrmsm = hit[4]
    if chrmsm not in blast_d:                       # add the chromosome to the list
        print(chrmsm)
        blast_d[chrmsm] = {}
    
    coords = list(map(int, hit[5:7]))
    strt, end = min(coords), max(coords)
        
    ltr_h = LTR_hit(hit,masks[seq])                            # save blast result as a LTR hit, and so all informations about the LTR hit (with flanking sequences)
    ltr_h.add_flanking(contigs)
        
        
    # overlapping coordinates: keep hit with higher PID
    overlap = False
        
    worse_hits_to_remove = []
    for region in blast_d[chrmsm]:          # For each chromosome region
        ol = is_overlapping((strt,end), region)
            
        if ol:              # if there is an overlapping and the new ltr_hit has a better score, the actual result is prepared to be removed
            overlap = True
            # if ltr_h.hit['eval'] < blast_d[chrmsm][region].hit['eval']:
            if ltr_h.hit['score'] > blast_d[chrmsm][region].hit['score']:
                worse_hits_to_remove.append(region)
        
    if worse_hits_to_remove:        # here we remove the hits to change and replace them
        for h in worse_hits_to_remove:
            blast_d[chrmsm].pop(h)
        blast_d[chrmsm][(strt, end)] = ltr_h
                
    elif overlap == False:          # if no overlapping, the region is empty and the result has to be added anyway
        blast_d[chrmsm][(strt, end)] = ltr_h
    
    i+=1


ID: consensus_Cluster_100_subfam_1#LTR:Copia
Name: consensus_Cluster_100_subfam_1#LTR:Copia
Description: consensus_Cluster_100_subfam_1#LTR:Copia
Number of features: 0
Seq('TGAGGATGATAGTGGTATTTTGATGATTAATACAATaATTAAGAGTATGTnnnn...ACA')
Annotating consensus_Cluster_100_subfam_1#LTR:Copia


FileExistsError: [Errno 17] File exists: 'consensus_Cluster_100_subfam_1'

In [5]:
#%% Identify full length and solo copies, write gff and fasta files
annot = []
seqs = {}
ltr_seqs = {}
annot_stats = {}
i=0

for chrmsm in blast_d:
        print(chrmsm)
        
        """ Order blast hits and compare adjacent hits in order
        to distinguish solo from full length elements,
        and to identify overlapping annotations
        """
        
        coords = list(blast_d[chrmsm].keys())           # added list to make .sort functional
        coords.sort(key=lambda x: (int(x[0])))          # sort coordinates in chromosome

        a = False
        
        for i,x  in enumerate(coords):                  # i = iteration (0,1,2...) ; x = coordinates
            
            if not a: # initiate contig
                a = blast_d[chrmsm][x]
                
                if len(coords) == 1: # single hit in the contig
                    b = False
                    cat, tsd, ltr_cntxt = assign_TE_cat(a, b, te_info, imprec)
                    
                else:
                    continue                # the first iteration return directly to the for loop
            
            b = blast_d[chrmsm][x]          # at this point a = b (so same informations on the same LTR_hit object)
            cat, tsd, ltr_cntxt = assign_TE_cat(a, b, te_info, imprec)      # te_info = consensus and blasted LTR lengths ; imprec = imprecision rate
            
            """ Put together output info
            """
            TE = str(a.hit['TE'])
            strt = a.hit['c_strt']
            flanking_left = a.five_prime
            # careful with fasta header names, avoid - and :
            name = TE + '_' + str(chrmsm) + '_' + str(strt)
            
            if TE not in annot_stats:
                annot_stats[TE] = {'paired' : 0, 'single' : 0, 'solo_LTR' : 0, 'full' : 0, 'other' : 0, 'trunc':0}
                seqs[TE] = []
                ltr_seqs[TE] = []
            
            # strand
            strand = a.hit['c_strnd']
            
            # end coordinate
            if cat == 'paired':
                flanking_right = b.three_prime
                end = b.hit['c_end']
            else:
                flanking_right = a.three_prime
                end = a.hit['c_end']
                

            # Categorize and get LTR sequences 
            if cat == 'single':
                if ltr_cntxt == 'solo_LTR'and tsd == '1':
                    subcat = 'solo_LTR'
                else:
                    subcat = 'other'
                    
                ltr_rec = SeqRecord(a.hit['c_seq'], id=name, name=name, description=cat + '_' + subcat)
                ltr_seqs[TE].append(ltr_rec)
                
                
            if cat == 'paired':
                
                if strand == '-' and ltr_cntxt in ['three_prime_LTR-five_prime_LTR', 'three_prime_LTR-NA', 'NA-five_prime_LTR'] or \
                   strand == '+' and ltr_cntxt in ['five_prime_LTR-three_prime_LTR', 'NA-three_prime_LTR', 'five_prime_LTR-NA']:
                       
                    subcat = 'full' if tsd == '1' else 'other'
                    
                else:
                    subcat = 'other'
    
                ltr_recA = SeqRecord(a.hit['c_seq'], id=name + '_A', name=name + '_A', description=cat + '_' + subcat)
                ltr_recB = SeqRecord(b.hit['c_seq'], id=name + '_B', name=name + '_B', description=cat + '_' + subcat)
                ltr_seqs[TE] += [ltr_recA, ltr_recB]
                
            if cat == 'trunc':
                subcat = 'trunc'
                
                
            # TE sequence and length
            seq = contigs[str(chrmsm)][strt-1 : end]
            aln_len = len(seq)
            if strand == '-':
                seq = seq.reverse_complement()
            rec = SeqRecord(seq, id=name, name=name, description=cat+ '_' + subcat)
            seqs[TE].append(rec)
                

            """ Write reference hits to gff
            """
            info = 'Name=%s,fam=%s,cat=%s,subcat=%s,tsd=%s,ltr_context=%s,five_flanking=%s,three_flanking=%s,aln_len=%s' % \
                    (name, TE, cat, subcat, tsd, ltr_cntxt, flanking_left, flanking_right, aln_len)
                    
            annot.append([chrmsm, 'detettore', 'LTR-RT', str(strt), str(end), '.', strand, '.', info])

            annot_stats[TE][cat] += 1 
            annot_stats[TE][subcat] += 1
            
            # Skip next entry if it was part of a full element
            if cat  == 'paired':
                a = False
            else:
                a = b

NW_024069918.1
NW_024069919.1
NW_024069920.1




NW_024069921.1




NW_024069922.1




NW_024069923.1
NW_024069924.1
NW_024069925.1
NW_024069926.1
NW_024069928.1




NW_024069929.1
NW_024069930.1
NW_024069931.1
NW_024069932.1
NW_024069933.1




NW_024069934.1




NW_024069935.1
NW_024069936.1
NW_024069937.1
NW_024069938.1
NW_024069939.1




NW_024069940.1
NW_024069941.1
NW_024069942.1
NW_024069943.1




NW_024069944.1
NW_024069945.1
NW_024069946.1
NW_024069947.1
NW_024069948.1
NW_024069949.1
NW_024069950.1
NW_024069951.1
NW_024069952.1
NW_024069953.1
NW_024069954.1




NW_024069955.1
NW_024069956.1
NW_024069957.1
NW_024069958.1




NW_024069959.1
NW_024069960.1
NW_024069961.1
NW_024069962.1
NW_024069963.1
NW_024069964.1




NW_024069965.1
NW_024069966.1




NW_024069967.1
NW_024069968.1




NW_024069969.1
NW_024069970.1
NW_024069972.1
NW_024069973.1




NW_024069974.1
NW_024069975.1




NW_024069976.1
NW_024069977.1
NW_024069978.1




NW_024069979.1
NW_024069980.1
NW_024069981.1
NW_024069982.1
NW_024069983.1
NW_024069984.1




NW_024069985.1




NW_024069986.1




NW_024069987.1
NW_024069988.1
NW_024069989.1




NW_024069990.1
NW_024069991.1
NW_024069992.1
NW_024069993.1
NW_024069994.1




NW_024069995.1
NW_024069996.1
NW_024069997.1




NW_024069998.1
NW_024069999.1




NW_024070000.1




NW_024070001.1
NW_024070002.1




NW_024070003.1




NW_024070004.1




NW_024070005.1




NW_024070006.1
NW_024070007.1
NW_024070008.1




NW_024070009.1




NW_024070010.1
NW_024070011.1
NW_024070012.1




NW_024070013.1
NW_024070014.1




NW_024070015.1
NW_024070016.1




NW_024070017.1
NW_024070018.1
NW_024070019.1




NW_024070020.1




NW_024070021.1
NW_024070022.1




NW_024070023.1
NW_024070024.1




NW_024070025.1
NW_024070026.1




NW_024070027.1




NW_024070028.1




NW_024070029.1
NW_024070031.1
NW_024070032.1
NW_024070033.1




NW_024070034.1
NW_024070035.1
NW_024070036.1




chr1




chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr2
chr3
chr4
chr5




chr6
chr7
chr8
chr9




In [6]:
# Write files
                
stats_header = ['TE', 'paired', 'single', 'full', 'solo_LTR', 'other']
              
if write_files:
    annot.sort(key=lambda x: (x[0], int(x[3])))
    with open('LTR-RTs.dez2018.gff', 'w') as out:
        for a in annot:
            out.write('\t'.join(a)+'\n')
    
    for te in seqs:
        basename = te.split(':')[0] 
        SeqIO.write(seqs[te], basename + '.all_seqs.fasta', 'fasta')
        SeqIO.write(ltr_seqs[te], basename + '.ltr_seqs.fasta', 'fasta')
        
    
    with open('annotation_counts.tsv', 'w') as f:
        f.write('\t'.join(stats_header) + '\n')
        for te in annot_stats:
            outline = [te] + [str(annot_stats[te][c]) for c in stats_header[1:]]
            f.write('\t'.join(outline) + '\n')