Step 0 : Paths, Modules and Functions

In [2]:
#%% Files

# Path to genome
genome = '/home/valentin-grenet/Bureau/Données/Resources_yann/GCF_009389715.1_palm_55x_up_171113_PBpolish2nd_filt_p_genomic.fna'
# Path to RepeatMasker hits of all hits
Repeat_all_data = '/home/valentin-grenet/Bureau/Données/Resources_yann/Repeat_LTR_all.out'
# Path to RepeatMasker hits of cdhit
Repeat_cdhit_data = '/home/valentin-grenet/Bureau/Données/Resources_yann/Repeat_LTR_cdhit.out'
# Path to Inpactor hits of all hits
initial_library = '/home/valentin-grenet/Bureau/Données/Resources_yann/initial_library.fasta'
# Path to Inpactor hits of cdhit
final_library = '/home/valentin-grenet/Bureau/Données/Resources_yann/Final_library.fasta'
# Path to sequences of genome contigs
contig_dir = '/home/valentin-grenet/Bureau/Données/genome_sequences'
# Path to sequences of consensus
sequences_dir = '/home/valentin-grenet/Bureau/Données/LTR_cdhit_sequences'
# Path to libraries
libraries_dir = '/home/valentin-grenet/Bureau/Données/Gene_libraries/'
# Path to Dante classification
Dante_data = '/home/valentin-grenet/Bureau/Données/Resources_yann/DANTE_initial_like.txt'
# Path to output directory
results_dir = '/home/valentin-grenet/Bureau/Données/cdhit_classification'
# Path to MrBayes
mb = "/home/valentin-grenet/Bureau/Outils/MrBayes/src/mb"


#%% Modules

import os
import subprocess

from Bio import SeqIO                           # Used to have a uniform interface for input and output sequence file formats
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord             # SeqRecord will be the object of the sequence file during the analysis in code


Step 1 : Extraction of RepeatMasker hit coordinates

In [3]:
def ExtractCoordinatesRepeat(file, dico_hits):     # file = masking_hits
    '''Function used to extract all hit coordinates obtained with
    RepeatMasker.'''
    index=0
    for line in open(file, "r"):
        index+=1                # line indices (begin with 1)
        hit = {}
        if line[0]=="#":    # skip header lines
            continue

        column = []
        elements = line.split(" ")
        for i in range(0,len(elements)):
            if elements[i] != "":
                column.append(elements[i])
        if column[9] not in dico_hits:      # list of coordinates for each consensus
            dico_hits[column[9]] = []
            # print(column[9])
        hit["contig"] = column[4]
        hit["contig_start"] = column[5]
        hit["contig_stop"] = column[6]
        hit["strand"] = column[8]
        strand = 0
        if hit["strand"] == "C":
            strand = 1
        hit["TE_start"] = column[11+strand]
        hit["TE_stop"] = column[12+strand]
        hit["perc_id"] = 100 - sum(map(float, column[1:3]))
        hit["index"] = index
        dico_hits[column[9]].append(hit)
    return(dico_hits)

dico_repeat_hits = {}
# dico_repeat_hits = ExtractCoordinatesRepeat(Repeat_all_data, dico_repeat_hits)
dico_repeat_hits = ExtractCoordinatesRepeat(Repeat_cdhit_data, dico_repeat_hits)
for hit in dico_repeat_hits["consensus_Cluster_67_subfam_2"]:
    print(hit)

{'contig': 'NC_016740.1', 'contig_start': '102549', 'contig_stop': '103252', 'strand': 'C', 'TE_start': '5985', 'TE_stop': '5278', 'perc_id': 62.0, 'index': 3}
{'contig': 'NC_016740.1', 'contig_start': '313453', 'contig_stop': '313596', 'strand': '+', 'TE_start': '5869', 'TE_stop': '6015', 'perc_id': 65.5, 'index': 8}
{'contig': 'NW_024067666.1', 'contig_start': '807948', 'contig_stop': '807981', 'strand': '+', 'TE_start': '808', 'TE_stop': '841', 'perc_id': 97.1, 'index': 126}
{'contig': 'NW_024067666.1', 'contig_start': '2465696', 'contig_stop': '2470367', 'strand': '+', 'TE_start': '1', 'TE_stop': '4679', 'perc_id': 96.9, 'index': 234}
{'contig': 'NW_024067670.1', 'contig_start': '936493', 'contig_stop': '936560', 'strand': '+', 'TE_start': '779', 'TE_stop': '848', 'perc_id': 74.4, 'index': 1180}
{'contig': 'NW_024067671.1', 'contig_start': '115754', 'contig_stop': '115842', 'strand': '+', 'TE_start': '12034', 'TE_stop': '12120', 'perc_id': 91.9, 'index': 1915}
{'contig': 'NW_024067

Step 2 : Extraction of sequences (LTRs, genome and consensus)

In [3]:
def ExtractLengths(consensus, file, dico_sequences):
    dico_sequences[consensus] = 0
    if file in os.listdir('.'):
        for line in open(file, "r"):
            if line[0] != ">":
                dico_sequences[consensus] += len(line)-1
    return dico_sequences

# dico_genome = ExtractSequences(genome, {})
dico_genome = {seq_record.id: seq_record for seq_record in SeqIO.parse(genome, "fasta")}
# dico_TEs = ExtractSequences(initial_library, {})

os.chdir(sequences_dir)
dico_consensus = {}; dico_5LTR = {}; dico_RT = {}; dico_RH = {}; dico_PROT = {}; dico_INT = {}; dico_GAG = {}; dico_3LTR = {}
for consensus in os.listdir('.'):
    if "c" in consensus:
        os.chdir(consensus)
        if "c" in consensus:
            dico_consensus = ExtractLengths(consensus, consensus + ".fasta", dico_consensus)
            dico_5LTR = ExtractLengths(consensus, consensus + ".5-LTR.fasta", dico_5LTR)
            dico_RT = ExtractLengths(consensus, consensus + ".RT.fasta", dico_RT)
            dico_RH = ExtractLengths(consensus, consensus + ".RH.fasta", dico_RH)
            dico_PROT = ExtractLengths(consensus, consensus + ".PROT.fasta", dico_PROT)
            dico_INT = ExtractLengths(consensus, consensus + ".RT.fasta", dico_INT)
            dico_GAG = ExtractLengths(consensus, consensus + ".GAG.fasta", dico_GAG)
            dico_3LTR = ExtractLengths(consensus, consensus + ".3-LTR.fasta", dico_3LTR)
        os.chdir("..")

dico_length = {}
for consensus in dico_consensus:
    dico_length[consensus] = {"complete":dico_consensus[consensus], 
                              "5-LTR":dico_5LTR[consensus],
                              "RT":dico_RT[consensus],
                              "RH":dico_RH[consensus],
                              "PROT":dico_PROT[consensus],
                              "INT":dico_INT[consensus],
                              "GAG":dico_GAG[consensus],
                              "3-LTR":dico_3LTR[consensus],}
                            #   "LTR":len(dico_5LTR[consensus])}
    print(consensus, dico_length[consensus])


c304 {'complete': 7159, '5-LTR': 398, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 397}
c82_s1 {'complete': 13555, '5-LTR': 1378, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 1371}
c144_s2 {'complete': 11995, '5-LTR': 981, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 965}
c0 {'complete': 10209, '5-LTR': 2357, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 2354}
c204_s1 {'complete': 10874, '5-LTR': 1698, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 1693}
c53_s1 {'complete': 13177, '5-LTR': 851, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 851}
c103_s2 {'complete': 12728, '5-LTR': 900, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 899}
c62_s2 {'complete': 13619, '5-LTR': 1339, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 1336}
c64_s2 {'complete': 6756, '5-LTR': 856, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 855}
c221_s2 {'complete': 10991, '5-LTR': 2617, 'RT': 0, 'RH': 0, 

Step 3.1 : Simple recognition - Classification with length of LTR and full TE

In [None]:
def CheckLength(hit, length, count):
    length_hit = int(hit["stop"]) - int(hit["start"])
    if length_hit < 0.8*length["5-LTR"]:
        count[0] += 1
        return "trunc", count
    elif length_hit > 0.8*length["complete"]:
        count[3] += 1
        return "paired", count
    elif length_hit > 0.8*(length["complete"]-length["5-LTR"]):
        count[2] += 1
        return "single", count
    else:
        count[1] += 1
        return "solo", count

os.chdir(sequences_dir)
headers = ["index", "contig", "start", "stop", "length", "strand", "element"]
dico_count = {}

for consensus in dico_repeat_hits :
    print(consensus)
    os.chdir(consensus)
    output_consensus = open(consensus + ".LTR_types.tsv", "w")
    output_consensus.write("\t".join(headers) + "\n")
    count = [0,0,0,0]
    
    for hit in dico_repeat_hits[consensus]:
        status, count = CheckLength(hit, dico_length[consensus], count)
        print(count)
        line = [str(hit["index"]), hit["contig"], hit["start"], hit["stop"], str(int(hit["stop"])-int(hit["start"])), hit["strand"], status]
        output_consensus.write("\t".join(line)+"\n")
    
    output_consensus.close()
    dico_count[consensus] = count
    os.chdir("..")

output_count = open("annotation_counts.tsv", "w")
headers = ["consensus", "length", "trunc", "solo", "single", "paired"]
output_count.write("\t".join(headers) + "\n")

for consensus in dico_count:
    output_count.write(consensus + "\t" + str(dico_length[consensus]["complete"]) + "\t" + "\t".join(str(i) for i in dico_count[consensus]) + "\n")   

Step 3.2 : Advanced recognition - Classification with length and blast of all elements

In [8]:
# plutot faire un principe de repérer un LTR par blast, puis regarder autour si on a un autre LTR
# utiliser les coordonnées matchées sur le consensus
# ajouter la divergence dans le dico de repeat (avec donc les coordonnées des consensus masqués)

def ClassLTR(hit, consensus_length, consensus_hits, count):
    if CheckComplete(hit, consensus_length):
        count[4] += 1
        return [hit], "complete", count
    else:
        if CheckLTR(hit, consensus_length):
            if CheckPaired(hit, consensus_length, consensus_hits):
                TE_hits = BuildTE(hit, consensus_length, consensus_hits)
                count[3] += 1
                return TE_hits, "paired", count
            else:
                if CheckSingle(hit, consensus_length, consensus_hits):
                    TE_hits = BuildTE(hit, consensus_length, consensus_hits)
                    count[2] += 1
                    return TE_hits, "single", count
                else:
                    count[1] += 1
                    return [hit], "solo", count
        else:
            count[0] += 1
            return [], "trunc", count

def CheckComplete(hit, consensus_length):
    length_hit = int(hit["contig_stop"]) - int(hit["contig_start"])
    return length_hit >= 0.8*consensus_length["complete"]

def CheckLTR(hit, consensus_length):
    if hit["strand"] == "+" :
        if int(hit["TE_start"]) <= consensus_length["5-LTR"] \
        or int(hit["TE_stop"]) >= consensus_length["complete"]-consensus_length["3-LTR"]:
            return int(hit["TE_stop"])-int(hit["TE_start"]) >= 0.5*consensus_length["5-LTR"] \
                and int(hit["perc_id"]) >= 75
        else:
            return False
    else:
        if int(hit["TE_stop"]) <= consensus_length["5-LTR"] \
        or int(hit["TE_start"]) >= consensus_length["complete"]-consensus_length["3-LTR"]:
            return int(hit["TE_start"])-int(hit["TE_stop"]) >= 0.5*consensus_length["5-LTR"] \
                and int(hit["perc_id"]) >= 75
        else:
            return False

def CheckPaired(query_hit, consensus_length, consensus_hits):
    if int(query_hit["TE_start"]) < consensus_length["5-LTR"] :          # change due to error of pairing with itself, so if we have the 5-LTR
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"])
        else:
            coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"]
        # coord_min = coord_start + consensus_length["complete"] - (1.2*consensus_length["3-LTR"])
        coord_max = coord_start + consensus_length["complete"] + (0.2*consensus_length["3-LTR"])
        print(query_hit["index"], coord_start, coord_max)
        for other_hit in consensus_hits:
            if coord_start <= int(other_hit["contig_stop"]) <= coord_max \
            and query_hit["contig"] == other_hit["contig"] \
            and query_hit["index"] != other_hit["index"]:               # added for smaller TE paired anyway (linked with suppression of coord_min)
                if CheckLTR(other_hit, consensus_length):
                    return True
    else:                                                                   # else we have the 3-LTR
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"]
        else:
            coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"])
        coord_min = coord_start - consensus_length["complete"] - (0.2*consensus_length["3-LTR"])
        # coord_max = coord_start - consensus_length["complete"] + (1.2*consensus_length["3-LTR"])
        print(query_hit["index"], coord_start, coord_min)
        for other_hit in consensus_hits:
            if coord_min <= int(other_hit["contig_start"]) <= coord_start \
            and query_hit["contig"] == other_hit["contig"] \
            and query_hit["index"] != other_hit["index"]:               # added for smaller TE paired anyway (linked with suppression of coord_max)
                if CheckLTR(other_hit, consensus_length):
                    return True
    return False

def CheckSingle(query_hit, consensus_length, consensus_hits):
    length_hit = int(hit["contig_stop"]) - int(hit["contig_start"])
    if length_hit >= consensus_length["5-LTR"]+500:
        return True
    else:
        if int(query_hit["TE_start"]) < consensus_length["5-LTR"] :          # change due to error of pairing with itself
            if query_hit["strand"] == "+" :
                coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"]) + consensus_length["5-LTR"]
            else:
                coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"] + consensus_length["5-LTR"]
            for other_hit in consensus_hits:
                if coord_start <= int(other_hit["contig_start"]) <= coord_start+500 \
                and query_hit["contig"] == other_hit["contig"]:
                    return True
        else:
            if query_hit["strand"] == "+" :
                coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"] - consensus_length["5-LTR"]
            else:
                coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"]) - consensus_length["5-LTR"]
            for other_hit in consensus_hits:
                if coord_start-500 <= int(other_hit["contig_stop"]) <= coord_start \
                and query_hit["contig"] == other_hit["contig"]:
                    return True
        return False

def BuildTE(query_hit, consensus_length, consensus_hits):
    TE_hits = []
    if int(query_hit["TE_start"]) < consensus_length["5-LTR"] :          # change due to error of pairing with itself
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"])
        else:
            coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"]
        for other_hit in consensus_hits:
            # print(other_hit["contig_start"])
            if coord_start <= int(other_hit["contig_start"]) <= coord_start+1.2*consensus_length["complete"] \
            and query_hit["contig"] == other_hit["contig"]:
                TE_hits.append(other_hit)
    else:
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"]
        else:
            coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"])
        for other_hit in consensus_hits:
            if coord_start-1.2*consensus_length["complete"] <= int(other_hit["contig_stop"]) <= coord_start \
            and query_hit["contig"] == other_hit["contig"]:
                TE_hits.append(other_hit)
    return TE_hits



os.chdir(sequences_dir)
headers = ["index", "contig", "start", "stop", "length", "strand", "id_percentage", "TE_start", "TE_stop", "classification", "index", "start", "stop", "length", "strand", "id_percentage", "TE_start", "TE_stop"]
dico_count = {}

for consensus in dico_repeat_hits :
    print(consensus)
    os.chdir(consensus)
    output_consensus = open(consensus + ".LTR_classif.tsv", "w")
    output_consensus.write("\t".join(headers) + "\n")
    count = [0,0,0,0,0]
    hits_used = []
    
    for hit in dico_repeat_hits[consensus]:
        if hit in hits_used:
            continue
        else:
            TE_hits, status, count = ClassLTR(hit, dico_length[consensus], dico_repeat_hits[consensus], count)
            line = [str(hit["index"]), hit["contig"], hit["contig_start"], hit["contig_stop"], str(int(hit["contig_stop"])-int(hit["contig_start"])), hit["strand"], str(hit["perc_id"]), hit["TE_start"], hit["TE_stop"], status]
            output_consensus.write("\t".join(line))
            if len(TE_hits) == 0 :
                output_consensus.write('\tno match\n')
            else:
                new_line = '\t'
                for other_hit in TE_hits:
                    hits_used.append(other_hit)
                    line = [str(other_hit["index"]), other_hit["contig_start"], other_hit["contig_stop"], str(int(other_hit["contig_stop"])-int(other_hit["contig_start"])), other_hit["strand"], str(other_hit["perc_id"]), other_hit["TE_start"], other_hit["TE_stop"]]
                    output_consensus.write(new_line + "\t".join(line) + "\n")
                    new_line = 10*'\t'

    
    output_consensus.close()
    dico_count[consensus] = count
    os.chdir("..")

output_count = open("annotation_counts.tsv", "w")
headers = ["consensus", "length", "trunc", "solo", "single", "paired", "complete"]
output_count.write("\t".join(headers) + "\n")

for consensus in dico_count:
    output_count.write(consensus + "\t" + str(dico_length[consensus]["complete"]) + "\t" + "\t".join(str(i) for i in dico_count[consensus]) + "\n")   

NameError: name 'dico_repeat_hits' is not defined

Step 3.3 : Major updates on recognition - 3.2 is working well, so now we are making ameliorations

done :
- for reverse strand sense, uncomplete fragment then added for a single or a paired LTR are still written as trunc -> option to keep them in memory, and then write them as trunc or in the paired or single LTR, and modify the count list
- add the option to recheck the completion of the LTR analyzed to pass it from paired to complete (or fragmented)
- manage the cases where the LTR is complete, but too small to be considered as a complete (internal sequences truncated) --> considered as shrunked
- change of management between LTR scan on the left or on the right, because it was not working well before

comment on the efficiency :
the script will never be perfect as the threshold will always miss some LTR which should be considered as a classification above (false negative) and consider some LTR in a classification above where they should be (false positive)

In [None]:
def SearchSense(strand, TE_start, TE_stop, LTR_length):
    sign = strand == "+"
    number = TE_start < LTR_length or TE_stop < LTR_length
    if sign and number:
        return True
    elif not sign and not number:
        return True
    else:
        return False
    
def FixCount(status, count):
    list_status = ["trunc", "solo", "single", "paired", "fragmented", "shrunked", "complete"]
    count[list_status.index(status)] -= 1
    return count

def ClassLTR(hit, consensus_length, consensus_hits, count):
    if CheckComplete(hit, consensus_length):
        count[6] += 1
        return [hit], "complete", count
    elif CheckShrunked(hit, consensus_length):            # peut-être clippé
        count[5] += 1
        return [hit], "shrunked", count
    elif CheckLTR(hit, consensus_length):
        if CheckPaired(hit, consensus_length, consensus_hits):
            TE_hits = BuildTE(hit, consensus_length, consensus_hits)
            if CheckFragmented(TE_hits, consensus_length):
                count[4] += 1
                return TE_hits, "fragmented", count
            else:
                count[3] += 1
                return TE_hits, "paired", count
        elif CheckSingle(hit, consensus_length, consensus_hits):
            TE_hits = BuildTE(hit, consensus_length, consensus_hits)
            count[2] += 1
            return TE_hits, "single", count
        else:
            count[1] += 1
            return [hit], "solo", count
    else:
        count[0] += 1
        return [], "trunc", count

def CheckComplete(hit, consensus_length):
    length_hit = int(hit["contig_stop"]) - int(hit["contig_start"])
    return length_hit >= 0.8*consensus_length["complete"]

def CheckShrunked(hit, consensus_length):
    coord_5LTR = 0.5*consensus_length["5-LTR"]
    coord_3LTR = consensus_length["complete"] - 0.5*consensus_length["3-LTR"]
    if hit["strand"] == "+":
        return int(hit["TE_start"]) < coord_5LTR and int(hit["TE_stop"]) > coord_3LTR
    else:
        return int(hit["TE_stop"]) < coord_5LTR and int(hit["TE_start"]) > coord_3LTR

def CheckLTR(hit, consensus_length):
    if hit["strand"] == "+" :
        if int(hit["TE_start"]) <= consensus_length["5-LTR"] \
        or int(hit["TE_stop"]) >= consensus_length["complete"]-consensus_length["3-LTR"]:
            return int(hit["TE_stop"])-int(hit["TE_start"]) >= 0.5*consensus_length["5-LTR"] \
                and int(hit["perc_id"]) >= 75
        else:
            return False
    else:
        if int(hit["TE_stop"]) <= consensus_length["5-LTR"] \
        or int(hit["TE_start"]) >= consensus_length["complete"]-consensus_length["3-LTR"]:
            return int(hit["TE_start"])-int(hit["TE_stop"]) >= 0.5*consensus_length["5-LTR"] \
                and int(hit["perc_id"]) >= 75
        else:
            return False

def CheckPaired(query_hit, consensus_length, consensus_hits):
    if SearchSense(query_hit["strand"], int(query_hit["TE_start"]), int(query_hit["TE_stop"]), consensus_length["5-LTR"]):          # change due to error of pairing with itself, so if we have the 5-LTR
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"])
        else:
            coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"]
        coord_min = coord_start + consensus_length["complete"] - (1.2*consensus_length["3-LTR"])
        coord_max = coord_start + consensus_length["complete"] + (1.2*consensus_length["3-LTR"])
        if query_hit["contig"] == "NW_024067682.1":
            print(query_hit["index"], coord_min, coord_max)
        for other_hit in consensus_hits:
            if coord_min <= int(other_hit["contig_stop"]) <= coord_max \
            and query_hit["contig"] == other_hit["contig"] \
            and query_hit["index"] != other_hit["index"]:               # added for smaller TE paired anyway (linked with suppression of coord_min)
                if CheckLTR(other_hit, consensus_length):
                    return True
    else:                                                                   # else we have the 3-LTR
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"]
        else:
            coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"])
        coord_min = coord_start - consensus_length["complete"] - (1.2*consensus_length["3-LTR"])
        coord_max = coord_start - consensus_length["complete"] + (1.2*consensus_length["3-LTR"])
        for other_hit in consensus_hits:
            if coord_min <= int(other_hit["contig_start"]) <= coord_max \
            and query_hit["contig"] == other_hit["contig"] \
            and query_hit["index"] != other_hit["index"]:               # added for smaller TE paired anyway (linked with suppression of coord_max)
                if CheckLTR(other_hit, consensus_length):
                    return True
    return False

def CheckFragmented(TE_hits, consensus_length):
    length_covered = 0
    for hit in TE_hits:
        if length_covered == 0:
            length_covered = int(hit["contig_stop"]) - int(hit["contig_start"])
            coord_max = int(hit["contig_stop"])
        else:
            length_covered += int(hit["contig_stop"]) - max(coord_max, int(hit["contig_start"]))
    return length_covered >= 0.8*consensus_length["complete"]

def CheckSingle(query_hit, consensus_length, consensus_hits):
    length_hit = int(hit["contig_stop"]) - int(hit["contig_start"])
    if length_hit >= consensus_length["5-LTR"]+500:
        return True
    else:
        if SearchSense(query_hit["strand"], int(query_hit["TE_start"]), int(query_hit["TE_stop"]), consensus_length["5-LTR"]):           # change due to error of pairing with itself
            if query_hit["strand"] == "+" :
                coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"]) + consensus_length["5-LTR"]
            else:
                coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"] + consensus_length["5-LTR"]
            for other_hit in consensus_hits:
                if coord_start <= int(other_hit["contig_start"]) <= coord_start+500 \
                and query_hit["contig"] == other_hit["contig"]:
                    return True
        else:
            if query_hit["strand"] == "+" :
                coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"] - consensus_length["5-LTR"]
            else:
                coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"]) - consensus_length["5-LTR"]
            for other_hit in consensus_hits:
                if coord_start-500 <= int(other_hit["contig_stop"]) <= coord_start \
                and query_hit["contig"] == other_hit["contig"]:
                    return True
        return False

def BuildTE(query_hit, consensus_length, consensus_hits):
    TE_hits = []
    if SearchSense(query_hit["strand"], int(query_hit["TE_start"]), int(query_hit["TE_stop"]), consensus_length["5-LTR"]):           # change due to error of pairing with itself
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"])
        else:
            coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"]
        for other_hit in consensus_hits:
            # print(other_hit["contig_start"])
            if coord_start <= int(other_hit["contig_start"]) <= coord_start+1.2*consensus_length["complete"] \
            and query_hit["contig"] == other_hit["contig"]:
                TE_hits.append(other_hit)
    else:
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"]
        else:
            coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"])
        for other_hit in consensus_hits:
            if coord_start-1.2*consensus_length["complete"] <= int(other_hit["contig_stop"]) <= coord_start \
            and query_hit["contig"] == other_hit["contig"]:
                TE_hits.append(other_hit)
    return TE_hits



os.chdir(sequences_dir)
headers = ["index", "contig", "start", "stop", "length", "strand", "id_percentage", "TE_start", "TE_stop", "classification", "index", "start", "stop", "length", "strand", "id_percentage", "TE_start", "TE_stop"]
dico_count = {}

for consensus in dico_repeat_hits :
    print(consensus)
    os.chdir(consensus)
    output_consensus = open(consensus + ".LTR_classif.tsv", "w")
    output_consensus.write("\t".join(headers) + "\n")
    count = [0,0,0,0,0,0,0]
    hits_used = []
    list_lines = []
    
    for hit in dico_repeat_hits[consensus]:
        if hit in hits_used:
            continue
        else:
            TE_hits, status, count = ClassLTR(hit, dico_length[consensus], dico_repeat_hits[consensus], count)
            ref_line = [str(hit["index"]), hit["contig"], hit["contig_start"], hit["contig_stop"], str(int(hit["contig_stop"])-int(hit["contig_start"])), hit["strand"], str(hit["perc_id"]), hit["TE_start"], hit["TE_stop"], status]
            list_lines.append(ref_line)
            if len(TE_hits) == 0 :
                list_lines.append(["\tno_match\n"])
            else:
                new_line = ''
                for other_hit in TE_hits:
                    hits_used.append(other_hit)
                    for line in list_lines:
                        if str(other_hit["index"]) == line[0] and line != ref_line:
                            del list_lines[list_lines.index(line):list_lines.index(line)+2]
                            count = FixCount(line[-1], count)
                    line = [new_line, str(other_hit["index"]), other_hit["contig_start"], other_hit["contig_stop"], str(int(other_hit["contig_stop"])-int(other_hit["contig_start"])), other_hit["strand"], str(other_hit["perc_id"]), other_hit["TE_start"], other_hit["TE_stop"], "\n"]
                    list_lines.append(line)
                    new_line = 9*'\t'

    for line in list_lines:
        output_consensus.write("\t".join(line))
    output_consensus.close()
    dico_count[consensus] = count
    os.chdir("..")

output_count = open("annotation_counts.tsv", "w")
headers = ["consensus", "length", "trunc", "solo", "single", "paired", "fragmented", "shrunked", "complete"]
output_count.write("\t".join(headers) + "\n")

for consensus in dico_count:
    output_count.write(consensus + "\t" + str(dico_length[consensus]["complete"]) + "\t" + "\t".join(str(i) for i in dico_count[consensus]) + "\n")   

Step 3.4 : Minor updates on recognition - 3.3 is working very well, so now we are making optimisations

to do :
- uncomplete fragments added after the end of the LTR --> count them as trunc, or keep them as they could be part of the TE ? (ex : 34971 for c67_s2)
- sometimes, a complete fragment is considered as paired --> identify them with maybe a second check of complete, or the impossibility to add a too long fragment (ex : 154500 in consensus67_s2)
- consider the paired LTR too close to be considered as paired (important lack of internal sequence) (ex : TE of 9000 instead of 12000 for 5346 in c67_s2), but manage the threshold to not paired the LTR with himself if the LTR is too long
- parameters modified by the user

done :
- completion factor from 0.8 to 0.95 because for the next step, some complete fragments didn't have both LTR

comment on the efficiency :
the script will never be perfect as the threshold will always miss some LTR which should be considered as a classification above (false negative) and consider some LTR in a classification above where they should be (false positive)

In [5]:
def SearchSense(strand, TE_start, TE_stop, LTR_length):
    sign = strand == "+"
    number = TE_start < LTR_length or TE_stop < LTR_length
    if sign and number:
        return True
    elif not sign and not number:
        return True
    else:
        return False
    
def FixCount(status, count):
    list_status = ["trunc", "solo", "single", "paired", "fragmented", "shrunked", "complete"]
    count[list_status.index(status)] -= 1
    return count

def ClassLTR(hit, consensus_length, consensus_hits, count):
    if CheckComplete(hit, consensus_length):
        count[6] += 1
        return [hit], "complete", count
    elif CheckShrunked(hit, consensus_length):            # peut-être clippé
        count[5] += 1
        return [hit], "shrunked", count
    elif CheckLTR(hit, consensus_length):
        if CheckPaired(hit, consensus_length, consensus_hits):
            TE_hits = BuildTE(hit, consensus_length, consensus_hits)
            if CheckFragmented(TE_hits, consensus_length):
                count[4] += 1
                return TE_hits, "fragmented", count
            else:
                count[3] += 1
                return TE_hits, "paired", count
        elif CheckSingle(hit, consensus_length, consensus_hits):
            TE_hits = BuildTE(hit, consensus_length, consensus_hits)
            count[2] += 1
            return TE_hits, "single", count
        else:
            count[1] += 1
            return [hit], "solo", count
    else:
        count[0] += 1
        return [], "trunc", count

def CheckComplete(hit, consensus_length):
    length_hit = int(hit["contig_stop"]) - int(hit["contig_start"])
    return length_hit >= 0.95*consensus_length["complete"]

def CheckShrunked(hit, consensus_length):
    coord_5LTR = 0.5*consensus_length["5-LTR"]
    coord_3LTR = consensus_length["complete"] - 0.5*consensus_length["3-LTR"]
    if hit["strand"] == "+":
        return int(hit["TE_start"]) < coord_5LTR and int(hit["TE_stop"]) > coord_3LTR
    else:
        return int(hit["TE_stop"]) < coord_5LTR and int(hit["TE_start"]) > coord_3LTR

def CheckLTR(hit, consensus_length):
    if hit["strand"] == "+" :
        if int(hit["TE_start"]) <= consensus_length["5-LTR"] \
        or int(hit["TE_stop"]) >= consensus_length["complete"]-consensus_length["3-LTR"]:
            return int(hit["TE_stop"])-int(hit["TE_start"]) >= 0.5*consensus_length["5-LTR"] \
                and int(hit["perc_id"]) >= 75
        else:
            return False
    else:
        if int(hit["TE_stop"]) <= consensus_length["5-LTR"] \
        or int(hit["TE_start"]) >= consensus_length["complete"]-consensus_length["3-LTR"]:
            return int(hit["TE_start"])-int(hit["TE_stop"]) >= 0.5*consensus_length["5-LTR"] \
                and int(hit["perc_id"]) >= 75
        else:
            return False

def CheckPaired(query_hit, consensus_length, consensus_hits):
    if SearchSense(query_hit["strand"], int(query_hit["TE_start"]), int(query_hit["TE_stop"]), consensus_length["5-LTR"]):          # change due to error of pairing with itself, so if we have the 5-LTR
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"])
        else:
            coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"]
        coord_min = coord_start + consensus_length["complete"] - (1.2*consensus_length["3-LTR"])
        coord_max = coord_start + consensus_length["complete"] + (1.2*consensus_length["3-LTR"])
        if query_hit["contig"] == "NW_024067682.1":
            print(query_hit["index"], coord_min, coord_max)
        for other_hit in consensus_hits:
            if coord_min <= int(other_hit["contig_stop"]) <= coord_max \
            and query_hit["contig"] == other_hit["contig"] \
            and query_hit["index"] != other_hit["index"]:               # added for smaller TE paired anyway (linked with suppression of coord_min)
                if CheckLTR(other_hit, consensus_length):
                    return True
    else:                                                                   # else we have the 3-LTR
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"]
        else:
            coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"])
        coord_min = coord_start - consensus_length["complete"] - (1.2*consensus_length["3-LTR"])
        coord_max = coord_start - consensus_length["complete"] + (1.2*consensus_length["3-LTR"])
        for other_hit in consensus_hits:
            if coord_min <= int(other_hit["contig_start"]) <= coord_max \
            and query_hit["contig"] == other_hit["contig"] \
            and query_hit["index"] != other_hit["index"]:               # added for smaller TE paired anyway (linked with suppression of coord_max)
                if CheckLTR(other_hit, consensus_length):
                    return True
    return False

def CheckFragmented(TE_hits, consensus_length):
    length_covered = 0
    for hit in TE_hits:
        if length_covered == 0:
            length_covered = int(hit["contig_stop"]) - int(hit["contig_start"])
            coord_max = int(hit["contig_stop"])
        else:
            length_covered += int(hit["contig_stop"]) - max(coord_max, int(hit["contig_start"]))
    return length_covered >= 0.8*consensus_length["complete"]

def CheckSingle(query_hit, consensus_length, consensus_hits):
    length_hit = int(hit["contig_stop"]) - int(hit["contig_start"])
    if length_hit >= consensus_length["5-LTR"]+500:
        return True
    else:
        if SearchSense(query_hit["strand"], int(query_hit["TE_start"]), int(query_hit["TE_stop"]), consensus_length["5-LTR"]):           # change due to error of pairing with itself
            if query_hit["strand"] == "+" :
                coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"]) + consensus_length["5-LTR"]
            else:
                coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"] + consensus_length["5-LTR"]
            for other_hit in consensus_hits:
                if coord_start <= int(other_hit["contig_start"]) <= coord_start+500 \
                and query_hit["contig"] == other_hit["contig"]:
                    return True
        else:
            if query_hit["strand"] == "+" :
                coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"] - consensus_length["5-LTR"]
            else:
                coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"]) - consensus_length["5-LTR"]
            for other_hit in consensus_hits:
                if coord_start-500 <= int(other_hit["contig_stop"]) <= coord_start \
                and query_hit["contig"] == other_hit["contig"]:
                    return True
        return False

def BuildTE(query_hit, consensus_length, consensus_hits):
    TE_hits = []
    if SearchSense(query_hit["strand"], int(query_hit["TE_start"]), int(query_hit["TE_stop"]), consensus_length["5-LTR"]):           # change due to error of pairing with itself
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_start"]) - int(query_hit["TE_start"])
        else:
            coord_start = int(query_hit["contig_start"]) + int(query_hit["TE_start"]) - consensus_length["complete"]
        for other_hit in consensus_hits:
            # print(other_hit["contig_start"])
            if coord_start <= int(other_hit["contig_start"]) <= coord_start+1.2*consensus_length["complete"] \
            and query_hit["contig"] == other_hit["contig"]:
                TE_hits.append(other_hit)
    else:
        if query_hit["strand"] == "+" :
            coord_start = int(query_hit["contig_stop"]) - int(query_hit["TE_stop"]) + consensus_length["complete"]
        else:
            coord_start = int(query_hit["contig_stop"]) + int(query_hit["TE_stop"])
        for other_hit in consensus_hits:
            if coord_start-1.2*consensus_length["complete"] <= int(other_hit["contig_stop"]) <= coord_start \
            and query_hit["contig"] == other_hit["contig"]:
                TE_hits.append(other_hit)
    return TE_hits



os.chdir(sequences_dir)
headers = ["index", "contig", "start", "stop", "length", "strand", "id_percentage", "TE_start", "TE_stop", "classification", "index", "start", "stop", "length", "strand", "id_percentage", "TE_start", "TE_stop"]
dico_count = {}

for consensus in dico_repeat_hits :
    print(consensus)
    os.chdir(consensus)
    output_consensus = open(consensus + ".LTR_classif.tsv", "w")
    output_consensus.write("\t".join(headers) + "\n")
    count = [0,0,0,0,0,0,0]
    hits_used = []
    list_lines = []
    
    for hit in dico_repeat_hits[consensus]:
        if hit in hits_used:
            continue
        else:
            TE_hits, status, count = ClassLTR(hit, dico_length[consensus], dico_repeat_hits[consensus], count)
            ref_line = [str(hit["index"]), hit["contig"], hit["contig_start"], hit["contig_stop"], str(int(hit["contig_stop"])-int(hit["contig_start"])), hit["strand"], str(hit["perc_id"]), hit["TE_start"], hit["TE_stop"], status]
            list_lines.append(ref_line)
            if len(TE_hits) == 0 :
                list_lines.append(["\tno_match\n"])
            else:
                new_line = ''
                for other_hit in TE_hits:
                    hits_used.append(other_hit)
                    for line in list_lines:
                        if str(other_hit["index"]) == line[0] and line != ref_line:
                            del list_lines[list_lines.index(line):list_lines.index(line)+2]
                            count = FixCount(line[-1], count)
                    line = [new_line, str(other_hit["index"]), other_hit["contig_start"], other_hit["contig_stop"], str(int(other_hit["contig_stop"])-int(other_hit["contig_start"])), other_hit["strand"], str(other_hit["perc_id"]), other_hit["TE_start"], other_hit["TE_stop"], "\n"]
                    list_lines.append(line)
                    new_line = 9*'\t'

    for line in list_lines:
        output_consensus.write("\t".join(line))
    output_consensus.close()
    dico_count[consensus] = count
    os.chdir("..")

output_count = open("annotation_counts.tsv", "w")
headers = ["consensus", "length", "trunc", "solo", "single", "paired", "fragmented", "shrunked", "complete"]
output_count.write("\t".join(headers) + "\n")

for consensus in dico_count:
    output_count.write(consensus + "\t" + str(dico_length[consensus]["complete"]) + "\t" + "\t".join(str(i) for i in dico_count[consensus]) + "\n")   

consensus_Cluster_67_subfam_2
5346 940834.4 942965.6
consensus_Cluster_103_subfam_2
consensus_subfam23_Cluster_34
consensus_Cluster_55_subfam_1
consensus_Cluster_62_subfam_1
consensus_Cluster_227
5429 1241689.2 1247902.8
consensus_Cluster_160_subfam_2
consensus_Cluster_215_subfam_2
consensus_Cluster_332
consensus_Cluster_113
consensus_Cluster_194
consensus_Cluster_160_subfam_1
consensus_Cluster_46_subfam_1
consensus_Cluster_419
consensus_Cluster_226
consensus_Cluster_71_subfam_2
consensus_Cluster_103_subfam_1
consensus_Cluster_336_subfam_1
consensus_Cluster_411
consensus_Cluster_186
5168 411383.4 417654.6
consensus_Cluster_21_subfam_2
5100 94855.6 99948.4
consensus_Cluster_112
consensus_Cluster_205_subfam_1
5259 734790.8 738717.2
consensus_Cluster_69_subfam_1
consensus_Cluster_172_subfam_1
5465 1333080.8 1340463.2
5534 1432981.8 1440364.2
consensus_Cluster_165_subfam_1
consensus_Cluster_204_subfam_2
5469 1346319.8 1350078.2
consensus_Cluster_207_163
consensus_Cluster_320
consensus_Clus

In [7]:
output_count = open("annotation_counts.tsv", "w")
headers = ["consensus", "length", "trunc", "solo", "single", "paired", "fragmented", "shrunked", "complete"]
output_count.write("\t".join(headers) + "\n")

for consensus in dico_count:
    output_count.write(consensus + "\t" + str(dico_length[consensus]["complete"]) + "\t" + "\t".join(str(i) for i in dico_count[consensus]) + "\n")   

Step 4 : Extraction of LTR coordinates

In [6]:
# {consensus1 : [liste de LTR], consensus2 : [liste de LTR], ...}
# liste de LTR = [{LTR1_hit1}, {LTR1_hit2}, {LTR15_hit1}, {LTR15_hit2}, ...]
# LTR1_hit1 = {id : index_status_5LTR, contig, contig_start, contig_stop, strand, id_percentage, TE_start, TE_stop}
# then we will add the sequence to that dictionnary

def Check5LTR(TE_start, TE_stop, strand, consensus_length):
    if strand == "+":
        return TE_start < consensus_length["5-LTR"] \
           and min(TE_stop,consensus_length["5-LTR"]) - TE_start > 0.8*consensus_length["5-LTR"]
    else:
        return TE_stop < consensus_length["5-LTR"]\
           and min(TE_start,consensus_length["5-LTR"]) - TE_stop > 0.8*consensus_length["5-LTR"]

def Check3LTR(TE_start, TE_stop, strand, consensus_length):
    if strand == "+":
        return TE_stop > consensus_length["complete"] - consensus_length["3-LTR"] \
           and TE_stop - max(TE_start,consensus_length["complete"]-consensus_length["3-LTR"]) > 0.5*consensus_length["3-LTR"]
    else:
        return TE_start > consensus_length["complete"] - consensus_length["3-LTR"] \
           and TE_start - max(TE_stop,consensus_length["complete"]-consensus_length["3-LTR"]) > 0.5*consensus_length["3-LTR"]

def CountRepetitions(list_hits, id):
    i=1
    new_id = id
    for hit in list_hits:
        if id in hit["id"]:
            i+=1
            new_id = id + "." + str(i)
    return new_id

def WriteLTR(getfasta_library, dico_LTR, consensus_length, contig_length):
    print(id)
    if dico_LTR["strand"] == "+":
        if "5LTR" in dico_LTR["id"]:
            coordinate_start = max(int(dico_LTR["contig_start"]) - int(dico_LTR["TE_start"]) - 50, 1)
            coordinate_stop = min(int(dico_LTR["contig_stop"]), coordinate_start+consensus_length["5-LTR"]+100)
        else:
            coordinate_stop = min(int(dico_LTR["contig_stop"]) + consensus_length["complete"] - int(dico_LTR["TE_stop"]) + 50, contig_length)
            coordinate_start = max(int(dico_LTR["contig_start"]), coordinate_stop-consensus_length["3-LTR"]-100)
    else:
        dico_LTR["strand"] = "-"
        if "5LTR" in dico_LTR["id"]:
            coordinate_stop = min(int(dico_LTR["contig_stop"]) + int(dico_LTR["TE_stop"]) + 50, contig_length)
            coordinate_start = max(int(dico_LTR["contig_start"]), coordinate_stop-consensus_length["5-LTR"]-100)
        else:
            coordinate_start = max(int(dico_LTR["contig_start"]) - consensus_length["complete"] + int(dico_LTR["TE_start"]) - 50, 1)
            coordinate_stop = min(int(dico_LTR["contig_stop"]), coordinate_start+consensus_length["3-LTR"]+100)
    line = [dico_LTR["contig"], str(coordinate_start), str(coordinate_stop), dico_LTR["id"], "0", dico_LTR["strand"]]
    getfasta_library.write("\t".join(line) + "\n")
    # remember the columns in bed files :
    # contig    contig_start    contig_stop     name    (score)   strand

def AddLTR(elements, id, contig):
    return {"id":id, "contig":contig,
            "contig_start":elements[11],
            "contig_stop":elements[12],
            "strand":elements[14],
            "id_percentage":elements[15],
            "TE_start":elements[16],
            "TE_stop":elements[17] }

contigs = {seq_record.id: seq_record.seq for seq_record in SeqIO.parse(genome, "fasta")}
for contig in contigs:
    dico_length[contig] = len(contigs[contig])

os.chdir(sequences_dir)
dico_LTRs_by_consensus = {}
consensus = "c87_s2"
# for consensus in os.listdir("."):
if consensus == "c87_s2":
    getfasta_library = open(consensus + "/" + consensus + ".LTR_coordinates.bed", "w")
    print(consensus, dico_length[consensus])
    dico_LTRs_by_consensus[consensus] = []
    for line in open(consensus + "/" + consensus + ".LTR_classif.tsv", "r"):
        elements = line.split("\t")
        if elements[10] == "no_match\n" or elements[9] == "classification":
            continue
        # elif elements[0] == "":
        #     dico_one_LTR = {}
        #     if Check5LTR and dico_one_LTR["5-LTR"] != {}:
        #         doc_one_LTR["5-LTR"] = {"contig", "contig-start", "contig-stop", "strand", "TE-start", "TE-stop"}
        #     elif Check3LTR and dico_one_LTR["3-LTR"] != {}:
        #         doc_one_LTR["3-LTR"] = {"contig", "contig-start", "contig-stop", "strand", "TE-start", "TE-stop"}
        else:
            if elements[0] != "":
                index = elements[0]
                contig = elements[1]
                status = elements[9]
            if Check5LTR(int(elements[16]), int(elements[17]), elements[14], dico_length[consensus]):
                # print(id)
                id = CountRepetitions(dico_LTRs_by_consensus[consensus], "%s_%s_5LTR" % (index, status))
                dico_LTRs_by_consensus[consensus].append(AddLTR(elements, id, contig))
                WriteLTR(getfasta_library, dico_LTRs_by_consensus[consensus][-1], dico_length[consensus], dico_length[contig])
            if Check3LTR(int(elements[16]), int(elements[17]), elements[14], dico_length[consensus]):       # pas de elif car les fragments complets ont les 2 LTR, donc à enregistrer 2 fois sous 2 id
                # print(id)
                id = CountRepetitions(dico_LTRs_by_consensus[consensus], "%s_%s_3LTR" % (index, status))
                dico_LTRs_by_consensus[consensus].append(AddLTR(elements, id, contig))
                if id == "23660_solo_3LTR":
                    print(dico_LTRs_by_consensus[consensus][-1])
                WriteLTR(getfasta_library, dico_LTRs_by_consensus[consensus][-1], dico_length[consensus], dico_length[contig])
    getfasta_library.close()            

c87_s2 {'complete': 10349, '5-LTR': 1504, 'RT': 0, 'RH': 0, 'PROT': 0, 'INT': 0, 'GAG': 0, '3-LTR': 1505}
2560_single_3LTR
2909_single_5LTR
2997_single_3LTR
3574_single_3LTR
3593_single_5LTR
5831_single_5LTR
5946_complete_5LTR
5946_complete_3LTR
6102_single_5LTR
6104_single_3LTR
6867_single_5LTR
7565_solo_3LTR
7784_single_5LTR
8131_single_3LTR
9683_solo_3LTR
10590_single_5LTR
13875_single_5LTR
14027_fragmented_3LTR
14027_fragmented_5LTR
15007_single_3LTR
17213_complete_5LTR
17213_complete_3LTR
17865_single_3LTR
17865_single_5LTR
19030_single_5LTR
19484_complete_5LTR
19484_complete_3LTR
19679_single_3LTR
19679_single_5LTR
20308_solo_3LTR
20376_single_5LTR
21452_solo_3LTR
21561_single_3LTR
22312_single_5LTR
22741_solo_5LTR
{'id': '23660_solo_3LTR', 'contig': 'NW_024067728.1', 'contig_start': '849746', 'contig_stop': '850246', 'strand': 'C', 'id_percentage': '80.1', 'TE_start': '10923', 'TE_stop': '9934'}
23660_solo_3LTR
23675_solo_3LTR
23690_solo_3LTR
25217_single_5LTR
26737_single_3LTR


Step 5 : Extraction of sequence for each LTR --> done with a bash script with the .bed file, then the alignments and phylogeny can be done on Jalview, and then the phylogeny on MrBayes

In [3]:
# Run Extract_LTR_for_phylogeny.sh

os.chdir(sequences_dir)

def RunMrBayes(consensus, count):
    os.chdir(consensus)
    nexus_file = "mrbayes_commands.nex"
    mrbayes_script = open(nexus_file, "w")
    file = "\texecute " + consensus + ".LTR_alignment.nex;"
    generations = "\tmcmc ngen=%s samplefreq=100 printfreq=100 diagnfreq=1000 nchains=5;" % str(count*1000)
    list_lines = ["#NEXUS\n",
                  "begin mrbayes;",
                  file,
                  "\tprset brlenspr=clock:uniform;",
                  "\tlset nst=2 rates=invgamma;",
                  generations,
                  "\tsump;",
                  "\tsumt;",
                  "end;"]
    mrbayes_script.write("\n".join(list_lines))
    mrbayes_script.close()
    os.system("mpirun -np 10 %s %s > mrbayes_run.log" % (mb, nexus_file))
    os.chdir("..")

count_file = open("../annotation_counts.tsv", "r")
list_counts = []
for line in count_file:
    if "paired" not in line:
        elements = line.split("\t")
        list_counts.append([elements[0], int(elements[-1][:-1])])

list_counts.sort(key=lambda x: (x[1]))
i=0
for consensus in list_counts:
    i+=1
    print(i, consensus[0])
    RunMrBayes(consensus[0], consensus[1])




1 c64_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


2 c399_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


3 c111_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


4 c82_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


5 c79_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


6 c60_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


7 c339_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


8 c64_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


9 c15_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


10 c339_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


11 c34_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


12 c411


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


13 c124_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


14 c384_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


15 c117_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


16 c79_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


17 c384_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


18 c46_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


19 c419


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


20 c53_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


21 c399_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


22 c46_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


23 c366


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


24 c320


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


25 c43_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


26 c60_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


27 c83_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


28 c117_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


29 c221_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


30 c43_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


31 c130_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


32 c83_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


33 c8_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


34 c133


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


35 c194


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


36 c332


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


37 c15_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


38 c71_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


39 c111_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


40 c347_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


41 c53_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


42 c62_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


43 c71_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


44 c204_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


45 c57_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


46 c244


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


47 c347_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


48 c144_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


49 c242_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


50 c62_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


51 c124_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


52 c144_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


53 c82_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


54 c304


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


55 c361


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


56 c103_s2


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


57 c336_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


58 c215_s1


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


59 c145


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


60 c226


hwloc/linux: Ignoring PCI device with non-16bit domain.
Pass --enable-32bits-pci-domain to configure to support such devices


Step X : Context of insertion

- gene context (most important)
- recombination context
- TF context
- GC content
- non B conformation
- maybe methylation and polymorphism with flowcell