# Extract_LTRs.ipynb
Script used to extract LTRs from consensus_sequences

### Step 0 : Paths and modules

In [2]:
#%% Files

# path to TE consensus sequences
consensus_seqs = '/home/valentin-grenet/Bureau/Données/Resources_yann/Final_library.fasta'
# path to results directory
final_dir = "/home/valentin-grenet/Bureau/Données/TE_sequences"
# path to inventory of blast matches used for LTR sequences
blast_file = "/home/valentin-grenet/Bureau/Données/TE_sequences/blast_results_filtered.tsv"
# path to LTR informations (length, coordinates)
LTR_stats = "/home/valentin-grenet/Bureau/Données/TE_sequences/LTR_sequences_results.tsv"

#%% Modules

import os
import subprocess

from Bio import SeqIO                           # Used to have a uniform interface for input and output sequence file formats
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

### Step 1 : Extract each consensus sequences and LTR coordinates in individual fasta files

In [3]:
def blastn(query, subject, outfile=False):
    '''Execute a blastn query and return the results'''
    cmd = ['blastn',
           '-query', query,
           '-subject', subject,
           '-outfmt', '6 length pident qseqid qstart qend sseqid sstrand sstart send sseq evalue'
           ]

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    output = proc.stdout.read()
    blast_out = output.splitlines()
    
    if outfile:
        with open(outfile, 'w') as f:
            for line in blast_out:
                f.write(line + '\n')
    
    return [line.split(b'\t') for line in output.splitlines()]          # added a b in line.split to turn in bytes object

os.chdir(final_dir)
TEs = {seq_record.id: seq_record for seq_record in SeqIO.parse(consensus_seqs, "fasta")}
TE_info = {}
count_TE = 0

blast_list = open(blast_file, "w")
headers = ["aln_len", "%_id", "consensus", "query_start", "query_stop","target_consensus","strand","target_start","target_stop","sequence","e-value"]
blast_list.write('\t'.join(headers) + '\n')

for TE in TEs:
    count_TE += 1
    print('Annotating ' + TE + ', sequence length : ' + str(len(TEs[TE])))           # Creating consensus directory
    basename = TE.split('#')[0]
    os.mkdir(basename)
    os.chdir(basename)
    SeqIO.write(TEs[TE], basename + '.fasta', 'fasta')                                                  # fichier fasta pour chaque séquence
    
    selfblast = blastn(basename + '.fasta', basename + '.fasta')[1:]                                          # selfblast sur la séquence consensus
    selfblast.sort(key=lambda x: (int(x[7])))
    
    i=0
    for result in selfblast:
        result.pop(2)
        print(i)
        print(result)
        i+=1
    print("end of blast\n")
    input("Test " + str(count_TE))
    LTR = input(basename)
    
    blast_list.write(str(b'\t'.join(selfblast[int(LTR)]) + b'\n', encoding="utf-8"))
    os.chdir('..')

blast_list.close()

Annotating consensus_Cluster_100_subfam_1#LTR:Copia, sequence length : 10626
0
[b'1373', b'100.000', b'1', b'1373', b'consensus_Cluster_100_subfam_1#LTR:Copia', b'plus', b'1', b'1373', b'TGAGGATGATAGTGGTATTTTGATGATTAATACAATAATTAAGAGTATGTNNNNNNNNNNNTACAAGTTAATTCGATACATCATTNATAAGTCTGTCACTCTCNGTACATTAGTATAATGAGATAAAGATGCATTTCANNNNNNNNNNNNNNNNNNNNATTTGNGATAGNAAAGGGATAGTGAATTCTAAATTCNAATTNNGCAAAGNTTCAAGTGANANNTACTCTTAAGNGGACAATAGTAATTTTCATTGTTNNGAGTATGTAGCACNANCATGNCATANATTCAAAATTGNTTNAAGNNTATTTCATTGATTATATGGATGAATCTAACCTTAAGTTGCAGCAAAGTGTGGATTGAGTCGACCCCAAGATTGATTGAGTCGACCCCGGCACATCAAGAAACCATCTGGCACACTTCTGCAAAAATGGCACGGATGAACAGTAGTTGGGTCGACCCAAGNAAAGGATGAGTCGACCCAATCCTCAAGCCTCAAGAAAACAGTTCTCTGGAAACCCTGAGAGGGTCGACCCAAATGAAACTTGAGTCGACCCAAACTTGAGTCGACCCCAAGTTAACATGAGTCGACCCAAAGGCAATGACATTCAAAAATAGGTTCTCTGGAAACCCTGAGAGGGTCGACCCAAGTGGAAGTTGAGTCGACCCAACTGAACCTTGAGTCGACCCAAAAANGTGTTGAGTCGACCCAAGTGAAGGAAGGCTGAAATGCAAGGTTCTGTGGTTNCTGAGAGGGTCGACCCCANNNNNNNNNNNNNNNNNNNNNGTAAANGTTGAGTCGACCCAAGTGAAAGT

### Step 2 : Extract LTR sequences

In [5]:
def WriteLTR(start_3, consensus, start_5, TEs):
    basename = consensus.split('#')[0]
    os.chdir(basename)
    len_TE = len(TEs[consensus])
    len_LTR = len_TE - start_3 + (start_5 - 1)
    start_3 = len_TE-len_LTR
    SeqIO.write(TEs[consensus][:len_LTR], basename + '.5LTR.fasta', 'fasta')
    SeqIO.write(TEs[consensus][start_3:], basename + '.3LTR.fasta', 'fasta')
    os.chdir("..")
    return [basename, str(len_LTR), str(start_3), str(len_TE)]

os.chdir(final_dir)
TEs = {seq_record.id: seq_record for seq_record in SeqIO.parse(consensus_seqs, "fasta")}
LTRs = []
for line in open(blast_file, "r"):
    if "aln_len" not in line:
        columns = line.split("\t")
        LTRs.append(WriteLTR(int(columns[2]), columns[4], int(columns[6]), TEs))

LTR_infos = open(LTR_stats, "w")
headers = ["consensus", "length_LTR", "3-LTR_start", "length_consensus"]
LTR_infos.write("\t".join(headers) + "\n")
for LTR in LTRs:
    print(LTR)
    LTR_infos.write("\t".join(LTR) + "\n")
LTR_infos.close()
    

['consensus_Cluster_100_subfam_1', '1687', '8939', '10626']
['consensus_Cluster_100_subfam_2', '1748', '9015', '10763']
['consensus_Cluster_101_subfam_1', '816', '11827', '12643']
['consensus_Cluster_101_subfam_2', '816', '11773', '12589']
['consensus_Cluster_103_subfam_1', '868', '11988', '12856']
['consensus_Cluster_103_subfam_2', '899', '11829', '12728']
['consensus_Cluster_111_subfam_1', '993', '11498', '12491']
['consensus_Cluster_111_subfam_2', '1048', '11891', '12939']
['consensus_Cluster_117_subfam_1', '1216', '6863', '8079']
['consensus_Cluster_117_subfam_2', '1370', '7015', '8385']
['consensus_Cluster_124_subfam_1', '1157', '11426', '12583']
['consensus_Cluster_124_subfam_2', '1017', '11468', '12485']
['consensus_Cluster_130_subfam_2', '931', '4950', '5881']
['consensus_Cluster_144_subfam_1', '981', '11026', '12007']
['consensus_Cluster_144_subfam_2', '965', '11030', '11995']
['consensus_Cluster_157_subfam_1', '1903', '9936', '11839']
['consensus_Cluster_157_subfam_2', '1843'