In [7]:
# library import
from utils import *
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

## Length check

In [8]:
def cons_from_deepte_parser(cons_file):
    """
    Build a table of sequence lengths per TE order from a FASTA file.

    Each record name is expected to look like ``<id>__<label>``; the part
    after the first ``__`` is searched for known order markers (presumably the
    classification appended by DeepTE -- confirm against the tool's output).

    Parameters
    ----------
    cons_file : str or file handle
        FASTA file passed to ``SeqIO.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per classified record with the columns ``"Seq name"``,
        ``"Annotation"`` and ``"Length"``. Records whose name has no ``__``
        separator, or whose label matches none of the markers, are skipped.
        The frame is empty (but has the three columns) if nothing matches.
    """
    # (marker, annotation) pairs, tested in this order: the first marker found
    # in the label wins, so e.g. "_LINE" is matched before "_nLTR".
    order_markers = (
        ("_LTR", "LTR"),
        ("_MITE", "MITE"),
        ("_DNA", "DNA"),
        ("_Helitron", "DNA"),  # Helitrons are reported together with DNA elements
        ("_SINE", "SINE"),
        ("_LINE", "LINE"),
        ("_nLTR", "non_LTR"),
    )
    columns = ["Seq name", "Annotation", "Length"]

    # Collect plain records and build the frame once: concatenating a
    # one-row frame per sequence is quadratic in the number of sequences.
    records = []
    for seq_record in SeqIO.parse(cons_file, "fasta"):
        name_parts = seq_record.name.split("__")
        if len(name_parts) < 2:
            # No classification label in the header: treat it like any other
            # unclassifiable record instead of failing with IndexError.
            continue
        label = name_parts[1]

        annotation = next(
            (annot for marker, annot in order_markers if marker in label),
            None,
        )
        if annotation is None:
            continue

        records.append(
            {
                "Seq name": seq_record.name,
                "Annotation": annotation,
                "Length": len(seq_record.seq),
            }
        )

    return pd.DataFrame(records, columns=columns)

In [9]:
# Length tables for the three de novo libraries, parsed from their
# opt_DeepTE.fasta files (paths are relative to the notebook's directory).
RM2_data, EDTA_data, MITE_data = map(
    cons_from_deepte_parser,
    (
        "../../DeepTE/RM2/results/opt_DeepTE.fasta",
        "../../DeepTE/EDTA/results/opt_DeepTE.fasta",
        "../../DeepTE/MITE/results/opt_DeepTE.fasta",
    ),
)
# Reference library; parse_repbase is not defined in this notebook
# (presumably provided by `utils` -- confirm).
repbase_data = parse_repbase("../test/raw_lib/RepBase.fasta")

FileNotFoundError: [Errno 2] No such file or directory: '../../DeepTE/RM2/results/opt_DeepTE.fasta'

In [10]:
# Plotting inputs per library: display name -> (data, hex colour).
# The plotting cell filters element 0 on its "Annotation"/"Length" columns
# and uses element 1 as the marker colour of that library's traces.
dict_data = {
    "RepeatModeler2": (RM2_data, "#992339"),
    "EDTA": (EDTA_data, "#15616d"),
    "MITE-Tracker": (MITE_data, "#FF8811"),
    "RepBase": (repbase_data, "#14BDEB"),
}

NameError: name 'RM2_data' is not defined

In [3]:
# Length distribution per TE order: one subplot row per order, one grouped
# histogram trace per library.

# (order, bin width) pairs; the bin width also drives the x-axis range and the
# tick labels of the corresponding row.
annot_lst = [("LTR", 1400), ("DNA", 800), ("LINE", 1600), ("SINE", 160), ("non_LTR", 1400), ("MITE", 160)]
tool_lst = ["RepeatModeler2", "EDTA", "MITE-Tracker", "RepBase"]

# Every row shows this many bins, i.e. the x range is [0, n_bins_shown * bin width].
n_bins_shown = 5

# NOTE(review): user-specific absolute path; consider a path relative to the
# notebook so the cell runs on other machines.
length_dist_svg = "/home/lerat/Téléchargements/length_dist.svg"

fig = make_subplots(
    rows=len(annot_lst),
    cols=1,
    subplot_titles=tuple(order for order, _ in annot_lst),
    vertical_spacing=0.05,
    x_title='Length',
    # The traces use histnorm='percent', so the bars are percentages, not counts.
    y_title='Percent',
)

for row, (order, bin_size) in enumerate(annot_lst, start=1):
    print(order)

    for tool in tool_lst:
        df, color = dict_data[tool]
        lengths = df.loc[df["Annotation"] == order, "Length"]
        print(tool + ":\t" + str(len(lengths)))

        fig.add_trace(
            go.Histogram(
                x=lengths,
                marker_color=color,
                name=tool,
                histnorm='percent',
                # Pin the first bin edge at 0 so the bins coincide with the
                # "0-<size>", "<size>-<2*size>", ... tick labels set below.
                xbins=dict(start=0, size=bin_size),
                # Every library is drawn in every row: only the first row
                # contributes legend entries, so each name appears once.
                showlegend=(row == 1),
            ),
            row=row,
            col=1,
        )

    # One tick at the centre of each displayed bin, labelled with the bin's bounds.
    fig.update_xaxes(
        range=[0, n_bins_shown * bin_size],
        tickvals=[bin_size // 2 + k * bin_size for k in range(n_bins_shown)],
        ticktext=[f"{k * bin_size}-{(k + 1) * bin_size}" for k in range(n_bins_shown)],
        row=row,
        col=1,
    )

fig.update_layout(
    barmode='group',
    autosize=False,
    width=900,
    height=1200,
    #title="distribution of the lengths of the elements by order",
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    # Transparent backgrounds for the exported figure.
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()
fig.write_image(length_dist_svg)

NameError: name 'make_subplots' is not defined

## Redundancy check

In [56]:
# cd-hit .clstr cluster files, one per library (consumed by calc_redundancy)
RM2, MITE, EDTA = (
    "/home/tommaso/TE_Aalb/test/rendundancy_check/consensi_RM2.fasta.clstr",
    "/home/tommaso/TE_Aalb/test/rendundancy_check/consensi_MITE.fasta.clstr",
    "/home/tommaso/TE_Aalb/test/rendundancy_check/consensi_EDTA.fasta.clstr",
)

In [50]:
def calc_redundancy(cdhit):
    """
    Count redundant sequences in a cd-hit-est cluster (.clstr) file.

    A sequence is counted as redundant when it is a non-representative member
    of a cluster, its identity to the representative is reported as
    ``100.00%`` and it has the same length as the representative (the member
    flagged with ``*``).

    Parameters
    ----------
    cdhit : str
        Path to the ``.clstr`` file.

    Returns
    -------
    int
        Number of redundant sequences over all clusters. Clusters without a
        ``*`` member contribute nothing.
    """
    clusters = {}  # key: cluster header, value: list of (length, name, identity)
    cluster_name = None
    members = []

    with open(cdhit, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            if line.startswith(">"):  # beginning of a new cluster
                if members:
                    clusters[cluster_name] = members
                    members = []
                cluster_name = line.replace(">", "")
            else:
                fields = line.split()
                if fields[3] == "*":
                    # Representative sequence of the cluster.
                    identity = "*"
                else:
                    # e.g. "+/100.00%" -> "100.00%"
                    identity = fields[4].split("/")[-1]
                members.append(
                    (fields[1].replace("nt,", ""), fields[2].replace("...", ""), identity)
                )

    # No header follows the last cluster, so it has to be stored explicitly;
    # otherwise its members would never be counted.
    if members:
        clusters[cluster_name] = members

    redundant = 0
    for cluster_members in clusters.values():
        # Length of this cluster's representative; None if it has none, so a
        # length from a previous cluster is never reused.
        center_length = next(
            (length for length, _, identity in cluster_members if identity == "*"),
            None,
        )
        if center_length is None:
            continue
        redundant += sum(
            1
            for length, _, identity in cluster_members
            if identity == "100.00%" and length == center_length
        )
    return redundant

In [57]:
# Redundancy count of each library, computed from its cd-hit cluster file.
RM2_red, EDTA_red, MITE_red = map(calc_redundancy, (RM2, EDTA, MITE))

In [60]:
# Report the value returned by calc_redundancy for each library's cluster file.
print(f"""
RepeatModeler2: {RM2_red}
EDTA: {EDTA_red}
MITE-Tracker: {MITE_red}
""")


RepeatModeler2: 6263
EDTA: 0
MITE-Tracker: 0

