# Repeated fraction of the genome

Estimation of the repeated fraction of the *Aedes albopictus* genome, based on the number of sequences in each cluster (from the results of `cd-hit-est`).

In [1]:
import plotly.express as px

In [2]:
def parse_cdhit(cdhit):
    """
    Parse a cd-hit-est cluster file (``.clstr``).

    A line starting with ">" opens a new cluster. Every other non-blank line
    describes one member sequence, whose ID is the text between ">" and "...".
    The member line ending with "*" is the representative sequence of the
    cluster.

    Parameters
    ----------
    cdhit : str
        Path to the cluster file.

    Returns
    -------
    dict
        Key: representative sequence ID of a cluster; value: list of the IDs
        of the other sequences in that cluster (empty for a singleton).
        Clusters in which no representative line was found are skipped.
    """

    dico = {}  # key: representative seq, value: list of the other sequences in the cluster
    key = None  # representative of the cluster being read (None until seen)
    list_seq = []  # non-representative sequences of the cluster being read

    with open(cdhit, "r") as f:
        for line in f:
            # Strip the line ending so the last line of the file is handled
            # the same way whether or not it ends with a newline.
            line = line.rstrip()
            if not line:
                continue
            if line.startswith(">"):  # begin of a new cluster
                if key is not None:  # store the cluster that just ended
                    dico[key] = list_seq
                key = None
                list_seq = []
            else:  # within the cluster
                seq_name = line.split(">")[1].split("...")[0]
                if line.endswith("*"):  # representative sequence of the cluster
                    key = seq_name
                else:  # other sequences
                    list_seq.append(seq_name)

    # No header follows the last cluster, so it must be stored explicitly.
    if key is not None:
        dico[key] = list_seq
    return dico

In [4]:
# Parse the cd-hit-est cluster file into {representative ID: [IDs of the other cluster members]}.
cdhit_dict = parse_cdhit("../test/cdhit/consensi.fasta.clstr")

In [5]:
# Cumulative cluster counts: nb_clst_dict["k"] is the number of clusters that
# hold at least k sequences, for k from 1 to 20 (keys are strings).
nb_clst_dict = {str(i): 0 for i in range(1, 21)}

for clst, other_seqs in cdhit_dict.items():
    # The representative sequence (the dict key) belongs to the cluster too.
    clst_size = len(other_seqs) + 1
    # A cluster of size n meets every threshold from 1 up to n, capped at 20.
    for i in range(1, min(clst_size, 20) + 1):
        nb_clst_dict[str(i)] += 1

In [14]:
# Plot, for each minimal cluster size "1".."20" (x), the number of clusters
# reaching at least that size (y), as computed in nb_clst_dict.
fig = px.histogram(x=nb_clst_dict.keys(), y=nb_clst_dict.values(), title="Nb of cluster by a minimal number of sequences", nbins=20)
# NOTE(review): the y-axis tick values are hard-coded; presumably they are
# cluster counts taken from one specific run — update them if the input changes.
fig['layout']['yaxis'].update(tickvals = [437731, 160751, 87058, 60873, 35653, 14178])
# Fully transparent paper/plot backgrounds (alpha = 0) and axis titles.
fig.update_layout(
    paper_bgcolor = 'rgba(0,0,0,0)',
    plot_bgcolor = 'rgba(0,0,0,0)',
    xaxis_title = "Minimal nb of sequences",
    yaxis_title = "Nb of clusters"
)
# Opaque blue horizontal grid lines, drawn at the y-axis ticks.
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='rgba(0, 0, 255, 1)')
fig.show()