# Dataset Statistics

In [1]:
import os
import pandas
import numpy
from Bio import SeqIO
from biotite.sequence.align import get_sequence_identity
from biotite.sequence.io import fasta
from itertools import combinations

In [2]:
# Base directory (relative path) under which the dataset files are looked up.
data_dir = "../../data"

# Names of the datasets to summarise, one per line so additions diff cleanly.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
    "indelible",
]

In [3]:
def lcs(first, second):
    """Return the longest common *substring* (contiguous run) of two records.

    Parameters
    ----------
    first, second
        Objects exposing a ``.seq`` attribute (e.g. the records yielded by
        ``SeqIO.parse``). Each sequence is converted to ``str`` once up front.

    Returns
    -------
    str
        The longest run of characters that occurs contiguously in both
        sequences, taken from ``first``. When several runs share the maximum
        length, the one that ends earliest in ``first`` is returned. An empty
        string is returned when the sequences share no character.

    Notes
    -----
    Classic dynamic programme: ``curr[j + 1]`` is the length of the common
    suffix of ``a[:i + 1]`` and ``b[:j + 1]``. Only the previous row is ever
    read, so two rows are kept instead of the full (m+1) x (n+1) table; time
    stays O(m * n) while memory drops to O(n).
    """
    a = str(first.seq)
    b = str(second.seq)
    width = len(b) + 1

    prev = [0] * width
    longest = 0
    end = 0  # exclusive end index in `a` of the best match found so far
    for i, char_a in enumerate(a):
        # A fresh zeroed row: mismatching cells must read as 0.
        curr = [0] * width
        for j, char_b in enumerate(b):
            if char_a == char_b:
                run = prev[j] + 1
                curr[j + 1] = run
                # Strict comparison keeps the first maximal run encountered.
                if run > longest:
                    longest = run
                    end = i + 1
        prev = curr
    return a[end - longest:end]

In [4]:
def repeats(s, k=3):
    """Return the highest occurrence count of any in-frame k-mer of ``s``.

    The sequence is cut into consecutive, non-overlapping chunks of length
    ``k`` starting at offset 0 (codons for the default ``k=3``). For each
    distinct chunk, its non-overlapping occurrences anywhere in the sequence
    (any frame) are counted with ``str.count``; the largest count is returned.

    Parameters
    ----------
    s
        Object exposing a ``.seq`` attribute (e.g. a record from
        ``SeqIO.parse``).
    k : int, default 3
        Chunk length.

    Returns
    -------
    int
        Maximum count over all complete chunks, or 0 when the sequence holds
        no complete chunk (empty or shorter than ``k``).

    Notes
    -----
    A trailing fragment shorter than ``k`` is ignored: it is not a k-mer, and
    counting a 1- or 2-character fragment would inflate the result (e.g. a
    lone trailing "A" would match every "A" in the sequence).
    """
    seq = str(s.seq)
    # Stop at len - k so only complete chunks are taken; the set avoids
    # recounting a k-mer that appears in several in-frame positions.
    kmers = {seq[i:i + k] for i in range(0, len(seq) - k + 1, k)}
    return max((seq.count(kmer) for kmer in kmers), default=0)

In [5]:
def _describe(values, suffix):
    """Return median/mean/std/min/max of ``values`` keyed as ``<stat>_<suffix>``."""
    return {
        f"median_{suffix}": numpy.median(values),
        f"mean_{suffix}": numpy.mean(values),
        f"std_{suffix}": numpy.std(values),
        f"min_{suffix}": numpy.min(values),
        f"max_{suffix}": numpy.max(values),
    }


def compute_stats(dataset):
    """Build a one-row DataFrame of summary statistics for one dataset.

    Reads ``{data_dir}/{dataset}.fasta.sanitized`` for the raw sequences and
    ``{data_dir}/trees/full/{dataset}/Control with Clustal Omega.fasta`` for
    the control alignment.

    Parameters
    ----------
    dataset : str
        Dataset name; used to build both file paths and as the row label.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``dataset`` (index name ``"dataset"``) with:

        * ``median``/``mean``/``std``/``min``/``max`` - sequence lengths
          (``median`` is truncated to ``int``, ``std`` rounded to 2 places);
        * ``identity`` - value returned by ``get_sequence_identity`` for the
          control alignment;
        * ``*_rep`` - statistics of ``repeats(s)`` over all sequences;
        * ``*_lcs`` - statistics of ``len(lcs(s1, s2))`` over all unordered
          sequence pairs;
        * ``with_gaps`` - number of aligned sequences containing at least one
          ``"-"``; ``*_gaps`` - statistics of the per-sequence ``"-"`` count;
        * ``sample size`` - number of sequences.

    Notes
    -----
    The numpy reductions raise ``ValueError`` on empty input, so a dataset
    with fewer than two sequences (no pairs) cannot be summarised.
    """
    seqs = list(SeqIO.parse(f"{data_dir}/{dataset}.fasta.sanitized", "fasta"))
    seqs_len = [len(s.seq) for s in seqs]
    repeats_len = [repeats(s) for s in seqs]
    lcs_seqs = [len(lcs(s1, s2)) for (s1, s2) in combinations(seqs, 2)]

    # Parse the control alignment once and reuse it for both the identity
    # computation and the per-sequence gap counts.
    alignment_file = fasta.FastaFile.read(
        f"{data_dir}/trees/full/{dataset}/Control with Clustal Omega.fasta")
    seqs_identity = get_sequence_identity(fasta.get_alignment(alignment_file))
    gaps = [seq.count("-") for seq in alignment_file.values()]

    # Dict insertion order defines the column order of the resulting frame.
    stats = {
        "median": int(numpy.median(seqs_len)),
        "mean": numpy.mean(seqs_len),
        "std": round(numpy.std(seqs_len), 2),
        "min": numpy.min(seqs_len),
        "max": numpy.max(seqs_len),
        "identity": seqs_identity,
    }
    stats.update(_describe(repeats_len, "rep"))
    stats.update(_describe(lcs_seqs, "lcs"))
    stats["with_gaps"] = sum(1 for g in gaps if g)
    stats.update(_describe(gaps, "gaps"))
    stats["sample size"] = len(seqs_len)

    result_df = pandas.DataFrame(stats, index=[dataset])
    result_df.index.name = "dataset"
    return result_df

In [6]:
# Collect one single-row statistics frame per dataset, then stack them.
dfs = []
for dataset in datasets:
    try:
        dfs.append(compute_stats(dataset))
    except BaseException:
        # Name the dataset that failed, then let the original error propagate
        # unchanged so the traceback is still shown.
        print(dataset)
        raise

metrics_df = pandas.concat(dfs)

In [7]:
# Remove pandas' cap on displayed columns so no statistics column is elided
# from the rendered table below.
pandas.options.display.max_columns = None
metrics_df

Unnamed: 0_level_0,median,mean,std,min,max,identity,median_rep,mean_rep,std_rep,min_rep,max_rep,median_lcs,mean_lcs,std_lcs,min_lcs,max_lcs,with_gaps,median_gaps,mean_gaps,std_gaps,min_gaps,max_gaps,sample size
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
orthologs_hemoglobin_beta,441,441.0,0.0,441,441,0.750567,25.0,24.533333,1.257864,21,26,88.0,98.419048,88.533532,30,441,0,0.0,0.0,0.0,0,0,15
orthologs_myoglobin,465,465.0,0.0,465,465,0.763441,22.0,21.533333,1.203698,19,23,65.0,74.247619,61.949101,22,465,0,0.0,0.0,0.0,0,0,15
orthologs_neuroglobin,456,451.6,11.43,417,456,0.610422,24.0,23.133333,1.820867,19,25,77.0,107.828571,92.300901,32,456,2,0.0,4.4,11.429786,0,39,15
orthologs_cytoglobin,618,596.2,66.25,378,678,0.396465,25.0,24.333333,1.813529,20,27,113.0,125.209524,70.959102,20,404,15,66.0,87.8,66.249226,6,306,15
orthologs_androglobin,4929,4726.4,694.56,2148,5004,0.605203,190.0,179.0,31.385772,64,194,130.0,175.838095,194.554393,43,1477,15,125.0,327.6,694.563057,50,2906,15
indelible,3000,3000.0,0.0,3000,3000,0.0,64.0,63.375,3.336821,57,71,11.0,11.365385,1.056643,10,17,40,3302.0,3302.0,0.0,3302,3302,40
