# Dataset Statistics

In [55]:
import os
import pandas
import numpy
from Bio import SeqIO
from biotite.sequence.align import get_sequence_identity
from biotite.sequence.io import fasta

In [6]:
# Base directory (relative path) under which all input files are looked up.
data_dir = "../../data"

# Dataset names; each one is used to build the input file paths.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

In [58]:
def compute_stats(dataset):
    """Summarize sequence lengths and alignment identity for one dataset.

    Sequences are read from ``{data_dir}/{dataset}.fasta.sanitized`` and the
    alignment from
    ``{data_dir}/trees/full/{dataset}/Control with Clustal Omega.fasta``,
    where ``data_dir`` is the notebook-level setting.

    Parameters
    ----------
    dataset : str
        Dataset name used to build both input file paths.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``dataset`` (index name ``"dataset"``) with
        the columns ``median``, ``mean``, ``std``, ``min``, ``max``,
        ``identity`` and ``sample size``.

    Raises
    ------
    ValueError
        If the sequence file contains no records.
    """
    fasta_file = f"{data_dir}/{dataset}.fasta.sanitized"
    seqs_len = [len(record.seq) for record in SeqIO.parse(fasta_file, "fasta")]
    if not seqs_len:
        # Without this guard numpy emits warnings for mean/median/std and then
        # fails in numpy.min with an opaque "zero-size array" ValueError.
        raise ValueError(f"No sequences found in {fasta_file!r}")

    align = fasta.get_alignment(
        fasta.FastaFile.read(f"{data_dir}/trees/full/{dataset}/Control with Clustal Omega.fasta"))
    # Identity as reported by biotite for the alignment (default mode).
    seqs_identity = get_sequence_identity(align)

    result_df = pandas.DataFrame(
        {"median": int(numpy.median(seqs_len)),  # int() drops any fractional part
         "mean": numpy.mean(seqs_len),
         "std": round(numpy.std(seqs_len), 2),  # numpy default ddof=0 (population std)
         "min": numpy.min(seqs_len),
         "max": numpy.max(seqs_len),
         "identity": seqs_identity,
         "sample size": len(seqs_len)}, index=[dataset])
    result_df.index.name = "dataset"
    return result_df

In [59]:
# Build one single-row summary per dataset, then stack them into one table.
dfs = []
for dataset in datasets:
    try:
        dfs.append(compute_stats(dataset))
    except Exception:
        # Name the failing dataset, then re-raise so the traceback is kept.
        # `Exception` (not a bare `except`) so that interrupting the kernel
        # is not reported as a dataset failure.
        print(f"Failed to compute statistics for dataset: {dataset}")
        raise
pandas.concat(dfs)

Unnamed: 0_level_0,median,mean,std,min,max,identity,sample size
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
orthologs_hemoglobin_beta,441,441.0,0.0,441,441,0.750567,15
orthologs_myoglobin,465,465.0,0.0,465,465,0.763441,15
orthologs_neuroglobin,456,451.6,11.43,417,456,0.610422,15
orthologs_cytoglobin,618,596.2,66.25,378,678,0.396465,15
orthologs_androglobin,4929,4726.4,694.56,2148,5004,0.605203,15
