# Dataset Statistics

In [1]:
import os
import pandas
import numpy
from Bio import SeqIO
from biotite.sequence.align import get_sequence_identity
from biotite.sequence.io import fasta
from itertools import combinations
# Lift pandas' column display limit so the wide statistics tables built below
# are shown in full rather than with the middle columns elided.
pandas.set_option('display.max_columns', None)

In [2]:
# Base directory that every input path below is built from. It is a relative
# path, so it is resolved against the notebook's working directory.
data_dir = "../../data"

# Names of the datasets to summarise; each name is interpolated into the
# FASTA and alignment paths read by the statistics functions.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
    "indelible",
]

In [3]:
def lcs(first, second):
    """Return the longest common *substring* of two sequence records.

    "Substring" means a contiguous run of characters present in both
    sequences (this is not the longest common subsequence).

    Args:
        first: Object exposing the sequence as a ``.seq`` attribute.
        second: Object exposing the sequence as a ``.seq`` attribute.

    Returns:
        The longest shared contiguous run as a ``str``. When several runs
        share the maximal length, the one that ends earliest in ``first``
        (and, for equal ends, earliest in ``second``) is returned. An empty
        string is returned when the sequences share no character.
    """
    # Convert once: avoids re-reading ``.seq`` and indexing the sequence
    # object for every cell of the comparison.
    a = str(first.seq)
    b = str(second.seq)

    # prev[j + 1] holds the length of the common run ending at a[i - 1] and
    # b[j]. Only the previous row is ever read, so two rows are enough and
    # memory stays O(len(b)) instead of O(len(a) * len(b)).
    prev = [0] * (len(b) + 1)
    longest = 0
    end = 0  # exclusive end index in ``a`` of the best run found so far
    for i, ch_a in enumerate(a):
        cur = [0] * (len(b) + 1)
        for j, ch_b in enumerate(b):
            if ch_a == ch_b:
                run = prev[j] + 1
                cur[j + 1] = run
                # Strict comparison keeps the first maximal run encountered.
                if run > longest:
                    longest = run
                    end = i + 1
        prev = cur
    return a[end - longest:end]

In [4]:
def repeats(s, k=3):
    """Return the highest occurrence count among the k-mers of a sequence.

    The sequence is cut into consecutive, non-overlapping chunks of ``k``
    characters starting at position 0. For each chunk, the number of
    non-overlapping occurrences of that chunk anywhere in the sequence is
    counted, and the largest count is returned.

    Args:
        s: Object exposing the sequence as a ``.seq`` attribute.
        k: Chunk length; defaults to 3.

    Returns:
        The maximum occurrence count, or 0 when the sequence is shorter
        than ``k`` (including empty).
    """
    text = str(s.seq)
    # Stop at the last *complete* chunk. A trailing fragment shorter than k
    # (when the length is not a multiple of k) is not a k-mer, and counting
    # a 1- or 2-character fragment as a substring would inflate the result.
    # The set makes sure each distinct k-mer is counted only once.
    kmers = {text[i:i + k] for i in range(0, len(text) - k + 1, k)}
    return max((text.count(kmer) for kmer in kmers), default=0)

## Nucleotide

In [9]:
def compute_stats(dataset):
    """Summarise the nucleotide sequences of one dataset as a one-row table.

    Reads ``{data_dir}/{dataset}.fasta.N.sanitized`` for the raw sequences
    and ``{data_dir}/trees/N/full/{dataset}/Control with Clustal Omega.fasta``
    for the alignment used by the identity and gap statistics.

    Args:
        dataset: Dataset name; used to build both paths and as the row label.

    Returns:
        A ``pandas.DataFrame`` with a single row indexed by ``dataset``
        (index name ``"dataset"``) holding:

        * ``median``/``mean``/``std``/``min``/``max`` of the sequence lengths
          (median truncated to ``int``, std rounded to 2 decimals);
        * ``identity`` as returned by ``get_sequence_identity`` on the
          alignment;
        * ``*_rep``: the same five statistics over ``repeats(s)`` per sequence;
        * ``*_lcs``: the same five statistics over ``len(lcs(s1, s2))`` for
          every unordered pair of sequences;
        * ``with_gaps``: number of aligned sequences containing at least one
          ``"-"``, and ``*_gaps``: the five statistics over the per-sequence
          ``"-"`` counts;
        * ``sample size``: number of sequences in the raw FASTA file.

    Raises:
        ValueError: from ``numpy.min``/``numpy.max`` when one of the value
            lists is empty (e.g. fewer than two sequences, so no pairs).
    """
    seqs = list(SeqIO.parse(f"{data_dir}/{dataset}.fasta.N.sanitized", "fasta"))
    seqs_len = [len(s.seq) for s in seqs]
    repeats_len = [repeats(s) for s in seqs]
    lcs_seqs = [len(lcs(s1, s2)) for s1, s2 in combinations(seqs, 2)]

    # Read the control alignment once and reuse the parsed file for both the
    # identity score and the per-sequence gap counts.
    alignment_file = fasta.FastaFile.read(
        f"{data_dir}/trees/N/full/{dataset}/Control with Clustal Omega.fasta")
    seqs_identity = get_sequence_identity(fasta.get_alignment(alignment_file))
    gaps = [aligned.count("-") for aligned in alignment_file.values()]

    def spread(values, suffix):
        # Five summary statistics over ``values``, keyed "<stat>_<suffix>".
        return {
            f"median_{suffix}": numpy.median(values),
            f"mean_{suffix}": numpy.mean(values),
            f"std_{suffix}": numpy.std(values),
            f"min_{suffix}": numpy.min(values),
            f"max_{suffix}": numpy.max(values),
        }

    # Insertion order of this dict is the column order of the result.
    row = {
        "median": int(numpy.median(seqs_len)),
        "mean": numpy.mean(seqs_len),
        "std": round(numpy.std(seqs_len), 2),
        "min": numpy.min(seqs_len),
        "max": numpy.max(seqs_len),
        "identity": seqs_identity,
        **spread(repeats_len, "rep"),
        **spread(lcs_seqs, "lcs"),
        "with_gaps": sum(1 for g in gaps if g),
        **spread(gaps, "gaps"),
        "sample size": len(seqs_len),
    }
    result_df = pandas.DataFrame(row, index=[dataset])
    result_df.index.name = "dataset"
    return result_df

In [None]:
# Compute one statistics row per dataset and stack the rows into one table.
dfs = []
for dataset in datasets:
    try:
        dfs.append(compute_stats(dataset))
    except:
        # Print the name of the dataset that failed, then re-raise so the
        # original exception and traceback are preserved.
        print(dataset)
        raise
# Trailing expression of the cell: the combined table is the cell's output.
pandas.concat(dfs)

## Protein

In [None]:
def compute_protein_stats(dataset):
    """Summarise the protein sequences of one dataset as a one-row table.

    Reads ``{data_dir}/{dataset}.fasta.P.sanitized`` and reports the
    median/mean/std/min/max of the sequence lengths (median truncated to
    ``int``, std rounded to 2 decimals), the same five statistics over
    ``len(lcs(s1, s2))`` for every unordered pair of sequences (``*_lcs``
    columns), and the number of sequences (``sample size``).

    Args:
        dataset: Dataset name; used to build the path and as the row label.

    Returns:
        A ``pandas.DataFrame`` with a single row indexed by ``dataset`` and
        the index named ``"dataset"``.
    """
    records = list(SeqIO.parse(f"{data_dir}/{dataset}.fasta.P.sanitized", "fasta"))
    lengths = [len(record.seq) for record in records]
    pair_lcs = [len(lcs(left, right)) for left, right in combinations(records, 2)]

    # Insertion order of this dict is the column order of the result.
    row = {
        "median": int(numpy.median(lengths)),
        "mean": numpy.mean(lengths),
        "std": round(numpy.std(lengths), 2),
        "min": numpy.min(lengths),
        "max": numpy.max(lengths),
        "median_lcs": numpy.median(pair_lcs),
        "mean_lcs": numpy.mean(pair_lcs),
        "std_lcs": numpy.std(pair_lcs),
        "min_lcs": numpy.min(pair_lcs),
        "max_lcs": numpy.max(pair_lcs),
        "sample size": len(lengths),
    }
    return pandas.DataFrame(row, index=[dataset]).rename_axis("dataset")

In [None]:
# Compute one protein statistics row per dataset and stack them into one table.
dfs = []
for dataset in datasets:
    try:
        dfs.append(compute_protein_stats(dataset))
    except:
        # Print the name of the dataset that failed, then re-raise so the
        # original exception and traceback are preserved.
        print(dataset)
        raise
# Trailing expression of the cell: the combined table is the cell's output.
pandas.concat(dfs)