# Comparing Similarity Matrices

In [83]:
import os
import pandas
import numpy
from scipy.spatial import distance_matrix
from Bio import SeqIO

In [126]:
# Sanity check of the distance formula: the control matrix compared with
# itself must give a distance of exactly 0.
path = "../data/trees/indelible"
control_df = pandas.read_csv(path + "/Control with Clustal Omega.csv", index_col=0)

numpy.sqrt(numpy.square(control_df.values - control_df.values).sum())

0.0

In [127]:
def read_and_compare(dataset):
    """Score every matrix of ``dataset`` against the Clustal Omega control.

    Every ``*.csv`` file found in ``../data/trees/{dataset}`` is loaded with
    its first column as row labels and compared with
    ``Control with Clustal Omega.csv`` from the same directory. The score is
    the square root of the summed squared element-wise differences, rounded
    to 5 decimals. The control file itself is part of the scan and therefore
    scores 0.

    When a matrix carries exactly the same row and column labels as the
    control, it is reordered to the control's label order before the
    subtraction, so a different row/column order in the file does not
    inflate the distance. Matrices with other labels are compared
    position by position.

    Parameters
    ----------
    dataset : str
        Name of the sub-directory of ``../data/trees`` to scan. It is also
        used as the row label of the returned frame.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``dataset`` (index name ``"dataset"``) with one
        column per CSV file, named after the file without its ``.csv``
        suffix, ordered by ascending distance.

    Raises
    ------
    ValueError
        If a matrix does not have the same shape as the control.
    """
    path = f"../data/trees/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_rows = set(control_df.index)
    control_cols = set(control_df.columns)

    result_dict = {}
    # Scan in sorted order so the result does not depend on the order in
    # which the filesystem happens to list the directory.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        basename = file[: -len(".csv")]
        df = pandas.read_csv(f"{path}/{file}", index_col=0)

        # Fail with a message that names the file instead of letting numpy
        # broadcast (or reject) mismatched shapes.
        if df.shape != control_df.shape:
            raise ValueError(
                f"{path}/{file} has shape {df.shape}, "
                f"expected {control_df.shape} like the control matrix"
            )

        # Same labels, possibly in another order: line the matrix up with the
        # control before subtracting the raw values.
        same_labels = (
            df.index.is_unique
            and df.columns.is_unique
            and set(df.index) == control_rows
            and set(df.columns) == control_cols
        )
        if same_labels:
            df = df.reindex(index=control_df.index, columns=control_df.columns)

        difference = control_df.values - df.values
        result_dict[basename] = round(numpy.sqrt(numpy.sum(difference ** 2)), 5)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.index.name = "dataset"
    # A stable sort keeps equally distant columns in scan (alphabetical) order.
    return result_df.sort_values(dataset, axis=1, kind="mergesort")

In [128]:
def compute_stats(dataset):
    """Summarise the sequence lengths of one sanitized FASTA file.

    Parses ``../data/{dataset}.fasta.sanitized`` with ``SeqIO.parse`` and
    measures the length of every record's ``seq``.

    Parameters
    ----------
    dataset : str
        Base name of the FASTA file; also used as the row label.

    Returns
    -------
    pandas.DataFrame
        A single row labelled ``dataset`` (index name ``"dataset"``) with the
        columns ``median`` (converted with ``int``), ``mean``, ``std``
        (rounded to 2 decimals), ``min``, ``max`` and ``sample size``.
    """
    lengths = [
        len(record.seq)
        for record in SeqIO.parse(f"../data/{dataset}.fasta.sanitized", "fasta")
    ]

    # Apply the reductions in a fixed order: mean, median, std, min, max.
    mean_len, median_len, std_len, shortest, longest = (
        reduce(lengths)
        for reduce in (numpy.mean, numpy.median, numpy.std, numpy.min, numpy.max)
    )

    summary = {
        "median": int(median_len),
        "mean": mean_len,
        "std": round(std_len, 2),
        "min": shortest,
        "max": longest,
        "sample size": len(lengths),
    }
    stats_df = pandas.DataFrame(summary, index=[dataset])
    stats_df.index.name = "dataset"
    return stats_df

## Indelible

In [129]:
# Name of the sub-directory of ../data/trees that holds this dataset's matrices.
dataset = "indelible"

In [130]:
# Distance of every matrix in the dataset to the Clustal Omega control
# (0 means identical to the control; columns are sorted by ascending distance).
read_and_compare(dataset)

Unnamed: 0_level_0,Control with Clustal Omega,Structural Similarity Index Measure,Universal Quality Index,Local with Smith–Waterman,Global with Needleman-Wunsch,MultiScale Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
indelible,0.0,4.86711,4.93568,8.71999,10.40937,19.48798


## Myoglobin Orthologues

In [136]:
# Orthologue datasets to compare, one sub-directory of ../data/trees each.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

In [137]:
# Build one single-row comparison frame per dataset and stack them into one table.
# NOTE(review): the loop rebinds the notebook-level `dataset` variable, and
# `dfs` is read again by the clipboard cell below.
dfs = []
for dataset in datasets:
    try:
        dfs.append(read_and_compare(dataset))
    except:
        # Show which dataset failed, then re-raise so the error is not hidden.
        print(dataset)
        raise
pandas.concat(dfs)

Unnamed: 0_level_0,Control with Clustal Omega,MultiScale Structural Similarity Index Measure,Local with Smith–Waterman,Global with Needleman-Wunsch,Structural Similarity Index Measure,Universal Quality Index,Deep Search with Annoy
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
orthologs_hemoglobin_beta,0.0,0.15421,0.8937,0.90854,1.64049,1.79441,3.40182
orthologs_myoglobin,0.0,0.09536,0.9044,0.91958,1.78454,1.91983,3.46468
orthologs_neuroglobin,0.0,1.21763,1.15143,1.80429,4.34839,4.3852,4.95023
orthologs_cytoglobin,0.0,2.77428,3.10951,6.34167,8.01781,8.25613,8.37265
orthologs_androglobin,0.0,4.02214,2.96006,8.28549,9.76396,10.04081,8.60069


In [138]:
# Copy the combined comparison table to the system clipboard.
# NOTE(review): depends on `dfs` as left by the previous cell, and presumably
# needs a clipboard backend on the machine running the kernel — confirm before
# running the notebook headless.
pandas.concat(dfs).to_clipboard()

### Dataset Statistics

In [134]:
# Orthologue datasets to summarise, one ../data/<name>.fasta.sanitized file each.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

In [135]:
# Build one single-row statistics frame per dataset and stack them into one table.
# NOTE(review): reuses the names `dfs` and `dataset` that the similarity
# comparison cells also assign, so the result depends on execution order.
dfs = []
for dataset in datasets:
    try:
        dfs.append(compute_stats(dataset))
    except:
        # Show which dataset failed, then re-raise so the error is not hidden.
        print(dataset)
        raise
pandas.concat(dfs)

Unnamed: 0_level_0,median,mean,std,min,max,sample size
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
orthologs_hemoglobin_beta,441,441.0,0.0,441,441,15
orthologs_myoglobin,465,465.0,0.0,465,465,15
orthologs_neuroglobin,456,451.6,11.43,417,456,15
orthologs_cytoglobin,618,596.2,66.25,378,678,15
orthologs_androglobin,4929,4726.4,694.56,2148,5004,15
