In [1]:
import pandas
import os
import numpy
from IPython.core.display import display, HTML

In [2]:
def read_and_compare(dataset: str, channel: str = "full") -> "pandas.DataFrame":
    """Measure how far every matrix of a dataset lies from the control matrix.

    All ``*.csv`` files in ``../data/trees/{channel}/{dataset}`` are loaded
    (first column used as the index) and compared with
    ``Control with Clustal Omega.csv`` from the same directory.  The distance is
    the square root of the summed squared element-wise differences (the
    Frobenius norm of the difference), rounded to 4 decimals.

    The comparison is positional (``.values``): row/column labels are not
    aligned, so every file is expected to list its entries in the same order
    as the control file.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory; also used as the single row label.
    channel : str, default "full"
        Name of the channel directory.  The column produced by
        ``MultiScale Structural Similarity Index Measure.csv`` is renamed to
        this value.

    Returns
    -------
    pandas.DataFrame
        One row (index name ``"dataset"``, label ``dataset``) with one column
        per CSV file, named after the file without its ``.csv`` suffix.
        Columns follow the sorted file names.

    Raises
    ------
    FileNotFoundError
        If the directory or the control CSV does not exist.
    ValueError
        If a matrix cannot be broadcast against the control matrix.
    """
    path = f"../data/trees/{channel}/{dataset}"
    control_file = "Control with Clustal Omega.csv"
    control_df = pandas.read_csv(f"{path}/{control_file}", index_col=0)

    # os.listdir() yields entries in arbitrary order; sorting keeps the column
    # order of the result reproducible from run to run and machine to machine.
    csv_files = sorted(name for name in os.listdir(path) if name.endswith(".csv"))

    result_dict = {}
    for file in csv_files:
        basename = ".".join(file.split(".")[0:-1])
        # The control frame is already in memory; no need to parse it twice.
        if file == control_file:
            df = control_df
        else:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        result_dict[basename] = round(
            numpy.sqrt(numpy.sum((control_df.values - df.values) ** 2)), 4
        )

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # Non-mutating rename: label the image-based column by the channel it came from.
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    result_df.index.name = "dataset"
    return result_df

In [13]:
# Channel directory names looked up under ../data/trees/.
channels = (
    "full",
    "red",
    "green",
    "blue",
    "red_green",
    "red_blue",
    "green_blue",
)

# Dataset directory names looked up inside each channel directory.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

In [14]:
# One table per channel: the per-dataset rows of read_and_compare stacked vertically.
sum_dfs = [
    pandas.concat([read_and_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]

# Put the per-channel tables side by side.  drop_duplicates() works on rows, so
# transpose first to discard columns whose values repeat an earlier column,
# then transpose back.
pandas.concat(sum_dfs, axis=1).T.drop_duplicates().T

Unnamed: 0_level_0,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,0.1542,0.8342,0.7896,0.8,0.4849,0.4957,0.4511
orthologs_myoglobin,0.0,0.9196,0.9044,0.0954,0.8046,0.7975,0.7811,0.451,0.4347,0.4279
orthologs_neuroglobin,0.0,1.8043,1.1514,1.2176,0.9789,0.9323,0.9554,0.6618,0.6385,0.652
orthologs_cytoglobin,0.0,6.3417,3.1095,2.7743,1.8149,1.7804,1.7816,1.5191,1.5174,1.5304
orthologs_androglobin,0.0,8.2855,2.9601,4.0221,1.0736,1.0347,1.0623,2.4228,2.4578,2.4023


In [15]:
def read_and_compare_channels(dataset: str, channel_names=None) -> "pandas.DataFrame":
    """Build a channel-by-channel distance table for one dataset.

    For every channel, ``MultiScale Structural Similarity Index Measure.csv``
    is read from ``../data/trees/{channel}/{dataset}`` (first column used as
    the index).  Each pair of channels is then compared with the square root
    of the summed squared element-wise differences (the Frobenius norm of the
    difference), rounded to 4 decimals.

    The comparison is positional (``.values``): row/column labels are not
    aligned, so all channels are expected to list their entries in the same
    order.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory.
    channel_names : iterable of str, optional
        Channels to compare.  When omitted, the notebook-level ``channels``
        sequence is used, which is what the function relied on before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        Square table whose rows and columns are the channel names, in the
        order given; index name is ``"dataset"``.

    Raises
    ------
    FileNotFoundError
        If a channel's CSV file does not exist.
    ValueError
        If two matrices cannot be broadcast against each other.
    """
    if channel_names is None:
        # Fall back to the notebook-level default so existing calls keep working.
        channel_names = channels

    # Load each channel once and keep only the raw arrays for the comparison.
    matrices = {}
    for channel in channel_names:
        path = f"../data/trees/{channel}/{dataset}"
        matrices[channel] = pandas.read_csv(
            f"{path}/MultiScale Structural Similarity Index Measure.csv", index_col=0
        ).values

    result_dict = {}
    for row_channel, row_values in matrices.items():
        result_dict[row_channel] = {
            col_channel: round(numpy.sqrt(numpy.sum((row_values - col_values) ** 2)), 4)
            for col_channel, col_values in matrices.items()
        }

    result_df = pandas.DataFrame.from_dict(result_dict, orient='index')
    # NOTE(review): the rows are channels, not datasets; the label is left as
    # "dataset" so the rendered tables stay as they were — confirm whether it
    # should read "channel".
    result_df.index.name = "dataset"
    return result_df

In [16]:
# For each dataset, show a centred heading and then its channel-vs-channel table.
for dataset in datasets:
    title_html = f"<center><h3>{dataset}</h3></center>"
    display(HTML(title_html))
    channel_table = read_and_compare_channels(dataset)
    display(channel_table)

Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
full,0.0,0.6914,0.6469,0.6569,0.3408,0.3508,0.3064
red,0.6914,0.0,0.0479,0.0384,0.3508,0.3408,0.3857
green,0.6469,0.0479,0.0,0.0188,0.3064,0.2968,0.3408
blue,0.6569,0.0384,0.0188,0.0,0.3168,0.3064,0.3508
red_green,0.3408,0.3508,0.3064,0.3168,0.0,0.0188,0.0384
red_blue,0.3508,0.3408,0.2968,0.3064,0.0188,0.0,0.0479
green_blue,0.3064,0.3857,0.3408,0.3508,0.0384,0.0479,0.0


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
full,0.0,0.7273,0.7197,0.7035,0.3718,0.3558,0.348
red,0.7273,0.0,0.0237,0.0267,0.3558,0.3718,0.3798
green,0.7197,0.0237,0.0,0.0285,0.348,0.3653,0.3718
blue,0.7035,0.0267,0.0285,0.0,0.3323,0.348,0.3558
red_green,0.3718,0.3558,0.348,0.3323,0.0,0.0285,0.0267
red_blue,0.3558,0.3718,0.3653,0.348,0.0285,0.0,0.0237
green_blue,0.348,0.3798,0.3718,0.3558,0.0267,0.0237,0.0


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
full,0.0,1.7868,1.7258,1.769,0.8719,0.915,0.854
red,1.7868,0.0,0.062,0.027,0.915,0.8719,0.933
green,1.7258,0.062,0.0,0.0476,0.854,0.8109,0.8719
blue,1.769,0.027,0.0476,0.0,0.8975,0.854,0.915
red_green,0.8719,0.915,0.854,0.8975,0.0,0.0476,0.027
red_blue,0.915,0.8719,0.8109,0.854,0.0476,0.0,0.062
green_blue,0.854,0.933,0.8719,0.915,0.027,0.062,0.0


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
full,0.0,3.5723,3.5267,3.5294,1.7849,1.7875,1.7421
red,3.5723,0.0,0.0605,0.0616,1.7875,1.7849,1.8311
green,3.5267,0.0605,0.0,0.0217,1.7421,1.7395,1.7849
blue,3.5294,0.0616,0.0217,0.0,1.745,1.7421,1.7875
red_green,1.7849,1.7875,1.7421,1.745,0.0,0.0217,0.0616
red_blue,1.7875,1.7849,1.7395,1.7421,0.0217,0.0,0.0605
green_blue,1.7421,1.8311,1.7849,1.7875,0.0616,0.0605,0.0


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
full,0.0,3.2925,3.3489,3.3153,1.663,1.6295,1.6859
red,3.2925,0.0,0.0593,0.0262,1.6295,1.663,1.6067
green,3.3489,0.0593,0.0,0.0359,1.6859,1.7195,1.663
blue,3.3153,0.0262,0.0359,0.0,1.6523,1.6859,1.6295
red_green,1.663,1.6295,1.6859,1.6523,0.0,0.0359,0.0262
red_blue,1.6295,1.663,1.7195,1.6859,0.0359,0.0,0.0593
green_blue,1.6859,1.6067,1.663,1.6295,0.0262,0.0593,0.0
