# Comparing channels

- R: sequential
- G: reverse complementary
- B: no match at all

In [76]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [77]:
# Root directory of the generated results; the functions in this notebook read
# from "<tree_path>/<channel>/<dataset>/".
tree_path = "../../../data/trees"

In [78]:
def read_and_compare(dataset, channel="full"):
    """Measure how far every matrix of a dataset is from the control matrix.

    Each ``.csv`` file found in ``{tree_path}/{channel}/{dataset}`` is loaded
    (first column used as the index) and compared with
    ``Control with Clustal Omega.csv`` from the same directory. The score is
    the square root of the sum of squared element-wise differences, rounded to
    4 decimals. Values are compared by position (``.values``); row and column
    labels are not aligned.

    Args:
        dataset: Name of the dataset sub-directory; also used as the label of
            the single result row.
        channel: Name of the channel sub-directory. The column produced from
            ``MultiScale Structural Similarity Index Measure.csv`` is renamed
            to this value.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is named ``"dataset"`` and
        that has one column per CSV file (file name without extension), in
        sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order of the result reproducible across machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        basename = os.path.splitext(file)[0]
        df = pandas.read_csv(f"{path}/{file}", index_col=0)
        squared_error = (control_df.values - df.values) ** 2
        result_dict[basename] = round(numpy.sqrt(numpy.sum(squared_error)), 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    result_df.index.name = "dataset"
    return result_df

## Euclidean Distance from distance matrices

In [80]:
# Euclidean distance to the control matrix for every channel and dataset.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,1.299,0.449,0.3112,0.3338,0.489,0.4654,0.5317,1.1881,1.3925,1.3197,1.1396,1.2473
orthologs_myoglobin,0.0,0.9196,0.9044,1.7162,0.6623,0.2393,0.1566,0.7114,0.7984,0.7786,1.669,1.6106,1.8742,1.152,2.0703
orthologs_neuroglobin,0.0,1.8043,1.1514,3.6473,0.9806,0.832,0.5494,1.7328,1.392,2.6245,1.8423,4.8814,4.1627,1.7182,4.6535
orthologs_cytoglobin,0.0,6.3417,3.1095,7.7337,3.8332,1.8517,1.4808,4.2137,3.6954,5.0576,5.6928,9.6539,8.0081,3.2222,8.4934
orthologs_androglobin,0.0,8.2855,2.9601,1.4132,1.5084,1.4267,1.4327,1.4177,1.4237,1.4222,1.4911,1.4839,1.4994,1.5366,1.2119


In [56]:
# Euclidean distance to the control matrix: full image vs. single colour channels.
channels = (
    "full",
    "red",
    "green",
    "blue",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,1.299,0.449,0.3112,0.3338
orthologs_myoglobin,0.0,0.9196,0.9044,1.7162,0.6623,0.2393,0.1566
orthologs_neuroglobin,0.0,1.8043,1.1514,3.6473,0.9806,0.832,0.5494
orthologs_cytoglobin,0.0,6.3417,3.1095,7.7337,3.8332,1.8517,1.4808
orthologs_androglobin,0.0,8.2855,2.9601,1.4132,1.5084,1.4267,1.4327


In [57]:
# Euclidean distance to the control matrix: full image vs. gray-scale variants.
channels = (
    "full",
    "gray_r",
    "gray_g",
    "gray_b",
    "gray_max",
    "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,1.299,1.1881,1.3925,1.3197,1.1396,1.2473
orthologs_myoglobin,0.0,0.9196,0.9044,1.7162,1.669,1.6106,1.8742,1.152,2.0703
orthologs_neuroglobin,0.0,1.8043,1.1514,3.6473,1.8423,4.8814,4.1627,1.7182,4.6535
orthologs_cytoglobin,0.0,6.3417,3.1095,7.7337,5.6928,9.6539,8.0081,3.2222,8.4934
orthologs_androglobin,0.0,8.2855,2.9601,1.4132,1.4911,1.4839,1.4994,1.5366,1.2119


In [58]:
# Euclidean distance to the control matrix: full image vs. two-channel combinations.
channels = (
    "full",
    "red_green",
    "red_blue",
    "green_blue",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,1.299,0.489,0.4654,0.5317
orthologs_myoglobin,0.0,0.9196,0.9044,1.7162,0.7114,0.7984,0.7786
orthologs_neuroglobin,0.0,1.8043,1.1514,3.6473,1.7328,1.392,2.6245
orthologs_cytoglobin,0.0,6.3417,3.1095,7.7337,4.2137,3.6954,5.0576
orthologs_androglobin,0.0,8.2855,2.9601,1.4132,1.4177,1.4237,1.4222


In [59]:
def read_and_linear_correlate(dataset, channel="full"):
    """Correlate every matrix of a dataset with the control matrix.

    Each ``.csv`` file found in ``{tree_path}/{channel}/{dataset}`` is loaded
    (first column used as the index), flattened, and correlated with the
    flattened ``Control with Clustal Omega.csv`` matrix from the same
    directory. Despite the function name, the statistic is the Spearman rank
    correlation coefficient (``scipy.stats.spearmanr``), rounded to 4 decimals.

    Args:
        dataset: Name of the dataset sub-directory; also used as the label of
            the single result row.
        channel: Name of the channel sub-directory. The column produced from
            ``MultiScale Structural Similarity Index Measure.csv`` is renamed
            to this value.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is named ``"dataset"`` and
        that has one column per CSV file (file name without extension), in
        sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_values = control_df.values.flatten()
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order of the result reproducible across machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        basename = os.path.splitext(file)[0]
        df = pandas.read_csv(f"{path}/{file}", index_col=0)
        # Element 0 of the spearmanr result is the correlation coefficient.
        coefficient = spearmanr(control_values, df.values.flatten())[0]
        result_dict[basename] = round(coefficient, 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    result_df.index.name = "dataset"
    return result_df

## Spearman rank correlation from distance matrices

In [61]:
# Correlation with the control matrix for every channel and dataset.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_linear_correlate(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
orthologs_hemoglobin_beta,1.0,0.9999,0.9998,0.9937,0.995,0.988,0.9937,0.9918,0.9944,0.9932,0.9935,0.988,0.9937,0.9421
orthologs_myoglobin,1.0,0.9999,0.9998,0.9828,0.9787,0.9848,0.986,0.9795,0.982,0.9864,0.9736,0.9848,0.986,0.9592
orthologs_neuroglobin,1.0,0.9999,0.9992,0.9911,0.9903,0.9866,0.9912,0.9905,0.9912,0.9902,0.9901,0.9866,0.9912,0.9677
orthologs_cytoglobin,1.0,0.7775,0.8623,0.558,0.4682,0.4381,0.558,0.568,0.5216,0.5629,0.4893,0.4381,0.558,0.4826
orthologs_androglobin,1.0,0.9054,0.9769,-0.3708,0.7399,-0.5284,-0.5655,-0.4011,-0.4598,-0.4857,0.7434,0.4765,0.7784,0.2153


In [62]:
def read_and_correlate(dataset, channel="full"):
    """Compute the Predictive Power Score of the control matrix for every matrix.

    Each ``.csv`` file found in ``{tree_path}/{channel}/{dataset}`` is loaded
    (first column used as the index) and flattened. The flattened
    ``Control with Clustal Omega.csv`` values are used as column ``x`` and the
    file's values as column ``y`` of a two-column frame that is passed to
    ``ppscore.score(frame, "x", "y")``; the ``"ppscore"`` entry of its result
    is kept, rounded to 4 decimals.

    Args:
        dataset: Name of the dataset sub-directory; also used as the label of
            the single result row.
        channel: Name of the channel sub-directory. The column produced from
            ``MultiScale Structural Similarity Index Measure.csv`` is renamed
            to this value.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is named ``"dataset"`` and
        that has one column per CSV file (file name without extension), in
        sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_values = control_df.values.flatten()
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order of the result reproducible across machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        basename = os.path.splitext(file)[0]
        df = pandas.read_csv(f"{path}/{file}", index_col=0)
        pairs = pandas.DataFrame({"x": control_values, "y": df.values.flatten()})
        result_dict[basename] = round(ppscore.score(pairs, "x", "y")["ppscore"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    result_df.index.name = "dataset"
    return result_df

## Predictive Power Score from distance matrices

In [63]:
# Predictive Power Score against the control matrix for every channel and dataset.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_correlate(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.9968,0.9953,0.9944,0.9536,0.9539,0.9495,0.9532,0.9506,0.9539,0.9533,0.9482,0.9495,0.9532,0.0,0.9011
orthologs_myoglobin,0.9968,0.9954,0.9925,0.9335,0.9406,0.9271,0.941,0.9275,0.9354,0.9362,0.9257,0.9271,0.941,0.0,0.8855
orthologs_neuroglobin,0.9964,0.9876,0.9897,0.9621,0.9781,0.9577,0.9595,0.9628,0.9653,0.9587,0.9702,0.9577,0.9595,0.0,0.9357
orthologs_cytoglobin,0.9913,0.8524,0.8584,0.772,0.7444,0.768,0.7666,0.7729,0.7623,0.7658,0.7471,0.768,0.7666,0.0,0.777
orthologs_androglobin,0.9977,0.9773,0.9799,0.8458,0.905,0.8571,0.8602,0.8491,0.8521,0.8541,0.9097,0.84,0.9103,0.0,0.863


In [70]:
def read_and_tree_compare(dataset, channel="full", control="Control with Clustal Omega"):
    """Compare every Newick tree of a dataset with a reference tree.

    Each ``.nw`` file found in ``{tree_path}/{channel}/{dataset}`` is loaded
    with ``ete3.Tree(..., format=1)`` and compared, as an unrooted tree, with
    the reference tree ``{control}.nw`` from the same directory. The
    ``"norm_rf"`` entry of the ``Tree.compare`` result is kept, rounded to
    4 decimals.

    Args:
        dataset: Name of the dataset sub-directory; also used as the label of
            the single result row.
        channel: Name of the channel sub-directory. The column produced from
            ``MultiScale Structural Similarity Index Measure.nw`` is renamed
            to this value.
        control: File name (without the ``.nw`` extension) of the tree used
            as the reference.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is named ``"dataset"`` and
        that has one column per tree file (file name without extension), in
        sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/{control}.nw", format=1)
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order of the result reproducible across machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        basename = os.path.splitext(file)[0]
        tree = Tree(f"{path}/{file}", format=1)
        result = control_tree.compare(tree, unrooted=True)
        result_dict[basename] = round(result["norm_rf"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    result_df.index.name = "dataset"
    return result_df

## Robinson–Foulds distance from generated trees

In [81]:
# Tree comparison against the default reference tree for every channel and dataset.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_tree_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0833,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0,0.0,1.0,0.3333
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0,0.0,0.0,0.0,0.0,0.0833,1.0,0.1667
orthologs_neuroglobin,0.0,0.0,0.0,0.0833,0.3333,0.1667,0.0833,0.0833,0.0833,0.1667,0.3333,0.1667,0.0833,1.0,0.1667
orthologs_cytoglobin,0.0,0.6667,0.5,0.6667,0.5,0.8333,0.6667,0.5833,0.5833,0.5833,0.4167,0.8333,0.6667,1.0,0.5833
orthologs_androglobin,0.0,0.8333,0.5,1.0,1.0,0.9167,0.9167,0.9167,0.9167,0.9167,1.0,0.9167,0.8333,1.0,0.8333


In [72]:
# Tree comparison using the "Global with Needleman-Wunsch" tree as the reference.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat(
        [
            read_and_tree_compare(dataset, channel, "Global with Needleman-Wunsch")
            for dataset in datasets
        ]
    )
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0833,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0,0.0,1.0,0.3333
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0,0.0,0.0,0.0,0.0,0.0833,1.0,0.1667
orthologs_neuroglobin,0.0,0.0,0.0,0.0833,0.3333,0.1667,0.0833,0.0833,0.0833,0.1667,0.3333,0.1667,0.0833,1.0,0.1667
orthologs_cytoglobin,0.6667,0.0,0.25,0.3333,0.5833,0.5833,0.3333,0.4167,0.4167,0.5,0.5833,0.5833,0.3333,1.0,0.5
orthologs_androglobin,0.8333,0.0,0.4167,1.0,0.8333,1.0,1.0,1.0,1.0,1.0,0.8333,0.5,0.6667,1.0,0.5833


In [73]:
# Tree comparison using the "Local with Smith–Waterman" tree as the reference.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat(
        [
            read_and_tree_compare(dataset, channel, "Local with Smith–Waterman")
            for dataset in datasets
        ]
    )
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0833,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0,0.0,1.0,0.3333
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0,0.0,0.0,0.0,0.0,0.0833,1.0,0.1667
orthologs_neuroglobin,0.0,0.0,0.0,0.0833,0.3333,0.1667,0.0833,0.0833,0.0833,0.1667,0.3333,0.1667,0.0833,1.0,0.1667
orthologs_cytoglobin,0.5,0.25,0.0,0.25,0.4167,0.6667,0.25,0.3333,0.3333,0.5,0.4167,0.6667,0.25,1.0,0.5
orthologs_androglobin,0.5,0.4167,0.0,1.0,0.8333,1.0,1.0,1.0,1.0,1.0,0.8333,0.5833,0.6667,1.0,0.6667


In [66]:
def read_and_tree_compare_branches(dataset, channel="full"):
    """Score how many branches of each tree disagree with the control tree.

    Each ``.nw`` file found in ``{tree_path}/{channel}/{dataset}`` is loaded
    with ``ete3.Tree(..., format=1)`` and compared, as an unrooted tree, with
    ``Control with Clustal Omega.nw`` from the same directory. The score is
    ``1 - result["source_edges_in_ref"]`` from the ``Tree.compare`` result,
    rounded to 4 decimals.

    Args:
        dataset: Name of the dataset sub-directory; also used as the label of
            the single result row.
        channel: Name of the channel sub-directory. The column produced from
            ``MultiScale Structural Similarity Index Measure.nw`` is renamed
            to this value.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is named ``"dataset"`` and
        that has one column per tree file (file name without extension), in
        sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/Control with Clustal Omega.nw", format=1)
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # column order of the result reproducible across machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        basename = os.path.splitext(file)[0]
        tree = Tree(f"{path}/{file}", format=1)
        result = control_tree.compare(tree, unrooted=True)
        # Round after the subtraction: rounding first and subtracting from 1.0
        # afterwards reintroduces float noise (e.g. 0.03849999999999998).
        result_dict[basename] = round(1.0 - result["source_edges_in_ref"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    result_df.index.name = "dataset"
    return result_df

## Compatibility branch score from generated Trees

In [67]:
# Branch score against the control tree for every channel and dataset.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]
# One frame per channel, with the datasets stacked as rows.
sum_dfs = [
    pandas.concat(
        [read_and_tree_compare_branches(dataset, channel) for dataset in datasets]
    )
    for channel in channels
]
# Transposing, dropping duplicate rows and transposing back removes the
# columns that repeat with identical name and values for every channel.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates()
    .set_index("index")
    .T
)

index,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0385,0.0,0.0,0.0,0.0385,0.0,0.0385,0.0,0.0,0.4615,0.1538
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0385,0.0,0.0385,0.0,0.0,0.0,0.0,0.0,0.0385,0.4615,0.0769
orthologs_neuroglobin,0.0,0.0,0.0,0.0385,0.1538,0.0769,0.0385,0.0385,0.0385,0.0769,0.1538,0.0769,0.0385,0.4615,0.0769
orthologs_cytoglobin,0.0,0.3077,0.2308,0.3077,0.2308,0.3846,0.3077,0.2692,0.2692,0.2692,0.1923,0.3846,0.3077,0.4615,0.2692
orthologs_androglobin,0.0,0.3846,0.2308,0.4615,0.4615,0.4231,0.4231,0.4231,0.4231,0.4231,0.4615,0.4231,0.3846,0.4615,0.3846


In [68]:
def read_and_compare_channels(dataset):
    """Build a channel-by-channel Euclidean distance table for one dataset.

    For every channel the file
    ``{tree_path}/{channel}/{dataset}/MultiScale Structural Similarity Index Measure.csv``
    is loaded (first column used as the index). Every pair of channels is then
    compared by the square root of the sum of squared element-wise
    differences, rounded to 4 decimals.

    Args:
        dataset: Name of the dataset sub-directory to read in each channel.

    Returns:
        A pandas ``Styler`` over the square table (rows and columns are the
        channels, index named ``"dataset"``), with a ``"YlGnBu"`` background
        gradient scaled between the table's overall minimum and maximum.
    """
    channels = (
        "full",
        "red", "green", "blue",
        "red_green", "red_blue", "green_blue",
        "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
    )

    def _distance(left, right):
        # Positional comparison of the two matrices, rounded to 4 decimals.
        return round(numpy.sqrt(numpy.sum((left.values - right.values) ** 2)), 4)

    matrices = {
        name: pandas.read_csv(
            f"{tree_path}/{name}/{dataset}/MultiScale Structural Similarity Index Measure.csv",
            index_col=0,
        )
        for name in channels
    }

    table = {}
    for row_name, row_matrix in matrices.items():
        table[row_name] = {
            col_name: _distance(row_matrix, col_matrix)
            for col_name, col_matrix in matrices.items()
        }

    result_df = pandas.DataFrame.from_dict(table, orient="index")
    result_df.index.name = "dataset"
    lowest = result_df.min().min()
    highest = result_df.max().max()
    return result_df.style.background_gradient(
        axis=None, vmin=lowest, vmax=highest, cmap="YlGnBu"
    )

## Pairwise Euclidean distance between channel distance matrices

In [69]:
# Show one styled channel-vs-channel table per dataset, each under a centred heading.
for dataset in datasets:
    heading = HTML(f"<center><h3>{dataset}</h3></center>")
    display(heading)
    styled_table = read_and_compare_channels(dataset)
    display(styled_table)

Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,0.8552,1.5897,1.6138,0.817,0.8412,0.7729,0.1198,0.1122,0.0572,2.4305,0.3012
red,0.8552,0.0,0.737,0.7606,0.0566,0.0376,0.0944,0.7431,0.9506,0.8766,1.5767,0.8294
green,1.5897,0.737,0.0,0.0439,0.7729,0.7498,0.817,1.4785,1.6825,1.6111,0.8412,1.5362
blue,1.6138,0.7606,0.0439,0.0,0.7977,0.7729,0.8412,1.5024,1.7079,1.634,0.817,1.5635
red_green,0.817,0.0566,0.7729,0.7977,0.0,0.0439,0.0502,0.706,0.9106,0.8398,1.6138,0.7903
red_blue,0.8412,0.0376,0.7498,0.7729,0.0439,0.0,0.075,0.7297,0.9372,0.8618,1.5897,0.8199
green_blue,0.7729,0.0944,0.817,0.8412,0.0502,0.075,0.0,0.663,0.867,0.7945,1.6578,0.7507
gray_r,0.1198,0.7431,1.4785,1.5024,0.706,0.7297,0.663,0.0,0.2249,0.1506,2.3188,0.3076
gray_g,0.1122,0.9506,1.6825,1.7079,0.9106,0.9372,0.867,0.2249,0.0,0.1317,2.5237,0.312
gray_b,0.0572,0.8766,1.6111,1.634,0.8398,0.8618,0.7945,0.1506,0.1317,0.0,2.451,0.3324


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,1.0586,1.9462,1.8586,1.0077,0.9198,0.9391,0.082,0.1269,0.1801,2.8656,0.5492
red,1.0586,0.0,0.8918,0.8051,0.0742,0.1497,0.1417,1.0086,0.9552,1.2197,1.8097,1.4368
green,1.9462,0.8918,0.0,0.1002,0.9391,1.0276,1.0077,1.8983,1.8397,2.105,0.9198,2.3048
blue,1.8586,0.8051,0.1002,0.0,0.8532,0.9391,0.9198,1.8113,1.7538,2.0155,1.0077,2.2164
red_green,1.0077,0.0742,0.9391,0.8532,0.0,0.1002,0.0832,0.9595,0.9018,1.1696,1.8586,1.3973
red_blue,0.9198,0.1497,1.0276,0.9391,0.1002,0.0,0.0384,0.873,0.8177,1.0785,1.9462,1.3103
green_blue,0.9391,0.1417,1.0077,0.9198,0.0832,0.0384,0.0,0.8939,0.8347,1.0978,1.9269,1.3307
gray_r,0.082,1.0086,1.8983,1.8113,0.9595,0.873,0.8939,0.0,0.1151,0.2494,2.8172,0.5815
gray_g,0.1269,0.9552,1.8397,1.7538,0.9018,0.8177,0.8347,0.1151,0.0,0.3006,2.7595,0.6341
gray_b,0.1801,1.2197,2.105,2.0155,1.1696,1.0785,1.0978,0.2494,0.3006,0.0,3.0232,0.4688


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,2.898,3.1722,3.362,1.9306,2.2643,1.1769,2.2385,1.3803,0.5454,5.2886,1.1382
red,2.898,0.0,1.1785,1.0842,1.1435,0.7601,2.0518,0.8897,4.215,3.4348,2.651,3.9714
green,3.1722,1.1785,0.0,0.2967,1.2779,1.0885,2.0461,1.8533,4.2938,3.6569,2.1469,4.0911
blue,3.362,1.0842,0.2967,0.0,1.4399,1.1793,2.2656,1.8516,4.5252,3.8581,1.929,4.3144
red_green,1.9306,1.1435,1.2779,1.4399,0.0,0.4021,0.9239,1.0529,3.1579,2.4422,3.3602,2.9325
red_blue,2.2643,0.7601,1.0885,1.1793,0.4021,0.0,1.3163,0.8662,3.5312,2.7854,3.0611,3.2992
green_blue,1.1769,2.0518,2.0461,2.2656,0.9239,1.3163,0.0,1.7025,2.2633,1.6226,4.1888,2.0578
gray_r,2.2385,0.8897,1.8533,1.8516,1.0529,0.8662,1.7025,0.0,3.6095,2.7715,3.5307,3.3524
gray_g,1.3803,4.215,4.2938,4.5252,3.1579,3.5312,2.2633,3.6095,0.0,0.8902,6.4408,0.3881
gray_b,0.5454,3.4348,3.6569,3.8581,2.4422,2.7854,1.6226,2.7715,0.8902,0.0,5.7871,0.6733


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,4.4142,6.4479,6.9552,3.6161,4.2049,2.8242,2.7293,2.2618,0.3931,10.5661,1.2192
red,4.4142,0.0,2.7807,3.0377,1.3397,0.833,2.2973,1.9869,6.5866,4.7443,6.4611,5.3891
green,6.4479,2.7807,0.0,0.6414,2.8642,2.4155,3.6583,4.6041,8.3137,6.7076,4.1569,7.1482
blue,6.9552,3.0377,0.6414,0.0,3.3448,2.8199,4.2016,4.9554,8.8772,7.2257,3.6129,7.6983
red_green,3.6161,1.3397,2.8642,3.3448,0.0,0.6799,1.0289,1.9791,5.6244,3.902,6.9518,4.4334
red_blue,4.2049,0.833,2.4155,2.8199,0.6799,0.0,1.6994,2.2041,6.2676,4.5028,6.3991,5.0687
green_blue,2.8242,2.2973,3.6583,4.2016,1.0289,1.6994,0.0,2.1029,4.6824,3.0616,7.8071,3.5125
gray_r,2.7293,1.9869,4.6041,4.9554,1.9791,2.2041,2.1029,0.0,4.9835,3.0903,8.439,3.8435
gray_g,2.2618,6.5866,8.3137,8.8772,5.6244,6.2676,4.6824,4.9835,0.0,1.9241,12.4706,1.3071
gray_b,0.3931,4.7443,6.7076,7.2257,3.902,4.5028,3.0616,3.0903,1.9241,0.0,10.8386,0.9227


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,0.2138,0.0155,0.0235,0.0052,0.0137,0.0104,0.1999,0.1818,0.2001,0.2363,0.2575
red,0.2138,0.0,0.2036,0.1965,0.21,0.2028,0.2073,0.0198,0.0394,0.0177,0.032,0.4157
green,0.0155,0.2036,0.0,0.0086,0.0105,0.0055,0.0053,0.1905,0.1722,0.1899,0.2249,0.2712
blue,0.0235,0.1965,0.0086,0.0,0.0184,0.0104,0.0137,0.1838,0.1658,0.1831,0.2174,0.279
red_green,0.0052,0.21,0.0105,0.0184,0.0,0.0087,0.0057,0.1963,0.1783,0.1963,0.2322,0.2624
red_blue,0.0137,0.2028,0.0055,0.0104,0.0087,0.0,0.0063,0.1894,0.1716,0.1893,0.2246,0.2701
green_blue,0.0104,0.2073,0.0053,0.0137,0.0057,0.0063,0.0,0.194,0.1756,0.1936,0.229,0.2663
gray_r,0.1999,0.0198,0.1905,0.1838,0.1963,0.1894,0.194,0.0,0.028,0.0164,0.0515,0.3978
gray_g,0.1818,0.0394,0.1722,0.1658,0.1783,0.1716,0.1756,0.028,0.0,0.0228,0.0652,0.3785
gray_b,0.2001,0.0177,0.1899,0.1831,0.1963,0.1893,0.1936,0.0164,0.0228,0.0,0.0436,0.4004
