# Comparing channels

- R: sequential
- G: reverse complementary
- B: inverse

In [1]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [2]:
tree_path = "../../../data/trees"

In [3]:
def read_and_compare(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            basename = ".".join(file.split(".")[0:-1])
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
            result_dict[basename] = round(numpy.sqrt(numpy.sum((control_df.values - df.values)**2)), 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"MultiScale Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Euclidean Distance from distance matrices

In [4]:
channels = ("full", "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,full,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,Structural Similarity Index Measure,Control with Clustal Omega,Local with Smith–Waterman,red,Universal Quality Index,Deep Search with Annoy,...,Deep Search with Annoy,Structural Similarity Index Measure,gray_max,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,gray_mean,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
indelible,19.488,4.9357,10.4094,6.3518,4.8671,0.0,8.72,30.0765,21.2013,13.5759,...,5.1742,5.9398,14.5535,11.0862,5.7711,5.4829,19.3268,9.9296,6.5418,4.9414
orthologs_hemoglobin_beta,0.1542,1.7944,0.9085,3.4018,1.6405,0.0,0.8937,0.9769,0.2275,2.2404,...,5.8557,2.4685,0.3889,0.9606,4.0439,2.2516,0.0754,0.9427,4.0765,1.8407
orthologs_myoglobin,0.0954,1.9198,0.9196,3.4647,1.7845,0.0,0.9044,0.9702,0.1829,2.4713,...,5.1548,2.7135,0.5494,1.0809,4.7645,2.4729,0.1965,1.1184,4.5982,2.1081
orthologs_neuroglobin,1.2176,4.3852,1.8043,4.9502,4.3484,0.0,1.1514,1.3372,0.7065,3.3366,...,5.7259,5.5042,1.8501,2.1755,5.6379,4.9408,1.1943,2.4028,5.6275,4.4653
orthologs_cytoglobin,2.7743,8.2561,6.3417,8.3726,8.0178,0.0,3.1095,2.448,1.5906,6.1292,...,9.6924,9.853,4.0279,4.1343,8.7963,9.0304,2.7103,4.5513,8.8261,8.0407
orthologs_androglobin,4.0221,10.0408,8.2855,8.6007,9.764,0.0,2.9601,0.9328,2.6475,6.7286,...,11.0382,,5.6724,5.8893,9.5528,,3.7766,6.3139,9.3038,


In [5]:
channels = ("full", "red", "green", "blue")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,full,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,Structural Similarity Index Measure,Control with Clustal Omega,Local with Smith–Waterman,red,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,green,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,blue,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
indelible,19.488,4.9357,10.4094,6.3518,4.8671,0.0,8.72,30.0765,21.2013,13.5759,21.2583,27.7266,21.152,5.3203,21.8397,27.7222,21.1548,6.2651,21.8372
orthologs_hemoglobin_beta,0.1542,1.7944,0.9085,3.4018,1.6405,0.0,0.8937,0.9769,0.2275,2.2404,0.1363,0.7896,0.1463,3.0701,0.199,0.8,0.1691,3.8291,0.2245
orthologs_myoglobin,0.0954,1.9198,0.9196,3.4647,1.7845,0.0,0.9044,0.9702,0.1829,2.4713,0.1045,0.7975,0.138,2.5298,0.1797,0.7811,0.1259,3.0248,0.1721
orthologs_neuroglobin,1.2176,4.3852,1.8043,4.9502,4.3484,0.0,1.1514,1.3372,0.7065,3.3366,0.8911,0.9323,0.7349,3.5526,0.732,0.9554,0.7217,3.7993,0.7021
orthologs_cytoglobin,2.7743,8.2561,6.3417,8.3726,8.0178,0.0,3.1095,2.448,1.5906,6.1292,1.868,1.7804,1.6181,6.5167,1.5885,1.7816,1.6206,7.5317,1.5757
orthologs_androglobin,4.0221,10.0408,8.2855,8.6007,9.764,0.0,2.9601,0.9328,2.6475,6.7286,,1.0347,2.6778,11.8461,5.612,1.0623,2.6542,11.0361,5.6163


In [6]:
channels = ("full", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,full,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,Structural Similarity Index Measure,Control with Clustal Omega,Local with Smith–Waterman,gray_r,Universal Quality Index,Deep Search with Annoy,...,Deep Search with Annoy,Structural Similarity Index Measure,gray_max,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,gray_mean,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
indelible,19.488,4.9357,10.4094,6.3518,4.8671,0.0,8.72,19.5789,4.9113,8.1126,...,5.1742,5.9398,14.5535,11.0862,5.7711,5.4829,19.3268,9.9296,6.5418,4.9414
orthologs_hemoglobin_beta,0.1542,1.7944,0.9085,3.4018,1.6405,0.0,0.8937,0.2307,1.6217,4.1546,...,5.8557,2.4685,0.3889,0.9606,4.0439,2.2516,0.0754,0.9427,4.0765,1.8407
orthologs_myoglobin,0.0954,1.9198,0.9196,3.4647,1.7845,0.0,0.9044,0.12,1.7881,3.6828,...,5.1548,2.7135,0.5494,1.0809,4.7645,2.4729,0.1965,1.1184,4.5982,2.1081
orthologs_neuroglobin,1.2176,4.3852,1.8043,4.9502,4.3484,0.0,1.1514,1.1599,4.2828,4.6459,...,5.7259,5.5042,1.8501,2.1755,5.6379,4.9408,1.1943,2.4028,5.6275,4.4653
orthologs_cytoglobin,2.7743,8.2561,6.3417,8.3726,8.0178,0.0,3.1095,2.7064,8.1209,8.3101,...,9.6924,9.853,4.0279,4.1343,8.7963,9.0304,2.7103,4.5513,8.8261,8.0407
orthologs_androglobin,4.0221,10.0408,8.2855,8.6007,9.764,0.0,2.9601,4.0998,10.0046,9.7836,...,11.0382,,5.6724,5.8893,9.5528,,3.7766,6.3139,9.3038,


In [7]:
channels = ("full", "red_green", "red_blue", "green_blue")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,full,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,Structural Similarity Index Measure,Control with Clustal Omega,Local with Smith–Waterman,red_green,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,red_blue,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,green_blue,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
indelible,19.488,4.9357,10.4094,6.3518,4.8671,0.0,8.72,23.6159,10.8325,6.4986,12.1031,23.6115,10.8349,8.0268,12.1008,23.5676,10.7897,5.2276,12.0657
orthologs_hemoglobin_beta,0.1542,1.7944,0.9085,3.4018,1.6405,0.0,0.8937,0.4849,0.8008,3.3919,0.7123,0.4957,0.7808,3.6864,0.6902,0.4511,0.8787,3.2673,0.7499
orthologs_myoglobin,0.0954,1.9198,0.9196,3.4647,1.7845,0.0,0.9044,0.451,0.8755,3.3211,0.7957,0.4347,0.8781,3.6256,0.7965,0.4279,0.9431,2.8999,0.8332
orthologs_neuroglobin,1.2176,4.3852,1.8043,4.9502,4.3484,0.0,1.1514,0.6618,2.4277,4.3246,2.4301,0.6385,2.4158,4.5724,2.3888,0.652,2.4714,4.4973,2.4261
orthologs_cytoglobin,2.7743,8.2561,6.3417,8.3726,8.0178,0.0,3.1095,1.5191,4.6484,7.6154,4.5236,1.5174,4.6571,8.1302,4.499,1.5304,4.718,7.9015,4.5461
orthologs_androglobin,4.0221,10.0408,8.2855,8.6007,9.764,0.0,2.9601,2.4228,6.3221,9.5862,7.6809,2.4578,6.298,8.8234,7.6852,2.4023,6.3283,10.3151,7.6536


In [8]:
def read_and_linear_correlate(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            basename = ".".join(file.split(".")[0:-1])
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
            result_dict[basename] = round(spearmanr(control_df.values.flatten(), df.values.flatten())[0], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"MultiScale Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Linear Correlation from distance matrices

In [9]:
channels = ("full", "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_linear_correlate(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,full,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,Structural Similarity Index Measure,Control with Clustal Omega,Local with Smith–Waterman,red,Universal Quality Index,Deep Search with Annoy,...,Deep Search with Annoy,Structural Similarity Index Measure,gray_max,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,gray_mean,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
indelible,0.1616,0.2561,0.2541,0.1143,0.1957,1.0,0.3428,0.1623,0.2746,0.0336,...,0.179,0.2174,0.2284,0.3654,0.1092,0.2483,0.2238,0.2992,0.121,0.2817
orthologs_hemoglobin_beta,0.9965,0.9982,0.9999,0.2451,0.9975,1.0,0.9998,0.9904,0.9965,0.195,...,0.2488,0.9821,0.9948,0.997,0.2774,0.9968,0.9962,0.9966,0.2594,0.9976
orthologs_myoglobin,0.9918,0.9973,0.9999,0.2297,0.9969,1.0,0.9998,0.9915,0.9959,0.2522,...,0.322,0.9688,0.9952,0.9977,0.2875,0.9981,0.993,0.9891,0.2833,0.9967
orthologs_neuroglobin,0.9861,0.9904,0.9999,0.1462,0.9888,1.0,0.9992,0.9827,0.9883,0.1475,...,0.1171,0.9927,0.9852,0.9804,0.1388,0.9796,0.9917,0.9777,0.1488,0.9943
orthologs_cytoglobin,0.3888,0.521,0.7775,0.2531,0.4716,1.0,0.8623,0.293,0.4978,0.2731,...,0.2597,0.4625,0.436,0.6277,0.2534,0.365,0.4386,0.6606,0.2703,0.3971
orthologs_androglobin,0.5718,0.6514,0.9054,0.2036,0.4814,1.0,0.9769,0.5633,0.6551,0.2166,...,0.1715,,0.6329,0.7081,0.2196,,0.394,0.7357,0.1967,


In [10]:
def read_and_correlate(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            basename = ".".join(file.split(".")[0:-1])
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
            result_dict[basename] = round(ppscore.score(
                pandas.DataFrame({"x": control_df.values.flatten(), "y": df.values.flatten()}), "x", "y")["ppscore"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"MultiScale Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Predictive Power Score from distance matrices

In [11]:
channels = ("full", "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_correlate(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,full,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,Structural Similarity Index Measure,Control with Clustal Omega,Local with Smith–Waterman,red,Universal Quality Index,Deep Search with Annoy,...,Deep Search with Annoy,Structural Similarity Index Measure,gray_max,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,gray_mean,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
indelible,0.8666,0.9809,0.9039,0.5474,0.9868,0.9998,0.9779,0.8203,0.9783,0.5491,...,0.567,0.6654,0.8543,0.9692,0.5288,0.9852,0.5002,0.8278,0.5134,0.7179
orthologs_hemoglobin_beta,0.9658,0.9798,0.9953,0.4035,0.9776,0.9968,0.9944,0.9491,0.9693,0.395,...,0.2939,0.915,0.9586,0.9713,0.4367,0.9706,0.9672,0.9646,0.4078,0.9741
orthologs_myoglobin,0.9471,0.9697,0.9954,0.2873,0.9689,0.9968,0.9925,0.9457,0.9651,0.2766,...,0.3381,0.8541,0.9548,0.9742,0.2751,0.9722,0.9363,0.9312,0.2911,0.9563
orthologs_neuroglobin,0.9584,0.9673,0.9876,0.3161,0.9659,0.9964,0.9897,0.9562,0.9636,0.298,...,0.3245,0.9616,0.9555,0.9661,0.3295,0.9668,0.9599,0.9654,0.3321,0.9681
orthologs_cytoglobin,0.7789,0.7964,0.8524,0.6691,0.7943,0.9913,0.8584,0.7765,0.794,0.6707,...,0.7007,0.8,0.7697,0.8007,0.6833,0.7992,0.7621,0.7992,0.6882,0.7945
orthologs_androglobin,0.8638,0.8654,0.9773,0.5908,0.7885,0.9977,0.9799,0.8775,0.865,0.5741,...,0.6273,,0.8729,0.87,0.5983,,0.8746,0.8663,0.6001,


In [12]:
def read_and_tree_compare(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/Control with Clustal Omega.nw", format=1)
    for file in os.listdir(path):
        if file.endswith(".nw"):
            basename = ".".join(file.split(".")[0:-1])
            tree = Tree(f"{path}/{file}", format=1)
            result = control_tree.compare(tree, unrooted=True)
            result_dict[basename] = round(result["norm_rf"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"MultiScale Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Robinson-foulds distance from generated Trees

In [15]:
channels = ("full", "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,Global with Needleman-Wunsch,Local with Smith–Waterman,Deep Search with Annoy,Structural Similarity Index Measure,full,Control with Clustal Omega,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,red,...,gray_g,Universal Quality Index,Deep Search with Annoy,Structural Similarity Index Measure,gray_b,Universal Quality Index,Structural Similarity Index Measure,gray_max,Structural Similarity Index Measure,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
indelible,0.1622,0.1622,1.0,0.1892,0.1892,0.0,0.1892,1.0,0.1892,0.1892,...,0.1892,0.1892,0.973,0.1892,0.1892,0.1892,0.1622,0.1892,0.1892,0.1892
orthologs_hemoglobin_beta,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0833,0.0833,...,0.0,0.0,1.0,0.1667,0.0833,0.1667,0.0,0.0,0.0,0.0833
orthologs_myoglobin,0.0,0.0,0.9167,0.0833,0.0,0.0,0.0833,1.0,0.0833,0.0833,...,0.25,0.25,0.8333,0.1667,0.0833,0.1667,0.0833,0.0833,0.0833,0.0833
orthologs_neuroglobin,0.0,0.0,1.0,0.0833,0.1667,0.0,0.0833,1.0,0.1667,0.1667,...,0.25,0.0833,1.0,0.0833,0.1667,0.0833,0.0833,0.1667,0.0833,0.1667
orthologs_cytoglobin,0.6667,0.5,1.0,0.9167,0.9167,0.0,0.9167,1.0,0.9167,0.9167,...,0.9167,0.9167,1.0,0.9167,0.9167,0.9167,0.9167,0.9167,0.9167,0.9167
orthologs_androglobin,0.8333,0.5,1.0,1.0,0.9167,0.0,0.9167,1.0,,0.9167,...,0.9167,0.9167,1.0,,0.9167,0.9167,,0.9167,,0.9167


In [16]:
def read_and_tree_compare_branches(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/Control with Clustal Omega.nw", format=1)
    for file in os.listdir(path):
        if file.endswith(".nw"):
            basename = ".".join(file.split(".")[0:-1])
            tree = Tree(f"{path}/{file}", format=1)
            result = control_tree.compare(tree, unrooted=True)
            result_dict[basename] = 1.0 - round(result["source_edges_in_ref"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"MultiScale Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Compatibility branch score from generated Trees

In [17]:
channels = ("full",)#, "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
datasets = ["indelible", "orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin"]
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare_branches(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates().set_index("index").T

index,Global with Needleman-Wunsch,Local with Smith–Waterman,Deep Search with Annoy,Structural Similarity Index Measure,full,Control with Clustal Omega,Universal Quality Index
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
indelible,0.0789,0.0789,0.4868,0.0921,0.0921,0.0,0.0921
orthologs_hemoglobin_beta,0.0,0.0,0.4615,0.0,0.0,0.0,0.0
orthologs_myoglobin,0.0,0.0,0.4231,0.0385,0.0,0.0,0.0385
orthologs_neuroglobin,0.0,0.0,0.4615,0.0385,0.0769,0.0,0.0385
orthologs_cytoglobin,0.3077,0.2308,0.4615,0.4231,0.4231,0.0,0.4231
orthologs_androglobin,0.3846,0.2308,0.4615,0.4615,0.4231,0.0,0.4231


In [18]:
def read_and_compare_channels(dataset):
    result_dict = {}
    dfs = {}
    channels = ("full", "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
    for channel in channels:
        result_dict[channel] = {}
        path = f"{tree_path}/{channel}/{dataset}"
        dfs[channel] = pandas.read_csv(f"{path}/MultiScale Structural Similarity Index Measure.csv", index_col=0)
    for channel in channels:
        control_df = dfs[channel]
        for c, df in dfs.items():
            result_dict[channel][c] = round(numpy.sqrt(numpy.sum((control_df.values - df.values)**2)), 4)
    result_df = pandas.DataFrame.from_dict(result_dict, orient='index')
    result_df.index.name = "dataset"
    return result_df.style.background_gradient(axis=None, vmin=result_df.min().min(), vmax=result_df.max().max(), cmap="YlGnBu")

## Features Euclidean distance from distance matrices

In [19]:
for dataset in datasets:
    display(HTML(f"<center><h3>{dataset}</h3></center>"))
    display(read_and_compare_channels(dataset))

Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,10.7963,8.4147,8.4104,4.2321,4.2277,4.1827,0.0955,8.9084,7.3829,5.1708,0.6432
red,10.7963,0.0,2.3816,2.3859,6.5642,6.5686,6.6136,10.7021,19.6403,18.0624,15.9642,10.9874
green,8.4147,2.3816,0.0,0.0122,4.1827,4.187,4.2321,8.3206,17.2659,15.693,13.583,8.6107
blue,8.4104,2.3859,0.0122,0.0,4.1783,4.1827,4.2277,8.3163,17.2618,15.6888,13.5786,8.6064
red_green,4.2321,6.5642,4.1827,4.1783,0.0,0.0122,0.0499,4.1379,13.1025,11.5433,9.401,4.4491
red_blue,4.2277,6.5686,4.187,4.1827,0.0122,0.0,0.0466,4.1336,13.0984,11.5391,9.3966,4.4447
green_blue,4.1827,6.6136,4.2321,4.2277,0.0499,0.0466,0.0,4.0886,13.0535,11.4946,9.3516,4.4001
gray_r,0.0955,10.7021,8.3206,8.3163,4.1379,4.1336,4.0886,0.0,9.0014,7.4742,5.2649,0.6754
gray_g,8.9084,19.6403,17.2659,17.2618,13.1025,13.0984,13.0535,9.0014,0.0,2.5451,3.9033,8.7833
gray_b,7.3829,18.0624,15.693,15.6888,11.5433,11.5391,11.4946,7.4742,2.5451,0.0,2.6544,7.1514


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,0.8345,0.6469,0.6569,0.3408,0.3508,0.3064,0.0848,1.1911,0.9544,0.5198,0.1236
red,0.8345,0.0,0.1881,0.178,0.4939,0.4838,0.5285,0.7559,2.0245,1.7791,1.3532,0.9545
green,0.6469,0.1881,0.0,0.0188,0.3064,0.2968,0.3408,0.5691,1.8367,1.5927,1.1656,0.767
blue,0.6569,0.178,0.0188,0.0,0.3168,0.3064,0.3508,0.5791,1.8472,1.6019,1.1757,0.7771
red_green,0.3408,0.4939,0.3064,0.3168,0.0,0.0188,0.0384,0.2631,1.531,1.2903,0.8598,0.4613
red_blue,0.3508,0.4838,0.2968,0.3064,0.0188,0.0,0.0479,0.273,1.5416,1.2993,0.8698,0.4714
green_blue,0.3064,0.5285,0.3408,0.3508,0.0384,0.0479,0.0,0.2309,1.4966,1.2546,0.8253,0.4269
gray_r,0.0848,0.7559,0.5691,0.5791,0.2631,0.273,0.2309,0.0,1.2708,1.0351,0.5995,0.204
gray_g,1.1911,2.0245,1.8367,1.8472,1.531,1.5416,1.4966,1.2708,0.0,0.3399,0.6755,1.0723
gray_b,0.9544,1.7791,1.5927,1.6019,1.2903,1.2993,1.2546,1.0351,0.3399,0.0,0.4597,0.8348


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,0.8932,0.7197,0.7035,0.3718,0.3558,0.348,0.0417,1.0881,1.1767,0.6229,0.2597
red,0.8932,0.0,0.1747,0.1901,0.5216,0.5377,0.5455,0.862,1.9738,2.0636,1.514,1.1454
green,0.7197,0.1747,0.0,0.0285,0.348,0.3653,0.3718,0.6893,1.7995,1.8906,1.3406,0.9724
blue,0.7035,0.1901,0.0285,0.0,0.3323,0.348,0.3558,0.6726,1.7851,1.8744,1.3247,0.9564
red_green,0.3718,0.5216,0.348,0.3323,0.0,0.0285,0.0267,0.3417,1.4547,1.5451,0.9932,0.6257
red_blue,0.3558,0.5377,0.3653,0.348,0.0285,0.0,0.0237,0.3247,1.4409,1.529,0.9775,0.6101
green_blue,0.348,0.5455,0.3718,0.3558,0.0267,0.0237,0.0,0.3186,1.4308,1.5209,0.9696,0.6024
gray_r,0.0417,0.862,0.6893,0.6726,0.3417,0.3247,0.3186,0.0,1.1229,1.2092,0.6547,0.2908
gray_g,1.0881,1.9738,1.7995,1.7851,1.4547,1.4409,1.4308,1.1229,0.0,0.2725,0.4946,0.8469
gray_b,1.1767,2.0636,1.8906,1.8744,1.5451,1.529,1.5209,1.2092,0.2725,0.0,0.5724,0.9279


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,2.223,1.7258,1.769,0.8719,0.915,0.854,0.0833,2.5,2.2402,0.7488,0.1967
red,2.223,0.0,0.4974,0.4543,1.3513,1.3081,1.3691,2.144,4.7203,4.4479,2.9423,2.2875
green,1.7258,0.4974,0.0,0.0476,0.854,0.8109,0.8719,1.6468,4.2231,3.9531,2.4476,1.7933
blue,1.769,0.4543,0.0476,0.0,0.8975,0.854,0.915,1.6901,4.2668,3.9948,2.4899,1.8347
red_green,0.8719,1.3513,0.854,0.8975,0.0,0.0476,0.027,0.7928,3.3699,3.1046,1.6007,0.9499
red_blue,0.915,1.3081,0.8109,0.854,0.0476,0.0,0.062,0.8363,3.4137,3.1458,1.6422,0.9893
green_blue,0.854,1.3691,0.8719,0.915,0.027,0.062,0.0,0.7755,3.3526,3.0852,1.5814,0.9292
gray_r,0.0833,2.144,1.6468,1.6901,0.7928,0.8363,0.7755,0.0,2.5787,2.3206,0.8285,0.2469
gray_g,2.5,4.7203,4.2231,4.2668,3.3699,3.4137,3.3526,2.5787,0.0,0.5791,1.8037,2.4575
gray_b,2.2402,4.4479,3.9531,3.9948,3.1046,3.1458,3.0852,2.3206,0.5791,0.0,1.5488,2.1646


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,4.4263,3.5267,3.5294,1.7849,1.7875,1.7421,0.1204,4.9178,3.411,1.3781,0.1845
red,4.4263,0.0,0.9004,0.8981,2.6415,2.639,2.6848,4.338,9.3398,7.828,5.8013,4.3597
green,3.5267,0.9004,0.0,0.0217,1.7421,1.7395,1.7849,3.4392,8.4406,6.9287,4.9017,3.4608
blue,3.5294,0.8981,0.0217,0.0,1.745,1.7421,1.7875,3.442,8.4431,6.9313,4.9043,3.4638
red_green,1.7849,2.6415,1.7421,1.745,0.0,0.0217,0.0616,1.6974,6.7002,5.1902,3.1607,1.7235
red_blue,1.7875,2.639,1.7395,1.7421,0.0217,0.0,0.0605,1.7002,6.7028,5.1929,3.1631,1.7266
green_blue,1.7421,2.6848,1.7849,1.7875,0.0616,0.0605,0.0,1.6568,6.6571,5.1465,3.1176,1.6805
gray_r,0.1204,4.338,3.4392,3.442,1.6974,1.7002,1.6568,0.0,5.0075,3.5032,1.4698,0.2014
gray_g,4.9178,9.3398,8.4406,8.4431,6.7002,6.7028,6.6571,5.0075,0.0,1.5961,3.5481,4.9938
gray_b,3.411,7.828,6.9287,6.9313,5.1902,5.1929,5.1465,3.5032,1.5961,0.0,2.0478,3.4716


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,4.362,3.3489,3.3153,1.663,1.6295,1.6859,0.1099,1.6065,0.8187,1.6781,0.3301
red,4.362,0.0,1.016,1.0499,2.6995,2.7333,2.6767,4.4403,5.9096,4.8742,6.0379,4.1036
green,3.3489,1.016,0.0,0.0359,1.6859,1.7195,1.663,3.4286,4.9028,3.8774,5.0256,3.0935
blue,3.3153,1.0499,0.0359,0.0,1.6523,1.6859,1.6295,3.395,4.87,3.846,4.992,3.0601
red_green,1.663,2.6995,1.6859,1.6523,0.0,0.0359,0.0262,1.7435,3.2296,2.2363,3.3401,1.4154
red_blue,1.6295,2.7333,1.7195,1.6859,0.0359,0.0,0.0593,1.7099,3.1973,2.2073,3.3065,1.3824
green_blue,1.6859,2.6767,1.663,1.6295,0.0262,0.0593,0.0,1.7665,3.2528,2.2574,3.3629,1.4382
gray_r,0.1099,4.4403,3.4286,3.395,1.7435,1.7099,1.7665,0.0,1.5297,0.7855,1.5985,0.3908
gray_g,1.6065,5.9096,4.9028,4.87,3.2296,3.1973,3.2528,1.5297,0.0,1.29,0.509,1.8611
gray_b,0.8187,4.8742,3.8774,3.846,2.2363,2.2073,2.2574,0.7855,1.29,0.0,1.3848,0.9841
