In [70]:
import pandas
import os
import numpy
from minepy import MINE
from IPython.core.display import display, HTML

In [121]:
def read_and_correlate(dataset, channel="full"):
    """Score every distance-matrix CSV of a dataset against the Clustal Omega control.

    Each ``*.csv`` found in ``../data/trees/{channel}/{dataset}`` is flattened
    and compared with the flattened ``Control with Clustal Omega.csv`` of the
    same directory. The stored score is ``1 - MIC`` (``MINE.mic()``), rounded
    to four decimals. The control file itself is part of the directory
    listing, so it is scored against itself as well.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory; also used as the label of the single
        result row.
    channel : str, default "full"
        Name of the channel directory. The column produced by
        ``MultiScale Structural Similarity Index Measure.csv`` is renamed to
        this value.

    Returns
    -------
    pandas.DataFrame
        One row whose index is named ``"dataset"``, with one column per CSV
        file, ordered alphabetically by file name.
    """
    csv_suffix = ".csv"
    path = f"../data/trees/{channel}/{dataset}"
    # The control vector is the same for every comparison, so flatten it once.
    control_values = pandas.read_csv(
        f"{path}/Control with Clustal Omega.csv", index_col=0
    ).values.flatten()

    corr = MINE()
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the column
    # order of the result reproducible across machines and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(csv_suffix):
            continue
        basename = file[: -len(csv_suffix)]
        df = pandas.read_csv(f"{path}/{file}", index_col=0)
        # Values are compared positionally; row/column labels are not aligned.
        corr.compute_score(control_values, df.values.flatten())
        result_dict[basename] = round(1 - corr.mic(), 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # Non-mutating rename keeps the function free of in-place side effects.
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    return result_df.rename_axis("dataset")

In [122]:
# Channel variants and ortholog datasets covered by the MIC summary table.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

# One frame per channel: the per-dataset rows stacked on top of each other.
sum_dfs = []
for channel in channels:
    dfs = [read_and_correlate(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(dfs))

# Place the channel frames side by side. Transposing lets drop_duplicates()
# discard repeated columns; it compares column values, not column names.
(
    pandas.concat(sum_dfs, axis=1)
    .T
    .drop_duplicates()
    .T
)

Unnamed: 0_level_0,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
orthologs_neuroglobin,0.0004,0.0004,0.0004,0.0007,0.0007,0.0007,0.0004,0.0116,0.0004,0.0004,0.0007,0.0004,0.0004,0.0007,0.0004
orthologs_cytoglobin,0.0,0.0834,0.1259,0.4456,0.4088,0.332,0.4208,0.3754,0.4484,0.431,0.4088,0.4223,0.5115,0.4526,0.497
orthologs_androglobin,0.0,0.0537,0.0,0.2312,0.3447,0.3843,0.3074,0.322,0.3203,0.351,0.3003,0.1864,0.3457,0.4067,0.2662


In [34]:
def read_and_compare(dataset, channel="full"):
    """Measure the Euclidean distance of every CSV matrix to the Clustal Omega control.

    Each ``*.csv`` found in ``../data/trees/{channel}/{dataset}`` is compared
    with ``Control with Clustal Omega.csv`` from the same directory. The score
    is ``sqrt(sum((control - candidate) ** 2))`` over all cells, rounded to
    four decimals. The control file is part of the directory listing, so it
    is also compared with itself (distance ``0.0``).

    Parameters
    ----------
    dataset : str
        Name of the dataset directory; also used as the label of the single
        result row.
    channel : str, default "full"
        Name of the channel directory. The column produced by
        ``MultiScale Structural Similarity Index Measure.csv`` is renamed to
        this value.

    Returns
    -------
    pandas.DataFrame
        One row whose index is named ``"dataset"``, with one column per CSV
        file, ordered alphabetically by file name.
    """
    csv_suffix = ".csv"
    path = f"../data/trees/{channel}/{dataset}"
    # The control matrix is identical for every comparison, so load it once.
    control_values = pandas.read_csv(
        f"{path}/Control with Clustal Omega.csv", index_col=0
    ).values

    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting makes the column
    # order of the result reproducible across machines and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(csv_suffix):
            continue
        basename = file[: -len(csv_suffix)]
        df = pandas.read_csv(f"{path}/{file}", index_col=0)
        # Cells are compared positionally; row/column labels are not aligned.
        squared_error = (control_values - df.values) ** 2
        result_dict[basename] = round(numpy.sqrt(numpy.sum(squared_error)), 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # Non-mutating rename keeps the function free of in-place side effects.
    result_df = result_df.rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    return result_df.rename_axis("dataset")

In [36]:
# Channel variants and ortholog datasets covered by the distance summary table.
channels = (
    "full",
    "red", "green", "blue",
    "red_green", "red_blue", "green_blue",
    "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

# One frame per channel: the per-dataset rows stacked on top of each other.
sum_dfs = []
for channel in channels:
    dfs = [read_and_compare(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(dfs))

# Place the channel frames side by side. Transposing lets drop_duplicates()
# discard repeated columns; it compares column values, not column names.
(
    pandas.concat(sum_dfs, axis=1)
    .T
    .drop_duplicates()
    .T
)

Unnamed: 0_level_0,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,0.1542,0.8342,0.7896,0.8,0.4849,0.4957,0.4511,0.2308,1.0531,0.8252,0.3888,0.0754
orthologs_myoglobin,0.0,0.9196,0.9044,0.0954,0.8046,0.7975,0.7811,0.451,0.4347,0.4279,0.12,1.0186,1.106,0.5494,0.1964
orthologs_neuroglobin,0.0,1.8043,1.1514,1.2176,0.9789,0.9323,0.9554,0.6618,0.6385,0.652,1.1599,3.6185,3.2697,1.8501,1.1943
orthologs_cytoglobin,0.0,6.3417,3.1095,2.7743,1.8149,1.7804,1.7816,1.5191,1.5174,1.5304,2.7064,7.4543,5.9601,4.0279,2.7103
orthologs_androglobin,0.0,8.2855,2.9601,4.0221,1.0736,1.0347,1.0623,2.4228,2.4578,2.4023,4.1007,5.5073,4.4642,5.6728,3.7744


In [37]:
# Full image plus the three single colour channels.
channels = (
    "full",
    "red",
    "green",
    "blue",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

# One frame per channel: the per-dataset rows stacked on top of each other.
sum_dfs = []
for channel in channels:
    dfs = [read_and_compare(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(dfs))

# Place the channel frames side by side. Transposing lets drop_duplicates()
# discard repeated columns; it compares column values, not column names.
(
    pandas.concat(sum_dfs, axis=1)
    .T
    .drop_duplicates()
    .T
)

Unnamed: 0_level_0,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red,green,blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,0.1542,0.8342,0.7896,0.8
orthologs_myoglobin,0.0,0.9196,0.9044,0.0954,0.8046,0.7975,0.7811
orthologs_neuroglobin,0.0,1.8043,1.1514,1.2176,0.9789,0.9323,0.9554
orthologs_cytoglobin,0.0,6.3417,3.1095,2.7743,1.8149,1.7804,1.7816
orthologs_androglobin,0.0,8.2855,2.9601,4.0221,1.0736,1.0347,1.0623


In [38]:
# Full image plus the grayscale variants.
channels = (
    "full",
    "gray_r",
    "gray_g",
    "gray_b",
    "gray_max",
    "gray_mean",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

# One frame per channel: the per-dataset rows stacked on top of each other.
sum_dfs = []
for channel in channels:
    dfs = [read_and_compare(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(dfs))

# Place the channel frames side by side. Transposing lets drop_duplicates()
# discard repeated columns; it compares column values, not column names.
(
    pandas.concat(sum_dfs, axis=1)
    .T
    .drop_duplicates()
    .T
)

Unnamed: 0_level_0,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,0.1542,0.2308,1.0531,0.8252,0.3888,0.0754
orthologs_myoglobin,0.0,0.9196,0.9044,0.0954,0.12,1.0186,1.106,0.5494,0.1964
orthologs_neuroglobin,0.0,1.8043,1.1514,1.2176,1.1599,3.6185,3.2697,1.8501,1.1943
orthologs_cytoglobin,0.0,6.3417,3.1095,2.7743,2.7064,7.4543,5.9601,4.0279,2.7103
orthologs_androglobin,0.0,8.2855,2.9601,4.0221,4.1007,5.5073,4.4642,5.6728,3.7744


In [39]:
# Full image plus the two-channel combinations.
channels = (
    "full",
    "red_green",
    "red_blue",
    "green_blue",
)
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
]

# One frame per channel: the per-dataset rows stacked on top of each other.
sum_dfs = []
for channel in channels:
    dfs = [read_and_compare(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(dfs))

# Place the channel frames side by side. Transposing lets drop_duplicates()
# discard repeated columns; it compares column values, not column names.
(
    pandas.concat(sum_dfs, axis=1)
    .T
    .drop_duplicates()
    .T
)

Unnamed: 0_level_0,Control with Clustal Omega,Global with Needleman-Wunsch,Local with Smith–Waterman,full,red_green,red_blue,green_blue
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
orthologs_hemoglobin_beta,0.0,0.9085,0.8937,0.1542,0.4849,0.4957,0.4511
orthologs_myoglobin,0.0,0.9196,0.9044,0.0954,0.451,0.4347,0.4279
orthologs_neuroglobin,0.0,1.8043,1.1514,1.2176,0.6618,0.6385,0.652
orthologs_cytoglobin,0.0,6.3417,3.1095,2.7743,1.5191,1.5174,1.5304
orthologs_androglobin,0.0,8.2855,2.9601,4.0221,2.4228,2.4578,2.4023


In [189]:
def read_and_compare_channels(dataset):
    result_dict = {}
    dfs = {}
    channels = ("full", "red", "green", "blue", "red_green", "red_blue", "green_blue", "gray_r", "gray_g", "gray_b", "gray_max", "gray_mean")
    for channel in channels:
        result_dict[channel] = {}
        path = f"../data/trees/{channel}/{dataset}"
        dfs[channel] = pandas.read_csv(f"{path}/MultiScale Structural Similarity Index Measure.csv", index_col=0)
    for channel in channels:
        control_df = dfs[channel]
        for c, df in dfs.items():
            result_dict[channel][c] = round(numpy.sqrt(numpy.sum((control_df.values - df.values)**2)), 4)
    result_df = pandas.DataFrame.from_dict(result_dict, orient='index')
    result_df.index.name = "dataset"
    return result_df.style.background_gradient(axis=None, vmin=result_df.min().min(), vmax=result_df.max().max(), cmap="YlGnBu")

In [190]:
# Render, for each dataset, a centred heading followed by its styled
# channel-vs-channel distance table.
for dataset in datasets:
    heading = HTML(f"<center><h3>{dataset}</h3></center>")
    channel_table = read_and_compare_channels(dataset)
    display(heading, channel_table)

Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,0.6914,0.6469,0.6569,0.3408,0.3508,0.3064,0.0848,1.1911,0.9543,0.5197,0.1235
red,0.6914,0.0,0.0479,0.0384,0.3508,0.3408,0.3857,0.6127,1.8816,1.6373,1.2101,0.8115
green,0.6469,0.0479,0.0,0.0188,0.3064,0.2968,0.3408,0.569,1.8367,1.5926,1.1656,0.767
blue,0.6569,0.0384,0.0188,0.0,0.3168,0.3064,0.3508,0.579,1.8472,1.6018,1.1757,0.777
red_green,0.3408,0.3508,0.3064,0.3168,0.0,0.0188,0.0384,0.2631,1.5309,1.2903,0.8597,0.4613
red_blue,0.3508,0.3408,0.2968,0.3064,0.0188,0.0,0.0479,0.2729,1.5415,1.2992,0.8698,0.4714
green_blue,0.3064,0.3857,0.3408,0.3508,0.0384,0.0479,0.0,0.2309,1.4966,1.2546,0.8253,0.4268
gray_r,0.0848,0.6127,0.569,0.579,0.2631,0.2729,0.2309,0.0,1.2708,1.0351,0.5995,0.204
gray_g,1.1911,1.8816,1.8367,1.8472,1.5309,1.5415,1.4966,1.2708,0.0,0.3399,0.6755,1.0724
gray_b,0.9543,1.6373,1.5926,1.6018,1.2903,1.2992,1.2546,1.0351,0.3399,0.0,0.4597,0.8348


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,0.7273,0.7197,0.7035,0.3718,0.3558,0.348,0.0418,1.088,1.1766,0.6229,0.2597
red,0.7273,0.0,0.0237,0.0267,0.3558,0.3718,0.3798,0.696,1.8089,1.8984,1.3483,0.9799
green,0.7197,0.0237,0.0,0.0285,0.348,0.3653,0.3718,0.6893,1.7994,1.8905,1.3406,0.9724
blue,0.7035,0.0267,0.0285,0.0,0.3323,0.348,0.3558,0.6725,1.785,1.8744,1.3247,0.9564
red_green,0.3718,0.3558,0.348,0.3323,0.0,0.0285,0.0267,0.3416,1.4546,1.545,0.9932,0.6257
red_blue,0.3558,0.3718,0.3653,0.348,0.0285,0.0,0.0237,0.3246,1.4408,1.5289,0.9774,0.6101
green_blue,0.348,0.3798,0.3718,0.3558,0.0267,0.0237,0.0,0.3185,1.4307,1.5208,0.9696,0.6023
gray_r,0.0418,0.696,0.6893,0.6725,0.3416,0.3246,0.3185,0.0,1.1229,1.2092,0.6548,0.2909
gray_g,1.088,1.8089,1.7994,1.785,1.4546,1.4408,1.4307,1.1229,0.0,0.2725,0.4945,0.8468
gray_b,1.1766,1.8984,1.8905,1.8744,1.545,1.5289,1.5208,1.2092,0.2725,0.0,0.5723,0.9278


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,1.7868,1.7258,1.769,0.8719,0.915,0.854,0.0833,2.4999,2.2401,0.7488,0.1966
red,1.7868,0.0,0.062,0.027,0.915,0.8719,0.933,1.7077,4.2842,4.0139,2.5086,1.8541
green,1.7258,0.062,0.0,0.0476,0.854,0.8109,0.8719,1.6468,4.2231,3.9531,2.4476,1.7933
blue,1.769,0.027,0.0476,0.0,0.8975,0.854,0.915,1.6901,4.2668,3.9948,2.4899,1.8347
red_green,0.8719,0.915,0.854,0.8975,0.0,0.0476,0.027,0.7928,3.3699,3.1046,1.6006,0.9499
red_blue,0.915,0.8719,0.8109,0.854,0.0476,0.0,0.062,0.8363,3.4137,3.1458,1.6422,0.9893
green_blue,0.854,0.933,0.8719,0.915,0.027,0.062,0.0,0.7755,3.3525,3.0852,1.5814,0.9292
gray_r,0.0833,1.7077,1.6468,1.6901,0.7928,0.8363,0.7755,0.0,2.5787,2.3206,0.8286,0.2469
gray_g,2.4999,4.2842,4.2231,4.2668,3.3699,3.4137,3.3525,2.5787,0.0,0.5791,1.8037,2.4576
gray_b,2.2401,4.0139,3.9531,3.9948,3.1046,3.1458,3.0852,2.3206,0.5791,0.0,1.5488,2.1646


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,3.5723,3.5267,3.5294,1.7849,1.7875,1.7421,0.1204,4.9177,3.411,1.3781,0.1845
red,3.5723,0.0,0.0605,0.0616,1.7875,1.7849,1.8311,3.4839,8.4862,6.9749,4.9474,3.5068
green,3.5267,0.0605,0.0,0.0217,1.7421,1.7395,1.7849,3.4392,8.4406,6.9287,4.9017,3.4608
blue,3.5294,0.0616,0.0217,0.0,1.745,1.7421,1.7875,3.442,8.4431,6.9313,4.9043,3.4638
red_green,1.7849,1.7875,1.7421,1.745,0.0,0.0217,0.0616,1.6974,6.7002,5.1902,3.1607,1.7234
red_blue,1.7875,1.7849,1.7395,1.7421,0.0217,0.0,0.0605,1.7002,6.7028,5.1929,3.1631,1.7266
green_blue,1.7421,1.8311,1.7849,1.7875,0.0616,0.0605,0.0,1.6568,6.6571,5.1465,3.1176,1.6805
gray_r,0.1204,3.4839,3.4392,3.442,1.6974,1.7002,1.6568,0.0,5.0075,3.5032,1.4698,0.2013
gray_g,4.9177,8.4862,8.4406,8.4431,6.7002,6.7028,6.6571,5.0075,0.0,1.5961,3.5481,4.9938
gray_b,3.411,6.9749,6.9287,6.9313,5.1902,5.1929,5.1465,3.5032,1.5961,0.0,2.0478,3.4716


Unnamed: 0_level_0,full,red,green,blue,red_green,red_blue,green_blue,gray_r,gray_g,gray_b,gray_max,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
full,0.0,3.2925,3.3489,3.3153,1.663,1.6295,1.6859,0.0843,1.6081,0.8165,1.6769,0.3265
red,3.2925,0.0,0.0593,0.0262,1.6295,1.663,1.6067,3.3717,4.8498,3.8241,4.9689,3.0332
green,3.3489,0.0593,0.0,0.0359,1.6859,1.7195,1.663,3.4282,4.9058,3.8775,5.0253,3.0895
blue,3.3153,0.0262,0.0359,0.0,1.6523,1.6859,1.6295,3.3946,4.873,3.8461,4.9917,3.0561
red_green,1.663,1.6295,1.6859,1.6523,0.0,0.0359,0.0262,1.7424,3.2322,2.2361,3.3396,1.4106
red_blue,1.6295,1.663,1.7195,1.6859,0.0359,0.0,0.0593,1.7087,3.2,2.207,3.306,1.3776
green_blue,1.6859,1.6067,1.663,1.6295,0.0262,0.0593,0.0,1.7654,3.2554,2.2572,3.3624,1.4334
gray_r,0.0843,3.3717,3.4282,3.3946,1.7424,1.7087,1.7654,0.0,1.5324,0.7865,1.5981,0.3945
gray_g,1.6081,4.8498,4.9058,4.873,3.2322,3.2,3.2554,1.5324,0.0,1.2922,0.506,1.8678
gray_b,0.8165,3.8241,3.8775,3.8461,2.2361,2.207,2.2572,0.7865,1.2922,0.0,1.3844,0.989
