# Comparing channels

- R: sequential
- G: complementary
- B: no match at all

In [33]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [34]:
# Root folder of the generated results; the readers in this notebook look for
# files under "<tree_path>/<channel>/<dataset>/".
tree_path = "../../../data/trees"

# Image channels whose results are compared against each other.
channels = (
    "full",
    "gray_r",
    "gray_g",
    "gray_b",
    "gray_mean",
)

# Ortholog datasets evaluated for every channel.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
]

In [35]:
def read_and_compare(dataset, channel="full"):
    """Score every distance matrix of a dataset by its Euclidean distance to the control.

    Each ``*.csv`` file in ``{tree_path}/{channel}/{dataset}`` is loaded with
    its first column as the index and compared against
    ``Control with Clustal Omega.csv`` from the same folder.  The control file
    is part of the listing, so it is compared with itself and scores 0.0.

    The score is the square root of the summed squared element-wise
    differences of the two value arrays, rounded to 4 decimals.  The arrays
    are compared by position, not by label.
    NOTE(review): this assumes every CSV lists the same entries in the same
    order as the control -- confirm against the code that writes the files.

    Args:
        dataset: Name of the dataset sub-folder.
        channel: Name of the channel sub-folder; also used as the column name
            for the "Structural Similarity Index Measure" result.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``dataset`` (index name
        ``"dataset"``) with one column per CSV file, named after the file
        without its extension.  Columns follow sorted file-name order.

    Raises:
        FileNotFoundError: If the control CSV (or the folder) does not exist.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_values = control_df.values

    result_dict = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the column
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        except FileNotFoundError:
            # Best effort: the entry vanished after it was listed.
            continue
        basename = file.rpartition(".")[0]
        squared_error = numpy.sum((control_values - df.values) ** 2)
        result_dict[basename] = round(numpy.sqrt(squared_error), 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # The SSIM result is the one that depends on the channel, so label it with it.
    result_df = result_df.rename(columns={"Structural Similarity Index Measure": channel})
    result_df.index.name = "dataset"
    return result_df

## Euclidean Distance from distance matrices

In [36]:
# One frame per channel (rows = datasets), later placed side by side.
sum_dfs = []
for channel in channels:
    per_dataset = [read_and_compare(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,MultiScale Structural Similarity Index Measure,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,1.5868,0.9008,0.9085,4.8645,1.9025,0.0,0.8937,1.8161,1.8564,2.0453,1.1396
orthologs_myoglobin,2.0903,0.9482,0.9196,4.809,2.0037,0.0,0.9044,1.832,1.9685,2.2213,1.152
orthologs_neuroglobin,5.5957,3.1412,1.8043,6.0855,5.9109,0.0,1.1514,5.8736,5.9064,5.9644,1.7182
orthologs_cytoglobin,10.5236,6.0878,6.3417,10.1226,11.117,0.0,3.1095,11.0819,11.1145,11.1551,3.2222


In [37]:
def read_and_linear_correlate(dataset, channel="full"):
    """Correlate every distance matrix of a dataset with the control matrix.

    Each ``*.csv`` file in ``{tree_path}/{channel}/{dataset}`` is loaded with
    its first column as the index, flattened, and correlated with the
    flattened ``Control with Clustal Omega.csv`` from the same folder.  The
    control file is part of the listing, so it is correlated with itself.

    Despite the function name, the statistic is the Spearman rank correlation
    coefficient (``scipy.stats.spearmanr``), rounded to 4 decimals.  The
    matrices are paired by position, not by label.
    NOTE(review): this assumes every CSV lists the same entries in the same
    order as the control -- confirm against the code that writes the files.

    Args:
        dataset: Name of the dataset sub-folder.
        channel: Name of the channel sub-folder; also used as the column name
            for the "Structural Similarity Index Measure" result.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``dataset`` (index name
        ``"dataset"``) with one column per CSV file, named after the file
        without its extension.  Columns follow sorted file-name order.

    Raises:
        FileNotFoundError: If the control CSV (or the folder) does not exist.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_flat = control_df.values.flatten()

    result_dict = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the column
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        except FileNotFoundError:
            # Best effort: the entry vanished after it was listed.
            continue
        basename = file.rpartition(".")[0]
        # Element 0 of the spearmanr result is the correlation coefficient.
        coefficient = spearmanr(control_flat, df.values.flatten())[0]
        result_dict[basename] = round(coefficient, 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # The SSIM result is the one that depends on the channel, so label it with it.
    result_df = result_df.rename(columns={"Structural Similarity Index Measure": channel})
    result_df.index.name = "dataset"
    return result_df

## Spearman rank correlation from distance matrices

In [38]:
# One frame per channel (rows = datasets), later placed side by side.
sum_dfs = []
for channel in channels:
    per_dataset = [read_and_linear_correlate(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)



index,MultiScale Structural Similarity Index Measure,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,0.9844,0.9984,0.9999,0.2257,0.9981,1.0,0.9998,0.9977,0.9987,0.9928,
orthologs_myoglobin,0.9688,0.9988,0.9999,0.3322,0.9985,1.0,0.9998,0.9924,0.9985,0.9923,
orthologs_neuroglobin,0.9747,0.9966,0.9999,0.1127,0.99,1.0,0.9992,0.98,0.989,0.9891,
orthologs_cytoglobin,0.2268,0.4723,0.7775,0.2521,0.1597,1.0,0.8623,0.1901,0.1764,0.1726,


In [39]:
def read_and_correlate(dataset, channel="full"):
    """Compute the Predictive Power Score of the control matrix for every matrix of a dataset.

    Each ``*.csv`` file in ``{tree_path}/{channel}/{dataset}`` is loaded with
    its first column as the index and flattened.  The flattened
    ``Control with Clustal Omega.csv`` values form column ``"x"`` and the
    file's values form column ``"y"``; the ``"ppscore"`` entry returned by
    ``ppscore.score(frame, "x", "y")`` is rounded to 4 decimals.  The control
    file is part of the listing, so it is scored against itself.

    The matrices are paired by position, not by label.
    NOTE(review): this assumes every CSV lists the same entries in the same
    order as the control -- confirm against the code that writes the files.

    Args:
        dataset: Name of the dataset sub-folder.
        channel: Name of the channel sub-folder; also used as the column name
            for the "Structural Similarity Index Measure" result.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``dataset`` (index name
        ``"dataset"``) with one column per CSV file, named after the file
        without its extension.  Columns follow sorted file-name order.

    Raises:
        FileNotFoundError: If the control CSV (or the folder) does not exist.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_flat = control_df.values.flatten()

    result_dict = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the column
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        except FileNotFoundError:
            # Best effort: the entry vanished after it was listed.
            continue
        basename = file.rpartition(".")[0]
        pairs = pandas.DataFrame({"x": control_flat, "y": df.values.flatten()})
        result_dict[basename] = round(ppscore.score(pairs, "x", "y")["ppscore"], 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # The SSIM result is the one that depends on the channel, so label it with it.
    result_df = result_df.rename(columns={"Structural Similarity Index Measure": channel})
    result_df.index.name = "dataset"
    return result_df

## Predictive Power Score from distance matrices

In [40]:
# One frame per channel (rows = datasets), later placed side by side.
sum_dfs = []
for channel in channels:
    per_dataset = [read_and_correlate(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,MultiScale Structural Similarity Index Measure,Universal Quality Index,Global with Needleman-Wunsch,Deep Search with Annoy,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,0.9216,0.9833,0.9953,0.431,0.9794,0.9968,0.9944,0.9836,0.9848,0.9501,0.0
orthologs_myoglobin,0.9265,0.982,0.9954,0.351,0.9784,0.9968,0.9925,0.9648,0.9729,0.9306,0.0
orthologs_neuroglobin,0.9573,0.9701,0.9876,0.293,0.969,0.9964,0.9897,0.9661,0.9672,0.9618,0.0
orthologs_cytoglobin,0.7672,0.7955,0.8524,0.6626,0.797,0.9913,0.8584,0.7964,0.7974,0.7962,0.0


In [41]:
def read_and_tree_compare(dataset, channel="full", control="Control with Clustal Omega"):
    """Compare every tree of a dataset with a reference tree.

    Each ``*.nw`` file in ``{tree_path}/{channel}/{dataset}`` is parsed as an
    ete3 ``Tree`` (``format=1``) and passed to
    ``control_tree.compare(tree, unrooted=True)``; the ``"norm_rf"`` entry of
    the result is stored, rounded to 4 decimals.  The reference file is part
    of the listing, so it is compared with itself.

    Args:
        dataset: Name of the dataset sub-folder.
        channel: Name of the channel sub-folder; also used as the column name
            for the "Structural Similarity Index Measure" result.
        control: File name (without ``.nw``) of the tree used as reference.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``dataset`` (index name
        ``"dataset"``) with one column per tree file, named after the file
        without its extension.  Columns follow sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/{control}.nw", format=1)

    result_dict = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the column
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        try:
            tree = Tree(f"{path}/{file}", format=1)
        except FileNotFoundError:
            # Best effort: the entry vanished after it was listed.
            continue
        basename = file.rpartition(".")[0]
        result = control_tree.compare(tree, unrooted=True)
        result_dict[basename] = round(result["norm_rf"], 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # The SSIM result is the one that depends on the channel, so label it with it.
    result_df = result_df.rename(columns={"Structural Similarity Index Measure": channel})
    result_df.index.name = "dataset"
    return result_df

## Robinson–Foulds distance from generated trees

In [42]:
# One frame per channel (rows = datasets), later placed side by side.
sum_dfs = []
for channel in channels:
    per_dataset = [read_and_tree_compare(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,Deep Search with Annoy,full,MultiScale Structural Similarity Index Measure,Control with Clustal Omega,Universal Quality Index,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,1.0
orthologs_myoglobin,0.0,0.0,0.9167,0.0,0.0,0.0,0.0,0.0833,0.0833,0.1667,1.0
orthologs_neuroglobin,0.0,0.0,0.9167,0.0833,0.1667,0.0,0.0833,0.0833,0.0833,0.0833,1.0
orthologs_cytoglobin,0.6667,0.5,1.0,0.9167,0.9167,0.0,0.9167,0.9167,0.9167,0.8333,1.0


In [43]:
# Same comparison as before, but with the global-alignment tree as reference.
sum_dfs = []
for channel in channels:
    per_dataset = [
        read_and_tree_compare(dataset, channel, "Global with Needleman-Wunsch")
        for dataset in datasets
    ]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,Deep Search with Annoy,full,MultiScale Structural Similarity Index Measure,Control with Clustal Omega,Universal Quality Index,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,1.0
orthologs_myoglobin,0.0,0.0,0.9167,0.0,0.0,0.0,0.0,0.0833,0.0833,0.1667,1.0
orthologs_neuroglobin,0.0,0.0,0.9167,0.0833,0.1667,0.0,0.0833,0.0833,0.0833,0.0833,1.0
orthologs_cytoglobin,0.0,0.25,1.0,0.6667,0.6667,0.6667,0.5,0.6667,0.6667,0.6667,1.0


In [44]:
# Same comparison as before, but with the local-alignment tree as reference.
sum_dfs = []
for channel in channels:
    per_dataset = [
        read_and_tree_compare(dataset, channel, "Local with Smith–Waterman")
        for dataset in datasets
    ]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,Deep Search with Annoy,full,MultiScale Structural Similarity Index Measure,Control with Clustal Omega,Universal Quality Index,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,1.0
orthologs_myoglobin,0.0,0.0,0.9167,0.0,0.0,0.0,0.0,0.0833,0.0833,0.1667,1.0
orthologs_neuroglobin,0.0,0.0,0.9167,0.0833,0.1667,0.0,0.0833,0.0833,0.0833,0.0833,1.0
orthologs_cytoglobin,0.25,0.0,1.0,0.75,0.75,0.5,0.6667,0.75,0.75,0.6667,1.0


In [45]:
def read_and_tree_compare_branches(dataset, channel="full"):
    """Score every tree of a dataset by the share of its edges missing from the control tree.

    Each ``*.nw`` file in ``{tree_path}/{channel}/{dataset}`` is parsed as an
    ete3 ``Tree`` (``format=1``) and passed to
    ``control_tree.compare(tree, unrooted=True)``, where the control tree is
    ``Control with Clustal Omega.nw`` from the same folder.  The stored score
    is ``1.0 - result["source_edges_in_ref"]``, rounded to 4 decimals.  The
    control file is part of the listing, so it is compared with itself.

    Args:
        dataset: Name of the dataset sub-folder.
        channel: Name of the channel sub-folder; also used as the column name
            for the "Structural Similarity Index Measure" result.

    Returns:
        A one-row ``pandas.DataFrame`` indexed by ``dataset`` (index name
        ``"dataset"``) with one column per tree file, named after the file
        without its extension.  Columns follow sorted file-name order.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/Control with Clustal Omega.nw", format=1)

    result_dict = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the column
    # order of the result reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        try:
            tree = Tree(f"{path}/{file}", format=1)
        except FileNotFoundError:
            # Best effort: the entry vanished after it was listed.
            continue
        basename = file.rpartition(".")[0]
        result = control_tree.compare(tree, unrooted=True)
        # Round after subtracting: rounding first and then computing 1.0 - x
        # can bring binary floating-point noise back into the stored value.
        result_dict[basename] = round(1.0 - result["source_edges_in_ref"], 4)

    result_df = pandas.DataFrame(result_dict, index=[dataset])
    # The SSIM result is the one that depends on the channel, so label it with it.
    result_df = result_df.rename(columns={"Structural Similarity Index Measure": channel})
    result_df.index.name = "dataset"
    return result_df

## Compatibility branch score from generated Trees

In [46]:
# One frame per channel (rows = datasets), later placed side by side.
sum_dfs = []
for channel in channels:
    per_dataset = [read_and_tree_compare_branches(dataset, channel) for dataset in datasets]
    sum_dfs.append(pandas.concat(per_dataset))

# Column names repeat across channels; transposing lets drop_duplicates()
# keep only the first row per name before the table is flipped back.
# NOTE(review): repeated columns are kept from the first channel only --
# confirm that discarding the later copies is intended.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,Deep Search with Annoy,full,MultiScale Structural Similarity Index Measure,Control with Clustal Omega,Universal Quality Index,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.4615,0.0,0.0,0.0,0.0,0.0,0.0,0.0769,0.4615
orthologs_myoglobin,0.0,0.0,0.4231,0.0,0.0,0.0,0.0,0.0385,0.0385,0.0769,0.4615
orthologs_neuroglobin,0.0,0.0,0.4231,0.0385,0.0769,0.0,0.0385,0.0385,0.0385,0.0385,0.4615
orthologs_cytoglobin,0.3077,0.2308,0.4615,0.4231,0.4231,0.0,0.4231,0.4231,0.4231,0.3846,0.4615


In [47]:
def read_and_compare_channels(dataset):
    """Build a styled channel-by-channel Euclidean distance table for one dataset.

    For every entry of ``channels`` the file
    ``{tree_path}/{channel}/{dataset}/Structural Similarity Index Measure.csv``
    is loaded with its first column as the index.  Every pair of channels is
    then scored with the square root of the summed squared element-wise
    differences of their value arrays, rounded to 4 decimals.

    Args:
        dataset: Name of the dataset sub-folder.

    Returns:
        A pandas ``Styler`` over the square table (rows and columns are the
        channels, index name ``"dataset"``), shaded with the ``"YlGnBu"``
        colour map scaled from the table's smallest to its largest value.

    Raises:
        FileNotFoundError: If the SSIM CSV of any channel is missing.
    """
    # Load all channel matrices first so a missing file fails before any scoring.
    matrices = {
        name: pandas.read_csv(
            f"{tree_path}/{name}/{dataset}/Structural Similarity Index Measure.csv",
            index_col=0,
        )
        for name in channels
    }

    distances = {
        row_name: {
            col_name: round(
                numpy.sqrt(numpy.sum((row_df.values - col_df.values) ** 2)), 4
            )
            for col_name, col_df in matrices.items()
        }
        for row_name, row_df in matrices.items()
    }

    table = pandas.DataFrame.from_dict(distances, orient='index')
    table.index.name = "dataset"

    # One colour scale for the whole table (axis=None), not one per row or column.
    lowest = table.min().min()
    highest = table.max().max()
    return table.style.background_gradient(
        axis=None, vmin=lowest, vmax=highest, cmap="YlGnBu"
    )

## Euclidean distance between channels, from SSIM distance matrices

In [48]:
# Render one titled, colour-shaded channel comparison table per dataset.
for dataset in datasets:
    try:
        correlations = read_and_compare_channels(dataset)
    except FileNotFoundError:
        # Datasets with a missing input file are skipped without a heading.
        continue
    display(HTML(f"<center><h3>{dataset}</h3></center>"))
    display(correlations)

Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.1121,0.0873,0.1973,3.0392
gray_r,0.1121,0.0,0.0486,0.3082,2.9542
gray_g,0.0873,0.0486,0.0,0.2828,2.9946
gray_b,0.1973,0.3082,0.2828,0.0,3.1765
gray_mean,3.0392,2.9542,2.9946,3.1765,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.2009,0.0815,0.2532,3.1545
gray_r,0.2009,0.0,0.1759,0.4497,2.9823
gray_g,0.0815,0.1759,0.0,0.3166,3.1187
gray_b,0.2532,0.4497,0.3166,0.0,3.3704
gray_mean,3.1545,2.9823,3.1187,3.3704,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.1634,0.1377,0.3012,7.4716
gray_r,0.1634,0.0,0.0445,0.4639,7.4271
gray_g,0.1377,0.0445,0.0,0.4381,7.4617
gray_b,0.3012,0.4639,0.4381,0.0,7.5352
gray_mean,7.4716,7.4271,7.4617,7.5352,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.0548,0.0398,0.0829,13.9182
gray_r,0.0548,0.0,0.0486,0.1347,13.8826
gray_g,0.0398,0.0486,0.0,0.1178,13.9153
gray_b,0.0829,0.1347,0.1178,0.0,13.957
gray_mean,13.9182,13.8826,13.9153,13.957,0.0
