# Comparing channels

- R: sequential
- G: complementary
- B: not match at all

In [17]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [18]:
tree_path = "../../../data/trees"
channels = ("full", "gray_r", "gray_g", "gray_b", "gray_mean")
datasets = ["orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin"]

In [19]:
def read_and_compare(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            basename = ".".join(file.split(".")[0:-1])
            try:
                df = pandas.read_csv(f"{path}/{file}", index_col=0)
                result_dict[basename] = round(numpy.sqrt(numpy.sum((control_df.values - df.values)**2)), 4)
            except FileNotFoundError:
                pass
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Euclidean Distance from distance matrices

In [20]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T

index,Global with Needleman-Wunsch,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9085,1.8686,0.0,0.8937,1.8103,1.8433,1.9548,1.1396
orthologs_myoglobin,0.9196,1.9899,0.0,0.9044,1.8112,1.9487,2.214,1.152
orthologs_neuroglobin,1.8043,5.8999,0.0,1.1514,5.8504,5.8717,5.9314,1.7182
orthologs_cytoglobin,6.3417,6.5979,0.0,3.1095,6.6067,6.6341,6.5003,3.2222


In [21]:
def read_and_linear_correlate(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            basename = ".".join(file.split(".")[0:-1])
            try:
                df = pandas.read_csv(f"{path}/{file}", index_col=0)
                result_dict[basename] = round(spearmanr(control_df.values.flatten(), df.values.flatten())[0], 4)
            except FileNotFoundError:
                pass
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Linear Correlation from distance matrices

In [22]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_linear_correlate(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T



index,Global with Needleman-Wunsch,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9999,0.9978,1.0,0.9998,0.9976,0.9984,0.9919,
orthologs_myoglobin,0.9999,0.9972,1.0,0.9998,0.995,0.9977,0.9922,
orthologs_neuroglobin,0.9999,0.9858,1.0,0.9992,0.9868,0.9935,0.9802,
orthologs_cytoglobin,0.7775,0.4569,1.0,0.8623,0.4521,0.4544,0.4514,


In [23]:
def read_and_correlate(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    for file in os.listdir(path):
        if file.endswith(".csv"):
            basename = ".".join(file.split(".")[0:-1])
            try:
                df = pandas.read_csv(f"{path}/{file}", index_col=0)
                result_dict[basename] = round(ppscore.score(
                    pandas.DataFrame({"x": control_df.values.flatten(), "y": df.values.flatten()}), "x", "y")["ppscore"], 4)
            except FileNotFoundError:
                pass
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Predictive Power Score from distance matrices

In [24]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_correlate(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T

index,Global with Needleman-Wunsch,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9953,0.9792,0.9968,0.9944,0.9801,0.9824,0.9484,0.0
orthologs_myoglobin,0.9954,0.9699,0.9968,0.9925,0.971,0.9715,0.921,0.0
orthologs_neuroglobin,0.9876,0.9681,0.9964,0.9897,0.9649,0.9659,0.9622,0.0
orthologs_cytoglobin,0.8524,0.7308,0.9913,0.8584,0.743,0.7417,0.7133,0.0


In [25]:
def read_and_tree_compare(dataset, channel="full", control="Control with Clustal Omega"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/{control}.nw", format=1)
    for file in os.listdir(path):
        if file.endswith(".nw"):
            basename = ".".join(file.split(".")[0:-1])
            try:
                tree = Tree(f"{path}/{file}", format=1)
                result = control_tree.compare(tree, unrooted=True)
                result_dict[basename] = round(result["norm_rf"], 4)
            except FileNotFoundError:
                pass
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Robinson-foulds distance from generated Trees

In [26]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,1.0
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0833,0.0833,0.1667,1.0
orthologs_neuroglobin,0.0,0.0,0.0833,0.0,0.0833,0.0833,0.0833,1.0
orthologs_cytoglobin,0.6667,0.5,0.5833,0.0,0.5833,0.5833,0.5833,1.0


In [27]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare(dataset, channel, "Global with Needleman-Wunsch"))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,1.0
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0833,0.0833,0.1667,1.0
orthologs_neuroglobin,0.0,0.0,0.0833,0.0,0.0833,0.0833,0.0833,1.0
orthologs_cytoglobin,0.0,0.25,0.6667,0.6667,0.6667,0.6667,0.6667,1.0


In [28]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare(dataset, channel, "Local with Smith–Waterman"))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.1667,1.0
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0833,0.0833,0.1667,1.0
orthologs_neuroglobin,0.0,0.0,0.0833,0.0,0.0833,0.0833,0.0833,1.0
orthologs_cytoglobin,0.25,0.0,0.5,0.5,0.5,0.5,0.5,1.0


In [29]:
def read_and_tree_compare_branches(dataset, channel="full"):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/Control with Clustal Omega.nw", format=1)
    for file in os.listdir(path):
        if file.endswith(".nw"):
            basename = ".".join(file.split(".")[0:-1])
            try:
                tree = Tree(f"{path}/{file}", format=1)
                result = control_tree.compare(tree, unrooted=True)
                result_dict[basename] = 1.0 - round(result["source_edges_in_ref"], 4)
            except FileNotFoundError:
                pass
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Compatibility branch score from generated Trees

In [30]:
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare_branches(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))
pandas.concat(sum_dfs, axis=1).T.reset_index().drop_duplicates(["index"]).set_index("index").T

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0769,0.4615
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0385,0.0385,0.0769,0.4615
orthologs_neuroglobin,0.0,0.0,0.0385,0.0,0.0385,0.0385,0.0385,0.4615
orthologs_cytoglobin,0.3077,0.2308,0.2692,0.0,0.2692,0.2692,0.2692,0.4615


In [31]:
def read_and_compare_channels(dataset):
    result_dict = {}
    dfs = {}
    for channel in channels:
        result_dict[channel] = {}
        path = f"{tree_path}/{channel}/{dataset}"
        dfs[channel] = pandas.read_csv(f"{path}/Structural Similarity Index Measure.csv", index_col=0)
    for channel in channels:
        control_df = dfs[channel]
        for c, df in dfs.items():
            result_dict[channel][c] = round(numpy.sqrt(numpy.sum((control_df.values - df.values)**2)), 4)
    result_df = pandas.DataFrame.from_dict(result_dict, orient='index')
    result_df.index.name = "dataset"
    return result_df.style.background_gradient(axis=None, vmin=result_df.min().min(), vmax=result_df.max().max(), cmap="YlGnBu")

## Features Euclidean distance from distance matrices

In [32]:
for dataset in datasets:
    try:
        correlations = read_and_compare_channels(dataset)
    except FileNotFoundError:
        continue
    else:
        display(HTML(f"<center><h3>{dataset}</h3></center>"))
        display(correlations)        

Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.0928,0.0756,0.1645,3.006
gray_r,0.0928,0.0,0.0426,0.2556,2.9485
gray_g,0.0756,0.0426,0.0,0.2386,2.9817
gray_b,0.1645,0.2556,0.2386,0.0,3.0871
gray_mean,3.006,2.9485,2.9817,3.0871,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.2084,0.0858,0.2587,3.1404
gray_r,0.2084,0.0,0.1797,0.4627,2.9616
gray_g,0.0858,0.1797,0.0,0.3267,3.0984
gray_b,0.2587,0.4627,0.3267,0.0,3.3622
gray_mean,3.1404,2.9616,3.0984,3.3622,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.1679,0.1371,0.3001,7.4594
gray_r,0.1679,0.0,0.0412,0.4646,7.4035
gray_g,0.1371,0.0412,0.0,0.4359,7.427
gray_b,0.3001,0.4646,0.4359,0.0,7.5
gray_mean,7.4594,7.4035,7.427,7.5,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.2487,0.2456,0.3063,9.3722
gray_r,0.2487,0.0,0.0412,0.4417,9.3779
gray_g,0.2456,0.0412,0.0,0.4422,9.4054
gray_b,0.3063,0.4417,0.4422,0.0,9.2761
gray_mean,9.3722,9.3779,9.4054,9.2761,0.0
