# Comparing channels

- R: sequential
- G: reverse complementary
- B: no match at all

In [1]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [9]:
# Root folder of the generated results; the readers in this notebook look
# under "<tree_path>/<channel>/<dataset>".
tree_path = "../../../data/trees"

# Image channel variants that are compared against each other.
channels = (
    "full",
    "gray_r",
    "gray_g",
    "gray_b",
    "gray_mean",
)

# Ortholog datasets; each one is a sub-folder of every channel folder.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
]

In [10]:
def read_and_compare(dataset, channel="full"):
    """Euclidean distance of every CSV matrix in a folder to the control matrix.

    The folder is ``{tree_path}/{channel}/{dataset}``. The reference is
    ``Control with Clustal Omega.csv``; every ``*.csv`` in the folder
    (the control itself included) is compared to it element by element.

    Args:
        dataset: Name of the dataset sub-folder; also used as the row label.
        channel: Name of the channel sub-folder. The column named
            "MultiScale Structural Similarity Index Measure" is renamed to
            this value in the result.

    Returns:
        A one-row ``pandas.DataFrame`` (index named "dataset") with one
        column per CSV file, holding the square root of the summed squared
        differences, rounded to 4 decimals.
    """
    folder = f"{tree_path}/{channel}/{dataset}"
    reference = pandas.read_csv(f"{folder}/Control with Clustal Omega.csv", index_col=0)

    distances = {}
    for entry in os.listdir(folder):
        if not entry.endswith(".csv"):
            continue
        # File name without its last extension becomes the column label.
        label = entry.rsplit(".", 1)[0]
        try:
            candidate = pandas.read_csv(f"{folder}/{entry}", index_col=0)
            # Values are compared positionally (no label alignment).
            squared_error = (reference.values - candidate.values) ** 2
            distances[label] = round(numpy.sqrt(numpy.sum(squared_error)), 4)
        except FileNotFoundError:
            # A listed file that cannot be found any more is simply left out.
            pass

    table = pandas.DataFrame(distances, index=[dataset]).rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    table.index.name = "dataset"
    return table

## Euclidean Distance from distance matrices

In [11]:
# Build one stacked frame per channel (a row per dataset) of Euclidean
# distances to the control matrix.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,full,Global with Needleman-Wunsch,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,1.4402,0.9085,0.0,0.8937,1.1881,1.44,1.6967,2.0156
orthologs_myoglobin,1.7112,0.9196,0.0,0.9044,1.669,1.5089,1.9591,2.4938
orthologs_neuroglobin,3.5834,1.8043,0.0,1.1514,1.8423,1.8865,3.8615,4.2052
orthologs_cytoglobin,7.9091,6.3417,0.0,3.1095,5.6928,5.675,8.1297,8.1834


In [12]:
def read_and_linear_correlate(dataset, channel="full"):
    """Spearman correlation of every CSV matrix in a folder with the control.

    Despite the function name, the statistic is the one returned by
    ``scipy.stats.spearmanr`` on the flattened matrices. The folder is
    ``{tree_path}/{channel}/{dataset}`` and the reference is
    ``Control with Clustal Omega.csv``.

    Args:
        dataset: Name of the dataset sub-folder; also used as the row label.
        channel: Name of the channel sub-folder. The column named
            "MultiScale Structural Similarity Index Measure" is renamed to
            this value in the result.

    Returns:
        A one-row ``pandas.DataFrame`` (index named "dataset") with one
        column per CSV file, holding the correlation rounded to 4 decimals.
    """
    folder = f"{tree_path}/{channel}/{dataset}"
    reference = pandas.read_csv(f"{folder}/Control with Clustal Omega.csv", index_col=0)
    # The flattened control matrix is the same for every comparison.
    reference_values = reference.values.flatten()

    correlations = {}
    for entry in os.listdir(folder):
        if not entry.endswith(".csv"):
            continue
        # File name without its last extension becomes the column label.
        label = entry.rsplit(".", 1)[0]
        try:
            candidate = pandas.read_csv(f"{folder}/{entry}", index_col=0)
            # Element [0] of the spearmanr result is the correlation.
            rho = spearmanr(reference_values, candidate.values.flatten())[0]
            correlations[label] = round(rho, 4)
        except FileNotFoundError:
            # A listed file that cannot be found any more is simply left out.
            pass

    table = pandas.DataFrame(correlations, index=[dataset]).rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    table.index.name = "dataset"
    return table

## Spearman rank correlation from distance matrices

In [13]:
# Build one stacked frame per channel (a row per dataset) of Spearman
# correlations with the control matrix.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_linear_correlate(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,full,Global with Needleman-Wunsch,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9912,0.9999,1.0,0.9998,0.9935,0.988,0.9925,0.9951
orthologs_myoglobin,0.9783,0.9999,1.0,0.9998,0.9736,0.9799,0.9765,0.9817
orthologs_neuroglobin,0.9891,0.9999,1.0,0.9992,0.9901,0.9886,0.9881,0.9875
orthologs_cytoglobin,0.5479,0.7775,1.0,0.8623,0.4893,0.4877,0.5435,0.5329


In [14]:
def read_and_correlate(dataset, channel="full"):
    """Predictive Power Score of the control matrix for every CSV matrix.

    The folder is ``{tree_path}/{channel}/{dataset}`` and the reference is
    ``Control with Clustal Omega.csv``. For each ``*.csv`` in the folder the
    flattened control values ("x") and the flattened file values ("y") are
    handed to ``ppscore.score`` and its "ppscore" entry is kept.

    Args:
        dataset: Name of the dataset sub-folder; also used as the row label.
        channel: Name of the channel sub-folder. The column named
            "MultiScale Structural Similarity Index Measure" is renamed to
            this value in the result.

    Returns:
        A one-row ``pandas.DataFrame`` (index named "dataset") with one
        column per CSV file, holding the score rounded to 4 decimals.
    """
    folder = f"{tree_path}/{channel}/{dataset}"
    reference = pandas.read_csv(f"{folder}/Control with Clustal Omega.csv", index_col=0)
    # The flattened control matrix is the same for every comparison.
    reference_values = reference.values.flatten()

    scores = {}
    for entry in os.listdir(folder):
        if not entry.endswith(".csv"):
            continue
        # File name without its last extension becomes the column label.
        label = entry.rsplit(".", 1)[0]
        try:
            candidate = pandas.read_csv(f"{folder}/{entry}", index_col=0)
            pairs = pandas.DataFrame({"x": reference_values, "y": candidate.values.flatten()})
            outcome = ppscore.score(pairs, "x", "y")
            scores[label] = round(outcome["ppscore"], 4)
        except FileNotFoundError:
            # A listed file that cannot be found any more is simply left out.
            pass

    table = pandas.DataFrame(scores, index=[dataset]).rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    table.index.name = "dataset"
    return table

## Predictive Power Score from distance matrices

In [15]:
# Build one stacked frame per channel (a row per dataset) of Predictive
# Power Scores against the control matrix.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_correlate(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,full,Global with Needleman-Wunsch,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9545,0.9953,0.9968,0.9944,0.9482,0.9509,0.9538,0.9576
orthologs_myoglobin,0.9252,0.9954,0.9968,0.9925,0.9257,0.9142,0.9242,0.9417
orthologs_neuroglobin,0.9625,0.9876,0.9964,0.9897,0.9702,0.9609,0.9602,0.9568
orthologs_cytoglobin,0.7825,0.8524,0.9913,0.8584,0.7471,0.7615,0.7766,0.7916


In [16]:
def read_and_tree_compare(dataset, channel="full", control="Control with Clustal Omega"):
    """Normalized Robinson-Foulds distance of every tree in a folder to a control tree.

    The folder is ``{tree_path}/{channel}/{dataset}``. The reference tree is
    ``{control}.nw``; every ``*.nw`` file in the folder (the control itself
    included) is loaded with ete3 (``format=1``) and compared to it as an
    unrooted tree.

    Args:
        dataset: Name of the dataset sub-folder; also used as the row label.
        channel: Name of the channel sub-folder. The column named
            "MultiScale Structural Similarity Index Measure" is renamed to
            this value in the result.
        control: File name (without ".nw") of the reference tree.

    Returns:
        A one-row ``pandas.DataFrame`` (index named "dataset") with one
        column per tree file, holding the "norm_rf" entry of the ete3
        comparison rounded to 4 decimals.
    """
    folder = f"{tree_path}/{channel}/{dataset}"
    reference_tree = Tree(f"{folder}/{control}.nw", format=1)

    scores = {}
    for entry in os.listdir(folder):
        if not entry.endswith(".nw"):
            continue
        # File name without its last extension becomes the column label.
        label = entry.rsplit(".", 1)[0]
        try:
            candidate_tree = Tree(f"{folder}/{entry}", format=1)
            comparison = reference_tree.compare(candidate_tree, unrooted=True)
            scores[label] = round(comparison["norm_rf"], 4)
        except FileNotFoundError:
            # A FileNotFoundError for this file just leaves it out of the result.
            pass

    table = pandas.DataFrame(scores, index=[dataset]).rename(
        columns={"MultiScale Structural Similarity Index Measure": channel}
    )
    table.index.name = "dataset"
    return table

## Normalized Robinson–Foulds distance from generated trees

In [17]:
# Build one stacked frame per channel (a row per dataset) of normalized
# Robinson-Foulds distances to the default control tree.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0833
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
orthologs_neuroglobin,0.0,0.0,0.0833,0.0,0.3333,0.4167,0.0833,0.0833
orthologs_cytoglobin,0.6667,0.5,0.5833,0.0,0.4167,0.5833,0.6667,0.6667


In [18]:
# Same table as for the default control, but the reference tree is the
# "Global with Needleman-Wunsch" one.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(
            read_and_tree_compare(dataset, channel, "Global with Needleman-Wunsch")
        )
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0833
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
orthologs_neuroglobin,0.0,0.0,0.0833,0.0,0.3333,0.4167,0.0833,0.0833
orthologs_cytoglobin,0.0,0.25,0.4167,0.6667,0.5833,0.6667,0.3333,0.3333


In [19]:
# Same table as for the default control, but the reference tree is the
# "Local with Smith–Waterman" one.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(
            read_and_tree_compare(dataset, channel, "Local with Smith–Waterman")
        )
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0833,0.0,0.0833,0.0833
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
orthologs_neuroglobin,0.0,0.0,0.0833,0.0,0.3333,0.4167,0.0833,0.0833
orthologs_cytoglobin,0.25,0.0,0.3333,0.5,0.4167,0.5833,0.25,0.25


In [20]:
def read_and_tree_compare_branches(dataset, channel="full", control="Control with Clustal Omega"):
    """Branch incompatibility of every tree in a folder relative to a control tree.

    The folder is ``{tree_path}/{channel}/{dataset}``. The reference tree is
    ``{control}.nw``; every ``*.nw`` file in the folder (the control itself
    included) is loaded with ete3 (``format=1``) and compared to it as an
    unrooted tree. The stored value is one minus the "source_edges_in_ref"
    entry of the ete3 comparison, so 0.0 means that entry was 1.0.

    Args:
        dataset: Name of the dataset sub-folder; also used as the row label.
        channel: Name of the channel sub-folder. The column named
            "MultiScale Structural Similarity Index Measure" is renamed to
            this value in the result.
        control: File name (without ".nw") of the reference tree. Defaults
            to the Clustal Omega control, matching ``read_and_tree_compare``.

    Returns:
        A one-row ``pandas.DataFrame`` (index named "dataset") with one
        column per tree file, holding the score rounded to 4 decimals.
    """
    folder = f"{tree_path}/{channel}/{dataset}"
    reference_tree = Tree(f"{folder}/{control}.nw", format=1)

    scores = {}
    for entry in os.listdir(folder):
        if not entry.endswith(".nw"):
            continue
        # File name without its last extension becomes the column label.
        label = entry.rsplit(".", 1)[0]
        try:
            candidate_tree = Tree(f"{folder}/{entry}", format=1)
            comparison = reference_tree.compare(candidate_tree, unrooted=True)
            # Round AFTER subtracting: `1.0 - round(x, 4)` reintroduces binary
            # floating-point noise (e.g. 0.03849999999999998 instead of 0.0385).
            scores[label] = round(1.0 - comparison["source_edges_in_ref"], 4)
        except FileNotFoundError:
            # A FileNotFoundError for this file just leaves it out of the result.
            pass

    result_df = pandas.DataFrame(scores, index=[dataset])
    result_df.rename(columns={"MultiScale Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Branch incompatibility score (1 − compatibility) from generated trees

In [21]:
# Build one stacked frame per channel (a row per dataset) of branch
# scores relative to the control tree.
sum_dfs = []
for channel in channels:
    dfs = []
    for dataset in datasets:
        dfs.append(read_and_tree_compare_branches(dataset, channel))
    sum_dfs.append(pandas.concat(dfs))

# Put the channel frames side by side. Column labels that repeat across
# channels are de-duplicated by transposing, dropping repeated labels
# (the first occurrence is kept) and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .transpose()
    .reset_index()
    .drop_duplicates(subset=["index"])
    .set_index("index")
    .transpose()
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0385,0.0,0.0385,0.0385
orthologs_myoglobin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
orthologs_neuroglobin,0.0,0.0,0.0385,0.0,0.1538,0.1923,0.0385,0.0385
orthologs_cytoglobin,0.3077,0.2308,0.2692,0.0,0.1923,0.2692,0.3077,0.3077


In [23]:
def read_and_compare_channels(dataset):
    """Pairwise Euclidean distance between the channels' MS-SSIM matrices.

    For every entry of ``channels`` the file
    ``{tree_path}/{channel}/{dataset}/MultiScale Structural Similarity Index Measure.csv``
    is read, then each pair of matrices is compared element by element.
    An error raised by ``pandas.read_csv`` (e.g. for a missing file) is not
    handled here and reaches the caller.

    Args:
        dataset: Name of the dataset sub-folder inside each channel folder.

    Returns:
        A pandas ``Styler`` over a channel-by-channel table of distances
        (square root of the summed squared differences, rounded to 4
        decimals), shaded with the "YlGnBu" colormap between the table's
        overall minimum and maximum. The table's index is named "dataset"
        although it holds channel names.
    """
    # Load every channel's matrix up front so each file is read only once.
    matrices = {}
    for channel in channels:
        folder = f"{tree_path}/{channel}/{dataset}"
        matrices[channel] = pandas.read_csv(
            f"{folder}/MultiScale Structural Similarity Index Measure.csv", index_col=0
        )

    # Outer keys become rows, inner keys become columns.
    pairwise = {
        row_channel: {
            col_channel: round(
                numpy.sqrt(numpy.sum((row_matrix.values - col_matrix.values) ** 2)), 4
            )
            for col_channel, col_matrix in matrices.items()
        }
        for row_channel, row_matrix in matrices.items()
    }

    result_df = pandas.DataFrame.from_dict(pairwise, orient='index')
    result_df.index.name = "dataset"

    # One colour scale for the whole table (axis=None), not per row/column.
    lowest = result_df.min().min()
    highest = result_df.max().max()
    return result_df.style.background_gradient(axis=None, vmin=lowest, vmax=highest, cmap="YlGnBu")

## Euclidean distance between channels, from their MS-SSIM distance matrices

In [24]:
# Show one styled channel-vs-channel table per dataset, under a centred title.
for dataset in datasets:
    try:
        correlations = read_and_compare_channels(dataset)
    except FileNotFoundError:
        # Nothing to show for a dataset whose input files are missing.
        continue
    display(HTML(f"<center><h3>{dataset}</h3></center>"))
    display(correlations)

Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.264,0.06,0.2612,0.5892
gray_r,0.264,0.0,0.2801,0.5218,0.8356
gray_g,0.06,0.2801,0.0,0.2718,0.6015
gray_b,0.2612,0.5218,0.2718,0.0,0.3504
gray_mean,0.5892,0.8356,0.6015,0.3504,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.0727,0.2148,0.2521,0.796
gray_r,0.0727,0.0,0.1983,0.3025,0.841
gray_g,0.2148,0.1983,0.0,0.4626,0.9985
gray_b,0.2521,0.3025,0.4626,0.0,0.5578
gray_mean,0.796,0.841,0.9985,0.5578,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,2.1634,2.1709,0.3173,0.7395
gray_r,2.1634,0.0,0.1034,2.4443,2.8388
gray_g,2.1709,0.1034,0.0,2.449,2.8425
gray_b,0.3173,2.4443,2.449,0.0,0.4407
gray_mean,0.7395,2.8388,2.8425,0.4407,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,2.907,2.9686,0.2687,0.6426
gray_r,2.907,0.0,0.3279,3.1482,3.4364
gray_g,2.9686,0.3279,0.0,3.2162,3.5004
gray_b,0.2687,3.1482,3.2162,0.0,0.5021
gray_mean,0.6426,3.4364,3.5004,0.5021,0.0
