# Comparing channels

- R: sequential
- G: reverse complementary
- B: no match at all

In [1]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [2]:
# Root directory of the generated results; the functions below build paths
# of the form <tree_path>/<channel>/<dataset>.
tree_path = "../../../data/trees"

# Channel sub-directories to compare.
# NOTE(review): gray_r / gray_g / gray_b presumably correspond to the R, G and
# B channels listed at the top of this notebook - confirm.
channels = (
    "full",
    "gray_r",
    "gray_g",
    "gray_b",
    "gray_mean",
)

# Dataset sub-directories evaluated for every channel.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
]

In [3]:
def read_and_compare(dataset, channel="full"):
    """Euclidean distance between the control matrix and each CSV matrix of a dataset.

    Reads ``<tree_path>/<channel>/<dataset>/Control with Clustal Omega.csv`` as
    the reference and compares it with every ``.csv`` file in the same
    directory (the control file itself included, which yields 0.0).

    Args:
        dataset: Name of the dataset sub-directory.
        channel: Name of the channel sub-directory.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is ``dataset`` (index name
        ``"dataset"``) with one column per CSV file, named after the file
        without its extension. The column
        ``"Structural Similarity Index Measure"`` is renamed to ``channel``.
        Values are rounded to 4 decimals.

    Raises:
        FileNotFoundError: If the dataset directory or the control CSV is missing.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the column
    # order of the result reproducible across runs and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        except FileNotFoundError:
            # A listed entry can still be unreadable (e.g. removed after the
            # listing, or a dangling symlink); skip it as before.
            continue
        # NOTE(review): .values drops the labels, so the subtraction is purely
        # positional - assumes both matrices list their rows/columns in the
        # same order; confirm against the code that writes these files.
        difference = control_df.values - df.values
        result_dict[os.path.splitext(file)[0]] = round(numpy.sqrt(numpy.sum(difference ** 2)), 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Euclidean Distance from distance matrices

In [4]:
# One frame per channel: the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# (e.g. the control) are collapsed to their first occurrence by transposing,
# dropping duplicate labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9085,1.5993,0.0,0.8937,1.0651,1.1226,1.7026,3.4182
orthologs_myoglobin,0.9196,1.6627,0.0,0.9044,1.0055,1.0992,1.7537,3.7721
orthologs_neuroglobin,1.8043,4.432,0.0,1.1514,5.5989,3.6204,4.5386,5.4547
orthologs_cytoglobin,6.3417,7.8124,0.0,3.1095,5.0778,8.9521,8.2116,9.9477


In [5]:
def read_and_linear_correlate(dataset, channel="full"):
    """Spearman correlation between the control matrix and each CSV matrix of a dataset.

    Despite the function name, the coefficient computed is Spearman's rank
    correlation (``scipy.stats.spearmanr``) over the flattened matrices.

    Reads ``<tree_path>/<channel>/<dataset>/Control with Clustal Omega.csv`` as
    the reference and correlates it with every ``.csv`` file in the same
    directory (the control file itself included).

    Args:
        dataset: Name of the dataset sub-directory.
        channel: Name of the channel sub-directory.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is ``dataset`` (index name
        ``"dataset"``) with one column per CSV file, named after the file
        without its extension. The column
        ``"Structural Similarity Index Measure"`` is renamed to ``channel``.
        Values are rounded to 4 decimals.

    Raises:
        FileNotFoundError: If the dataset directory or the control CSV is missing.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_values = control_df.values.flatten()  # loop-invariant, computed once
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the column
    # order of the result reproducible across runs and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        except FileNotFoundError:
            # A listed entry can still be unreadable (e.g. removed after the
            # listing, or a dangling symlink); skip it as before.
            continue
        # Element [0] of spearmanr's result is the correlation coefficient.
        coefficient = spearmanr(control_values, df.values.flatten())[0]
        result_dict[os.path.splitext(file)[0]] = round(coefficient, 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Spearman rank correlation from distance matrices

In [6]:
# One frame per channel: the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_linear_correlate(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# (e.g. the control) are collapsed to their first occurrence by transposing,
# dropping duplicate labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9999,0.9925,1.0,0.9998,0.9974,0.9982,0.9944,0.9952
orthologs_myoglobin,0.9999,0.9931,1.0,0.9998,0.9959,0.9969,0.9925,0.9834
orthologs_neuroglobin,0.9999,0.9886,1.0,0.9992,0.9764,0.9968,0.9887,0.9848
orthologs_cytoglobin,0.7775,0.5167,1.0,0.8623,0.5003,0.3714,0.5086,0.5354


In [7]:
def read_and_correlate(dataset, channel="full"):
    """Predictive Power Score of the control matrix towards each CSV matrix of a dataset.

    Reads ``<tree_path>/<channel>/<dataset>/Control with Clustal Omega.csv`` as
    the reference. For every ``.csv`` file in the same directory (the control
    file itself included) the flattened control values are passed to
    ``ppscore.score`` as column ``"x"`` and the flattened file values as
    column ``"y"``; the ``"ppscore"`` entry of the result is kept.

    Args:
        dataset: Name of the dataset sub-directory.
        channel: Name of the channel sub-directory.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is ``dataset`` (index name
        ``"dataset"``) with one column per CSV file, named after the file
        without its extension. The column
        ``"Structural Similarity Index Measure"`` is renamed to ``channel``.
        Values are rounded to 4 decimals.

    Raises:
        FileNotFoundError: If the dataset directory or the control CSV is missing.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_df = pandas.read_csv(f"{path}/Control with Clustal Omega.csv", index_col=0)
    control_values = control_df.values.flatten()  # loop-invariant, computed once
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the column
    # order of the result reproducible across runs and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pandas.read_csv(f"{path}/{file}", index_col=0)
        except FileNotFoundError:
            # A listed entry can still be unreadable (e.g. removed after the
            # listing, or a dangling symlink); skip it as before.
            continue
        pairs = pandas.DataFrame({"x": control_values, "y": df.values.flatten()})
        result_dict[os.path.splitext(file)[0]] = round(ppscore.score(pairs, "x", "y")["ppscore"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Predictive Power Score from distance matrices

In [8]:
# One frame per channel: the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_correlate(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# (e.g. the control) are collapsed to their first occurrence by transposing,
# dropping duplicate labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,full,Control with Clustal Omega,Local with Smith–Waterman,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.9953,0.9704,0.9968,0.9944,0.9729,0.9734,0.9762,0.9667
orthologs_myoglobin,0.9954,0.9474,0.9968,0.9925,0.9561,0.9468,0.9457,0.9452
orthologs_neuroglobin,0.9876,0.9651,0.9964,0.9897,0.9643,0.9906,0.964,0.9519
orthologs_cytoglobin,0.8524,0.7466,0.9913,0.8584,0.7055,0.719,0.7575,0.7855


In [9]:
def read_and_tree_compare(dataset, channel="full", control="Control with Clustal Omega"):
    """Normalized Robinson-Foulds distance from a reference tree to each tree of a dataset.

    Loads ``<tree_path>/<channel>/<dataset>/<control>.nw`` as the reference
    tree and compares it, unrooted, with every ``.nw`` file in the same
    directory (the reference file itself included). The ``"norm_rf"`` entry of
    each comparison is kept.

    Args:
        dataset: Name of the dataset sub-directory.
        channel: Name of the channel sub-directory.
        control: File name (without ``.nw``) of the tree used as reference.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is ``dataset`` (index name
        ``"dataset"``) with one column per tree file, named after the file
        without its extension. The column
        ``"Structural Similarity Index Measure"`` is renamed to ``channel``.
        Values are rounded to 4 decimals.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/{control}.nw", format=1)
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the column
    # order of the result reproducible across runs and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        try:
            tree = Tree(f"{path}/{file}", format=1)
        except FileNotFoundError:
            # Kept from the original: an entry that cannot be opened is skipped.
            continue
        result = control_tree.compare(tree, unrooted=True)
        result_dict[os.path.splitext(file)[0]] = round(result["norm_rf"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Robinson-Foulds distance from generated trees

In [10]:
# One frame per channel: the datasets stacked as rows, each compared against
# the default reference tree.
sum_dfs = [
    pandas.concat([read_and_tree_compare(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# (e.g. the control) are collapsed to their first occurrence by transposing,
# dropping duplicate labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0833
orthologs_myoglobin,0.0,0.0,0.1667,0.0,0.0833,0.0833,0.0833,0.0833
orthologs_neuroglobin,0.0,0.0,0.3333,0.0,0.0833,0.25,0.1667,0.1667
orthologs_cytoglobin,0.6667,0.5,0.6667,0.0,0.6667,0.75,0.6667,0.6667


In [11]:
# Same table as above, but with the "Global with Needleman-Wunsch" tree as the
# reference. One frame per channel: the datasets stacked as rows.
sum_dfs = [
    pandas.concat(
        [read_and_tree_compare(dataset, channel, "Global with Needleman-Wunsch") for dataset in datasets]
    )
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# are collapsed to their first occurrence by transposing, dropping duplicate
# labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0833
orthologs_myoglobin,0.0,0.0,0.1667,0.0,0.0833,0.0833,0.0833,0.0833
orthologs_neuroglobin,0.0,0.0,0.3333,0.0,0.0833,0.25,0.1667,0.1667
orthologs_cytoglobin,0.0,0.25,0.5,0.6667,0.75,0.8333,0.5,0.5


In [12]:
# Same table as above, but with the "Local with Smith–Waterman" tree as the
# reference. One frame per channel: the datasets stacked as rows.
sum_dfs = [
    pandas.concat(
        [read_and_tree_compare(dataset, channel, "Local with Smith–Waterman") for dataset in datasets]
    )
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# are collapsed to their first occurrence by transposing, dropping duplicate
# labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0833
orthologs_myoglobin,0.0,0.0,0.1667,0.0,0.0833,0.0833,0.0833,0.0833
orthologs_neuroglobin,0.0,0.0,0.3333,0.0,0.0833,0.25,0.1667,0.1667
orthologs_cytoglobin,0.25,0.0,0.5,0.5,0.5833,0.75,0.5,0.5


In [13]:
def read_and_tree_compare_branches(dataset, channel="full"):
    """Branch-based dissimilarity from the control tree to each tree of a dataset.

    Loads ``<tree_path>/<channel>/<dataset>/Control with Clustal Omega.nw`` as
    the reference tree and compares it, unrooted, with every ``.nw`` file in
    the same directory (the control file itself included). The score stored
    is ``1 - result["source_edges_in_ref"]``.

    Args:
        dataset: Name of the dataset sub-directory.
        channel: Name of the channel sub-directory.

    Returns:
        A one-row ``pandas.DataFrame`` whose index is ``dataset`` (index name
        ``"dataset"``) with one column per tree file, named after the file
        without its extension. The column
        ``"Structural Similarity Index Measure"`` is renamed to ``channel``.
        Values are rounded to 4 decimals.
    """
    path = f"{tree_path}/{channel}/{dataset}"
    control_tree = Tree(f"{path}/Control with Clustal Omega.nw", format=1)
    result_dict = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the column
    # order of the result reproducible across runs and file systems.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        try:
            tree = Tree(f"{path}/{file}", format=1)
        except FileNotFoundError:
            # Kept from the original: an entry that cannot be opened is skipped.
            continue
        result = control_tree.compare(tree, unrooted=True)
        # Round after subtracting: rounding first and then computing 1.0 - x
        # re-introduces float noise (e.g. 1.0 - 0.9615 == 0.03849999999999998).
        result_dict[os.path.splitext(file)[0]] = round(1.0 - result["source_edges_in_ref"], 4)
    result_df = pandas.DataFrame(result_dict, index=[dataset])
    result_df.rename(columns={"Structural Similarity Index Measure": channel}, inplace=True)
    result_df.index.name = "dataset"
    return result_df

## Compatibility branch score from generated Trees

In [14]:
# One frame per channel: the datasets stacked as rows.
sum_dfs = [
    pandas.concat([read_and_tree_compare_branches(dataset, channel) for dataset in datasets])
    for channel in channels
]
# Join the channels side by side. Column labels that repeat across channels
# (e.g. the control) are collapsed to their first occurrence by transposing,
# dropping duplicate labels and transposing back.
(
    pandas.concat(sum_dfs, axis=1)
    .T.reset_index()
    .drop_duplicates(["index"])
    .set_index("index")
    .T
)

index,Global with Needleman-Wunsch,Local with Smith–Waterman,full,Control with Clustal Omega,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
orthologs_hemoglobin_beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0385
orthologs_myoglobin,0.0,0.0,0.0769,0.0,0.0385,0.0385,0.0385,0.0385
orthologs_neuroglobin,0.0,0.0,0.1538,0.0,0.0385,0.1154,0.0769,0.0769
orthologs_cytoglobin,0.3077,0.2308,0.3077,0.0,0.3077,0.3462,0.3077,0.3077


In [15]:
def read_and_compare_channels(dataset):
    """Pairwise Euclidean distances between the channels' matrices for one dataset.

    For every entry of ``channels`` the file
    ``<tree_path>/<channel>/<dataset>/Structural Similarity Index Measure.csv``
    is loaded; each pair of loaded matrices (a channel with itself included)
    is then compared, rounded to 4 decimals.

    Returns a pandas Styler over the square channel-by-channel table, shaded
    with the "YlGnBu" colour map scaled between the table's overall minimum
    and maximum. Note that the table's index is labelled "dataset" although
    its rows are channels.
    """
    matrices = {
        channel: pandas.read_csv(
            f"{tree_path}/{channel}/{dataset}/Structural Similarity Index Measure.csv",
            index_col=0,
        )
        for channel in channels
    }
    distances = {
        reference: {
            other: round(numpy.sqrt(numpy.sum((reference_df.values - other_df.values) ** 2)), 4)
            for other, other_df in matrices.items()
        }
        for reference, reference_df in matrices.items()
    }
    result_df = pandas.DataFrame.from_dict(distances, orient="index")
    result_df.index.name = "dataset"
    lowest = result_df.min().min()
    highest = result_df.max().max()
    return result_df.style.background_gradient(axis=None, vmin=lowest, vmax=highest, cmap="YlGnBu")

## Features Euclidean distance from distance matrices

In [16]:
# Render one titled, colour-shaded channel comparison per dataset; datasets
# whose input files are missing are skipped.
for dataset in datasets:
    try:
        correlations = read_and_compare_channels(dataset)
    except FileNotFoundError:
        continue
    display(HTML(f"<center><h3>{dataset}</h3></center>"))
    display(correlations)

Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.6062,0.5454,0.1187,1.862
gray_r,0.6062,0.0,0.0822,0.692,2.4164
gray_g,0.5454,0.0822,0.0,0.6315,2.3541
gray_b,0.1187,0.692,0.6315,0.0,1.7626
gray_mean,1.862,2.4164,2.3541,1.7626,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,0.7057,0.6437,0.1086,2.132
gray_r,0.7057,0.0,0.1462,0.7944,2.8055
gray_g,0.6437,0.1462,0.0,0.7298,2.7212
gray_b,0.1086,0.7944,0.7298,0.0,2.044
gray_mean,2.132,2.8055,2.7212,2.044,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,1.8333,2.2603,0.2576,1.6154
gray_r,1.8333,0.0,4.0521,1.6025,1.2608
gray_g,2.2603,4.0521,0.0,2.5087,3.7589
gray_b,0.2576,1.6025,2.5087,0.0,1.388
gray_mean,1.6154,1.2608,3.7589,1.388,0.0


Unnamed: 0_level_0,full,gray_r,gray_g,gray_b,gray_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
full,0.0,4.0741,3.4407,0.5033,2.4768
gray_r,4.0741,0.0,5.6582,4.566,6.5303
gray_g,3.4407,5.6582,0.0,3.3718,4.0236
gray_b,0.5033,4.566,3.3718,0.0,1.985
gray_mean,2.4768,6.5303,4.0236,1.985,0.0
