# Comparing channels

## Nucleotides
- R: sequential
- G: complementary
- B: no match at all

## Proteins
- sequential by BLOSUM62

In [6]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree
from ete3.parser.newick import NewickError

  from IPython.core.display import display, HTML


In [21]:
# Root directory of the generated results; read_and_compare looks for files
# under "<tree_path>/<channel>/<dataset>".
tree_path = "../../data/trees"

# Sub-directories compared for every dataset.
# NOTE(review): "N/..." presumably holds nucleotide results (full image and the
# single R/G/B channels) and "P/full" the protein results -- confirm.
channels = (
    "N/full",
    "N/gray_r",
    "N/gray_g",
    "N/gray_b",
    "P/full",
)

# Dataset directory names expected below every channel directory.
datasets = [
    "orthologs_hemoglobin_beta",
    "orthologs_myoglobin",
    "orthologs_neuroglobin",
    "orthologs_cytoglobin",
    "orthologs_androglobin",
    "indelible",
]

In [8]:
def do_euclidean(control, compare):
    """Return the Euclidean distance between two matrices, rounded to 4 places.

    Args:
        control: Reference matrix; must expose a ``.values`` array (the
            callers in this notebook pass pandas DataFrames).
        compare: Matrix to compare; its ``.values`` must be subtractable
            element-wise from ``control.values``.

    Returns:
        The square root of the sum of squared element-wise differences,
        rounded to four decimal places.
    """
    difference = control.values - compare.values
    squared_total = numpy.sum(difference ** 2)
    distance = numpy.sqrt(squared_total)
    return round(distance, 4)

def do_spearman(control, compare):
    """Return the Spearman rank correlation of two matrices, rounded to 4 places.

    Both matrices are flattened to one dimension before being correlated.

    Args:
        control: Reference matrix exposing a ``.values`` array.
        compare: Matrix to compare, exposing a ``.values`` array with the
            same number of elements.

    Returns:
        The first element of the ``spearmanr`` result (the correlation
        coefficient), rounded to four decimal places.
    """
    control_flat = control.values.flatten()
    compare_flat = compare.values.flatten()
    result = spearmanr(control_flat, compare_flat)
    coefficient = result[0]
    return round(coefficient, 4)

def do_ppscore(control, compare):
    """Return the predictive power score of ``control`` for ``compare``.

    Both matrices are flattened and placed in a two-column frame: column
    ``"x"`` holds the control values and column ``"y"`` the compared values.
    ``ppscore.score`` is then asked to score ``"x"`` against ``"y"``.

    Args:
        control: Reference matrix exposing a ``.values`` array.
        compare: Matrix to compare, exposing a ``.values`` array with the
            same number of elements.

    Returns:
        The ``"ppscore"`` entry of the ``ppscore.score`` result, rounded to
        four decimal places.
    """
    frame = pandas.DataFrame(
        {
            "x": control.values.flatten(),
            "y": compare.values.flatten(),
        }
    )
    result = ppscore.score(frame, "x", "y")
    return round(result["ppscore"], 4)

def do_rf(control, compare):
    """Return the normalized Robinson-Foulds value between two trees.

    Args:
        control: Reference tree; must provide ``compare(other, unrooted=...)``
            returning a mapping (read_and_compare passes ete3 ``Tree`` objects).
        compare: Tree handed to ``control.compare``.

    Returns:
        The ``"norm_rf"`` entry of the unrooted comparison, rounded to four
        decimal places.
    """
    comparison = control.compare(compare, unrooted=True)
    normalized_rf = comparison["norm_rf"]
    return round(normalized_rf, 4)

def do_branch_score(control, compare):
    """Return ``1 - source_edges_in_ref`` for two trees, rounded to 4 places.

    Args:
        control: Reference tree; must provide ``compare(other, unrooted=...)``
            returning a mapping (read_and_compare passes ete3 ``Tree`` objects).
        compare: Tree handed to ``control.compare``.

    Returns:
        One minus the ``"source_edges_in_ref"`` entry of the unrooted
        comparison, rounded to four decimal places.
        NOTE(review): presumably 0.0 means every edge was found in the
        reference -- confirm against the ete3 documentation.
    """
    comparison = control.compare(compare, unrooted=True)
    # Round the final value, like the other scorers do. Rounding before the
    # subtraction leaves float artefacts (1.0 - 0.9 == 0.09999999999999998).
    return round(1.0 - comparison["source_edges_in_ref"], 4)


def read_and_compare(dataset, channel, score_func, control_name):
    """Score every result file of one channel/dataset against a control file.

    The directory ``"<tree_path>/<channel>/<dataset>"`` is listed and every
    file whose name ends with the control file's extension is loaded and
    passed, together with the control, to ``score_func``.

    Args:
        dataset: Dataset directory name.
        channel: Channel directory name (may contain a ``/``).
        score_func: Callable ``score_func(control, compare)`` returning a score.
        control_name: File name of the control inside the directory. A
            ``.nw`` name is loaded as an ete3 ``Tree`` (``format=1``); any
            other name is read with ``pandas.read_csv(..., index_col=0)``.

    Returns:
        A dict of four equally long lists: ``algorithm`` (file name without
        its last extension), ``score``, ``channel`` and ``dataset``.

    Raises:
        Any exception raised while listing the directory or loading the
        control file propagates; only per-file ``FileNotFoundError`` and
        ``NewickError`` are reported and skipped.
    """
    result_dict = dict(
        algorithm=[],
        score=[],
        channel=[],
        dataset=[]
    )
    path = f"{tree_path}/{channel}/{dataset}"
    ends = control_name.split(".")[-1]

    def _load(name):
        # Load one file in the same format as the control.
        if ends == "nw":
            return Tree(f"{path}/{name}", format=1)
        return pandas.read_csv(f"{path}/{name}", index_col=0)

    control = _load(control_name)
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for file in sorted(os.listdir(path)):
        if not file.endswith(ends):
            continue
        basename = ".".join(file.split(".")[0:-1])
        try:
            score = score_func(control, _load(file))
        except (FileNotFoundError, NewickError):
            print(f"Error with file {path}/{file}")
            continue
        # Append only after scoring succeeded so the four lists always stay
        # the same length.
        result_dict["algorithm"].append(basename)
        result_dict["score"].append(score)
        result_dict["channel"].append(channel)
        result_dict["dataset"].append(dataset)
    return result_dict

def pprintdf(score_func, control_name="Control with Clustal Omega.csv"):
    """Build a score table for every configured channel and dataset.

    ``read_and_compare`` is called for each combination of the module-level
    ``channels`` and ``datasets``; the collected rows are pivoted so that each
    (algorithm, channel) pair is a row and each dataset a column.

    Args:
        score_func: Callable ``score_func(control, compare)`` returning a score.
        control_name: File name of the control used in every directory.

    Returns:
        The first 50 rows of the pivoted pandas DataFrame.
    """
    columns = ("algorithm", "score", "channel", "dataset")
    collected = {column: [] for column in columns}
    for channel in channels:
        for dataset in datasets:
            partial = read_and_compare(dataset, channel, score_func, control_name)
            for column in columns:
                collected[column].extend(partial[column])
    frame = pandas.DataFrame(collected)
    pivoted = frame.pivot(
        index=["algorithm", "channel"],
        columns="dataset",
        values="score",
    )
    return pivoted.head(50)

## Euclidean Distance from distance matrices

In [12]:
# Euclidean distance of every algorithm's matrix to the default control
# ("Control with Clustal Omega.csv"); the control scores 0.0 against itself.
pprintdf(do_euclidean)

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_b,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_g,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_r,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,P/full,0.0,0.0,0.0,0.0,0.0,0.0
Deep Search with Annoy,N/full,,,10.1226,4.8645,4.809,6.0855
Deep Search with Annoy,N/gray_b,,,11.5941,7.2564,7.848,8.3794
Deep Search with Annoy,N/gray_g,,,11.0523,6.5955,6.829,7.4448
Deep Search with Annoy,N/gray_r,,,11.0721,6.6844,7.0801,7.321
Global with Needleman-Wunsch,N/full,,8.2855,6.3417,0.9085,0.9196,1.8043


## Spearman rank correlation from distance matrices

In [13]:
# Spearman correlation of every algorithm's matrix with the default control
# ("Control with Clustal Omega.csv").
pprintdf(do_spearman)

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,1.0,1.0,1.0,1.0,1.0,1.0
Control with Clustal Omega,N/gray_b,1.0,1.0,1.0,1.0,1.0,1.0
Control with Clustal Omega,N/gray_g,1.0,1.0,1.0,1.0,1.0,1.0
Control with Clustal Omega,N/gray_r,1.0,1.0,1.0,1.0,1.0,1.0
Control with Clustal Omega,P/full,1.0,1.0,1.0,1.0,1.0,1.0
Deep Search with Annoy,N/full,,,0.2521,0.2257,0.3322,0.1127
Deep Search with Annoy,N/gray_b,,,0.2796,0.2051,0.3149,0.1917
Deep Search with Annoy,N/gray_g,,,0.2404,0.2239,0.2589,0.0659
Deep Search with Annoy,N/gray_r,,,0.2372,0.2155,0.2621,0.0881
Global with Needleman-Wunsch,N/full,,0.9054,0.7775,0.9999,0.9999,0.9999


## Predictive Power Score from distance matrices

In [14]:
# Predictive power score of the default control
# ("Control with Clustal Omega.csv") for every algorithm's matrix.
pprintdf(do_ppscore)

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.9998,0.9977,0.9913,0.9968,0.9968,0.9964
Control with Clustal Omega,N/gray_b,0.9998,0.9977,0.9913,0.9968,0.9968,0.9964
Control with Clustal Omega,N/gray_g,0.9998,0.9977,0.9913,0.9968,0.9968,0.9964
Control with Clustal Omega,N/gray_r,0.9998,0.9977,0.9913,0.9968,0.9968,0.9964
Control with Clustal Omega,P/full,0.9992,0.9975,0.995,1.0,1.0,0.998
Deep Search with Annoy,N/full,,,0.6626,0.431,0.351,0.293
Deep Search with Annoy,N/gray_b,,,0.6969,0.4187,0.3015,0.3832
Deep Search with Annoy,N/gray_g,,,0.694,0.3358,0.3403,0.3078
Deep Search with Annoy,N/gray_r,,,0.689,0.348,0.3321,0.3153
Global with Needleman-Wunsch,N/full,,0.9773,0.8524,0.9953,0.9954,0.9876


## Normalized Robinson-Foulds distance from generated trees

In [15]:
# Normalized Robinson-Foulds value of every tree against the Clustal Omega tree.
pprintdf(do_rf, control_name="Control with Clustal Omega.nw")

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_b,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_g,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_r,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,P/full,0.0,0.0,0.0,0.0,0.0,0.0
Deep Search with Annoy,N/full,,,1.0,0.9167,0.9167,0.8333
Deep Search with Annoy,N/gray_b,,,1.0,1.0,1.0,0.9167
Deep Search with Annoy,N/gray_g,,,1.0,1.0,1.0,0.9167
Deep Search with Annoy,N/gray_r,,,0.9167,1.0,0.9167,1.0
Global with Needleman-Wunsch,N/full,,0.8333,0.6667,0.0,0.0,0.0


In [19]:
# Same comparison, but with the Needleman-Wunsch tree as the control.
pprintdf(do_rf, control_name="Global with Needleman-Wunsch.nw")

Unnamed: 0_level_0,dataset,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Control with Clustal Omega,N/full,0.8333,0.6667,0.0,0.0,0.0
Control with Clustal Omega,N/gray_b,0.8333,0.6667,0.0,0.0,0.0
Control with Clustal Omega,N/gray_g,0.8333,0.6667,0.0,0.0,0.0
Control with Clustal Omega,N/gray_r,0.8333,0.6667,0.0,0.0,0.0
Control with Clustal Omega,P/full,0.9167,0.6667,0.25,0.25,0.6667
Deep Search with Annoy,N/full,,1.0,0.9167,0.9167,0.8333
Deep Search with Annoy,N/gray_b,,1.0,1.0,1.0,0.9167
Deep Search with Annoy,N/gray_g,,1.0,1.0,1.0,0.9167
Deep Search with Annoy,N/gray_r,,1.0,1.0,0.9167,1.0
Global with Needleman-Wunsch,N/full,0.0,0.0,0.0,0.0,0.0


In [20]:
# Same comparison, but with the Smith-Waterman tree as the control.
pprintdf(do_rf, control_name="Local with Smith–Waterman.nw")

Unnamed: 0_level_0,dataset,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Control with Clustal Omega,N/full,0.5,0.5,0.0,0.0,0.0
Control with Clustal Omega,N/gray_b,0.5,0.5,0.0,0.0,0.0
Control with Clustal Omega,N/gray_g,0.5,0.5,0.0,0.0,0.0
Control with Clustal Omega,N/gray_r,0.5,0.5,0.0,0.0,0.0
Control with Clustal Omega,P/full,0.6667,0.5833,0.25,0.25,0.4167
Deep Search with Annoy,N/full,,1.0,0.9167,0.9167,0.8333
Deep Search with Annoy,N/gray_b,,1.0,1.0,1.0,0.9167
Deep Search with Annoy,N/gray_g,,1.0,1.0,1.0,0.9167
Deep Search with Annoy,N/gray_r,,1.0,1.0,0.9167,1.0
Global with Needleman-Wunsch,N/full,0.4167,0.25,0.0,0.0,0.0


## Compatibility branch score from generated Trees

In [22]:
# Branch score (1 - "source_edges_in_ref") of every tree against the
# Clustal Omega tree.
pprintdf(do_branch_score, control_name="Control with Clustal Omega.nw")

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_b,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_g,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,N/gray_r,0.0,0.0,0.0,0.0,0.0,0.0
Control with Clustal Omega,P/full,0.0,0.0,0.0,0.0,0.0,0.0
Deep Search with Annoy,N/full,,,0.4615,0.4231,0.4231,0.3846
Deep Search with Annoy,N/gray_b,,,0.4615,0.4615,0.4615,0.4231
Deep Search with Annoy,N/gray_g,,,0.4615,0.4615,0.4615,0.4231
Deep Search with Annoy,N/gray_r,,,0.4231,0.4615,0.4231,0.4615
Global with Needleman-Wunsch,N/full,,0.3846,0.3077,0.0,0.0,0.0
