# Comparing channels

## Nucleotides
- R: sequential
- G: complementary
- B: not match at all

## Proteins
- sequential by BLOSUM62

In [1]:
import ppscore
import pandas
import os
import numpy
from scipy.stats import spearmanr
from IPython.core.display import display, HTML
from ete3 import Tree

  from IPython.core.display import display, HTML


In [129]:
tree_path = "../../data/trees"
channels = ("N/full", "N/gray_r", "N/gray_g", "N/gray_b", "P/full")
datasets = ["orthologs_hemoglobin_beta", "orthologs_myoglobin", "orthologs_neuroglobin", "orthologs_cytoglobin", "orthologs_androglobin", "indelible"]

In [171]:
def do_euclidean(control, compare):
    return round(numpy.sqrt(numpy.sum((control.values - compare.values)**2)), 4)

def do_spearman(control, compare):
    return round(spearmanr(control.values.flatten(), compare.values.flatten())[0], 4)

def do_ppscore(control, compare):
    return round(ppscore.score(
                    pandas.DataFrame({"x": control.values.flatten(), "y": compare.values.flatten()}), "x", "y")["ppscore"], 4)

def do_rf(control, compare):
    return round(control.compare(compare, unrooted=True)["norm_rf"], 4)

def do_branch_score(control, compare):
    return 1.0 - round(control.compare(compare, unrooted=True)["source_edges_in_ref"], 4)


def read_and_compare(dataset, channel, score_func, control_name):
    result_dict = {}
    path = f"{tree_path}/{channel}/{dataset}"
    ends = control_name.split(".")[-1]
    if ends == "nw":
        control = Tree(f"{path}/{control_name}", format=1)
    else:
        control = pandas.read_csv(f"{path}/{control_name}", index_col=0)
    for file in os.listdir(path):
        if file.endswith(ends):
            basename = ".".join(file.split(".")[0:-1])
            try:
                if ends == "nw":
                    compare = Tree(f"{path}/{file}", format=1)
                else:
                    compare = pandas.read_csv(f"{path}/{file}", index_col=0)
                result_dict["algorithm"] = [basename]
                result_dict["score"] = [score_func(control, compare)]
                result_dict["channel"] = [channel]
                result_dict["dataset"] = [dataset]
            except FileNotFoundError:
                pass
    return result_dict

def pprintdf(score_func, control_name="Control with Clustal Omega.csv"):
    result_dict = dict(
    algorithm=[],
    score=[],
    channel=[],
    dataset=[]
    )
    for channel in channels:
        for dataset in datasets:
            d = read_and_compare(dataset, channel, score_func, control_name)
            result_dict["algorithm"] += d["algorithm"]
            result_dict["score"] += d["score"]        
            result_dict["channel"] += d["channel"]        
            result_dict["dataset"] += d["dataset"]
    df = pandas.DataFrame(result_dict)
    table = df.pivot(index=["algorithm", "channel"], columns="dataset", values="score")
    return table.head(50)

## Euclidean Distance from distance matrices

In [160]:
pprintdf(do_euclidean)

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.0,0.0,,,,
Control with Clustal Omega,N/gray_b,0.0,0.0,,0.0,,
Control with Clustal Omega,N/gray_g,0.0,0.0,,0.0,,
Control with Clustal Omega,N/gray_r,0.0,0.0,,,,
Local with Smith–Waterman,N/full,,,3.1095,,0.9044,1.1514
Local with Smith–Waterman,N/gray_b,,,3.1095,,0.9044,1.1514
Local with Smith–Waterman,N/gray_g,,,3.1095,,0.9044,1.1514
Local with Smith–Waterman,N/gray_r,,,3.1095,,0.9044,1.1514
Local with Smith–Waterman,P/full,4.3137,7.8005,7.3833,,7.3591,7.6203
Unrestricted Sliced Structural Similarity Index Measure,N/full,,,,1.8705,,


## Linear Correlation from distance matrices

In [161]:
pprintdf(do_spearman)



Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,1.0,1.0,,,,
Control with Clustal Omega,N/gray_b,1.0,1.0,,1.0,,
Control with Clustal Omega,N/gray_g,1.0,1.0,,1.0,,
Control with Clustal Omega,N/gray_r,1.0,1.0,,,,
Local with Smith–Waterman,N/full,,,0.8623,,0.9998,0.9992
Local with Smith–Waterman,N/gray_b,,,0.8623,,0.9998,0.9992
Local with Smith–Waterman,N/gray_g,,,0.8623,,0.9998,0.9992
Local with Smith–Waterman,N/gray_r,,,0.8623,,0.9998,0.9992
Local with Smith–Waterman,P/full,0.2445,0.9605,0.8168,,0.9798,0.9887
Unrestricted Sliced Structural Similarity Index Measure,N/full,,,,0.9984,,


## Predictive Power Score from distance matrices

In [162]:
pprintdf(do_ppscore)

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.9998,0.9977,,,,
Control with Clustal Omega,N/gray_b,0.9998,0.9977,,0.9968,,
Control with Clustal Omega,N/gray_g,0.9998,0.9977,,0.9968,,
Control with Clustal Omega,N/gray_r,0.9998,0.9977,,,,
Local with Smith–Waterman,N/full,,,0.8584,,0.9925,0.9897
Local with Smith–Waterman,N/gray_b,,,0.8584,,0.9925,0.9897
Local with Smith–Waterman,N/gray_g,,,0.8584,,0.9925,0.9897
Local with Smith–Waterman,N/gray_r,,,0.8584,,0.9925,0.9897
Local with Smith–Waterman,P/full,0.9643,0.9359,0.8245,,0.8997,0.9757
Unrestricted Sliced Structural Similarity Index Measure,N/full,,,,0.9812,,


## Robinson-foulds distance from generated Trees

In [164]:
pprintdf(do_rf, control_name="Control with Clustal Omega.nw")

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.0,0.0,,,,
Control with Clustal Omega,N/gray_b,0.0,0.0,,0.0,,
Control with Clustal Omega,N/gray_g,0.0,0.0,,0.0,,0.0
Control with Clustal Omega,N/gray_r,0.0,0.0,,,,
Greedy Sliced Structural Similarity Index Measure,N/full,,,,0.0,,
Greedy Sliced Structural Similarity Index Measure,N/gray_r,,,,0.0,,
Greedy Sliced Structural Similarity Index Measure,P/full,,,,1.0,,
Universal Quality Index,N/full,,,0.9167,,0.0,1.0
Universal Quality Index,N/gray_b,,,0.9167,,0.1667,0.0833
Universal Quality Index,N/gray_g,,,0.9167,,0.0833,


In [166]:
pprintdf(do_rf, control_name="Global with Needleman-Wunsch.nw")

NewickError: Unexisting tree file or Malformed newick tree structure.
You may want to check other newick loading flags like 'format' or 'quoted_node_names'.

In [167]:
pprintdf(do_rf, control_name="Local with Smith–Waterman.nw")

NewickError: Unexisting tree file or Malformed newick tree structure.
You may want to check other newick loading flags like 'format' or 'quoted_node_names'.

## Compatibility branch score from generated Trees

In [172]:
pprintdf(do_branch_score, control_name="Control with Clustal Omega.nw")

Unnamed: 0_level_0,dataset,indelible,orthologs_androglobin,orthologs_cytoglobin,orthologs_hemoglobin_beta,orthologs_myoglobin,orthologs_neuroglobin
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Control with Clustal Omega,N/full,0.0,0.0,,,,
Control with Clustal Omega,N/gray_b,0.0,0.0,,0.0,,
Control with Clustal Omega,N/gray_g,0.0,0.0,,0.0,,0.0
Control with Clustal Omega,N/gray_r,0.0,0.0,,,,
Greedy Sliced Structural Similarity Index Measure,N/full,,,,0.0,,
Greedy Sliced Structural Similarity Index Measure,N/gray_r,,,,0.0,,
Greedy Sliced Structural Similarity Index Measure,P/full,,,,0.4615,,
Universal Quality Index,N/full,,,0.4231,,0.0,0.4615
Universal Quality Index,N/gray_b,,,0.4231,,0.0769,0.0385
Universal Quality Index,N/gray_g,,,0.4231,,0.0385,
