# Comparing channels

## Nucleotides
- R: sequential
- G: complementary
- B: no match at all

## Proteins
- R: sequential by ProtSub
- G: sequential match
- B: sequential by Sneath similarity

In [19]:
import pandas
import os
import numpy
from IPython.core.display import display, HTML
from ete3 import Tree
from ete3.parser.newick import NewickError

  from IPython.core.display import display, HTML


In [20]:
# Root of the inferred trees; read_and_compare looks in <tree_path>/<channel>/<dataset>/ for *.nw files.
tree_path = "../../data/trees"
# Root of the reference trees; read_and_compare loads <control_path>/<dataset>/tree.newick.
control_path = "../../data/afproject/swisstree"
# Channel sub-directories (relative to tree_path) whose trees are scored.
channels = (
    "P/full",
    "P/gray_r",
    "P/gray_g",
    "P/gray_b",
)
# Dataset directory names to score; note that ST006 is not in the list.
datasets = [
    "ST001",
    "ST002",
    "ST003",
    "ST004",
    "ST005",
    "ST007",
    "ST008",
    "ST009",
    "ST010",
    "ST011",
    "ST012",
]

In [21]:
def do_rf(control, compare):
    """Return the ``norm_rf`` value of comparing ``control`` with ``compare``.

    The trees are compared as unrooted (``unrooted=True``) and the
    ``"norm_rf"`` entry of the comparison result (the normalized
    Robinson-Foulds distance in ete3) is rounded to four decimal places.

    Args:
        control: tree object exposing ``compare(other, unrooted=...)``.
        compare: tree handed to ``control.compare`` as the other tree.

    Returns:
        The rounded ``norm_rf`` value.
    """
    comparison = control.compare(compare, unrooted=True)
    normalized_rf = comparison["norm_rf"]
    return round(normalized_rf, 4)

def do_branch_score(control, compare):
    """Return one minus the ``source_edges_in_ref`` value of the comparison.

    The trees are compared as unrooted (``unrooted=True``). The result is
    rounded *after* the subtraction so the returned value carries at most
    four decimals; rounding first and subtracting afterwards can reintroduce
    binary floating-point noise (e.g. ``0.03610000000000002``).

    NOTE(review): ``source_edges_in_ref`` is presumably the fraction of
    ``control``'s edges also present in ``compare`` (see the ete3
    ``Tree.compare`` documentation), which would make this a "fraction of
    missing edges" rather than a branch-length based score -- confirm.

    Args:
        control: tree object exposing ``compare(other, unrooted=...)``.
        compare: tree handed to ``control.compare`` as the other tree.

    Returns:
        ``1 - source_edges_in_ref`` rounded to four decimal places.
    """
    shared_fraction = control.compare(compare, unrooted=True)["source_edges_in_ref"]
    return round(1.0 - shared_fraction, 4)

def read_and_compare(dataset, channel, score_func):
    """Score every ``.nw`` tree of one dataset/channel against its control tree.

    Reads the module-level ``tree_path`` and ``control_path`` settings: the
    candidate trees are the ``*.nw`` files in
    ``<tree_path>/<channel>/<dataset>`` and the control tree is
    ``<control_path>/<dataset>/tree.newick``.

    Args:
        dataset: dataset directory name.
        channel: channel sub-path below ``tree_path``.
        score_func: callable invoked as ``score_func(control, compare)``
            whose return value is stored as the score.

    Returns:
        A dict with the keys ``"algorithm"``, ``"score"``, ``"channel"`` and
        ``"dataset"``, each mapping to a list with one entry per tree file
        that was read and scored successfully. ``"algorithm"`` holds the
        file name without its ``.nw`` extension.

    Raises:
        Whatever ``Tree`` raises for the control tree and ``os.listdir``
        raises for a missing directory; only per-file ``FileNotFoundError``
        and ``NewickError`` are caught and reported.
    """
    result_dict = dict(
        algorithm=[],
        score=[],
        channel=[],
        dataset=[]
    )
    path = f"{tree_path}/{channel}/{dataset}"
    control = Tree(f"{control_path}/{dataset}/tree.newick", format=1)
    # os.listdir returns entries in arbitrary order; sort so that processing
    # (and the order of any error messages) is reproducible between runs.
    for file in sorted(os.listdir(path)):
        if not file.endswith(".nw"):
            continue
        basename = ".".join(file.split(".")[0:-1])
        try:
            compare = Tree(f"{path}/{file}", format=1)
            score = score_func(control, compare)
        except (FileNotFoundError, NewickError) as e:
            # Report the reason too, then skip this file (best effort).
            print(f"Error with file {path}/{file}: {e}")
            continue
        # Append only once both steps succeeded so the four lists always
        # stay the same length.
        result_dict["algorithm"].append(basename)
        result_dict["score"].append(score)
        result_dict["channel"].append(channel)
        result_dict["dataset"].append(dataset)
    return result_dict

def pprintdf(score_func, max_rows=50):
    """Build a score table for all configured channels and datasets.

    Iterates over the module-level ``channels`` and ``datasets``, collects
    the results of ``read_and_compare`` for each pair and pivots them into a
    table indexed by ``(algorithm, channel)`` with one column per dataset.

    Args:
        score_func: callable forwarded to ``read_and_compare``; invoked as
            ``score_func(control, compare)``.
        max_rows: maximum number of rows returned (default 50, the
            previously hard-coded limit). Pass ``None`` for the full table.

    Returns:
        A pandas DataFrame of scores, truncated to ``max_rows`` rows.
    """
    columns = ("algorithm", "score", "channel", "dataset")
    result_dict = {column: [] for column in columns}
    for channel in channels:
        for dataset in datasets:
            partial = read_and_compare(dataset, channel, score_func)
            for column in columns:
                result_dict[column].extend(partial[column])
    df = pandas.DataFrame(result_dict)
    table = df.pivot(index=["algorithm", "channel"], columns="dataset", values="score")
    if max_rows is None:
        return table
    return table.head(max_rows)

In [22]:
# Rounded norm_rf score per (algorithm, channel) row and dataset column,
# using the channels/datasets currently set at module level.
pprintdf(do_rf)

Unnamed: 0_level_0,dataset,ST001,ST002,ST003,ST004,ST005,ST007,ST008,ST009,ST010,ST011,ST012
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Control with Clustal Omega,P/full,0.2099,0.1333,0.4167,0.1875,0.1154,0.2727,0.5641,0.6111,0.1935,0.3013,0.3529
Control with Clustal Omega,P/gray_b,0.2099,0.1333,0.4167,0.1875,0.1154,0.2727,0.5641,0.6111,0.1935,0.3013,0.3529
Control with Clustal Omega,P/gray_g,0.2099,0.1333,0.4167,0.1875,0.1154,0.2727,0.5641,0.6111,0.1935,0.3013,0.3529
Control with Clustal Omega,P/gray_r,0.2099,0.1333,0.4167,0.1875,0.1154,0.2727,0.5641,0.6111,0.1935,0.3013,0.3529
Global with Needleman-Wunsch,P/full,0.4074,0.2444,0.3889,0.3482,0.6923,0.5152,0.7949,0.7222,0.6452,0.6538,0.4118
Global with Needleman-Wunsch,P/gray_b,0.4074,0.2444,0.3889,0.3482,0.6923,0.5152,0.7949,0.7222,0.6452,0.6538,0.4118
Global with Needleman-Wunsch,P/gray_g,0.4074,0.2444,0.3889,0.3482,0.6923,0.5152,0.7949,0.7222,0.6452,0.6538,0.4118
Global with Needleman-Wunsch,P/gray_r,0.4074,0.2444,0.3889,0.3482,0.6923,0.5152,0.7949,0.7222,0.6452,0.6538,0.4118
Local with Smith–Waterman,P/full,0.2099,0.1778,0.3333,0.2946,0.3462,0.3333,0.7436,0.5556,0.4839,0.4231,0.4118
Local with Smith–Waterman,P/gray_b,0.2099,0.1778,0.3333,0.2946,0.3462,0.3333,0.7436,0.5556,0.4839,0.4231,0.4118


In [25]:
# 1 - source_edges_in_ref per (algorithm, channel) row and dataset column,
# using the channels/datasets currently set at module level.
pprintdf(do_branch_score)

Unnamed: 0_level_0,dataset,ST001,ST002,ST003,ST004,ST005,ST007,ST008,ST009,ST010,ST011,ST012
algorithm,channel,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Control with Clustal Omega,P/full,0.0361,0.0,0.0676,0.0929,0.0556,0.0594,0.275,0.2973,0.0938,0.1497,0.1389
Control with Clustal Omega,P/gray_b,0.0361,0.0,0.0676,0.0929,0.0556,0.0594,0.275,0.2973,0.0938,0.1497,0.1389
Control with Clustal Omega,P/gray_g,0.0361,0.0,0.0676,0.0929,0.0556,0.0594,0.275,0.2973,0.0938,0.1497,0.1389
Control with Clustal Omega,P/gray_r,0.0361,0.0,0.0676,0.0929,0.0556,0.0594,0.275,0.2973,0.0938,0.1497,0.1389
Global with Needleman-Wunsch,P/full,0.1325,0.0543,0.0541,0.1726,0.3333,0.1782,0.3875,0.3514,0.3125,0.3248,0.1667
Global with Needleman-Wunsch,P/gray_b,0.1325,0.0543,0.0541,0.1726,0.3333,0.1782,0.3875,0.3514,0.3125,0.3248,0.1667
Global with Needleman-Wunsch,P/gray_g,0.1325,0.0543,0.0541,0.1726,0.3333,0.1782,0.3875,0.3514,0.3125,0.3248,0.1667
Global with Needleman-Wunsch,P/gray_r,0.1325,0.0543,0.0541,0.1726,0.3333,0.1782,0.3875,0.3514,0.3125,0.3248,0.1667
Local with Smith–Waterman,P/full,0.0361,0.0217,0.027,0.146,0.1667,0.0891,0.3625,0.2703,0.2344,0.2102,0.1667
Local with Smith–Waterman,P/gray_b,0.0361,0.0217,0.027,0.146,0.1667,0.0891,0.3625,0.2703,0.2344,0.2102,0.1667


In [16]:
# Rebind the module-level settings read by read_and_compare/pprintdf so the
# following cells score the "fish_mito" dataset instead. tree_path is left
# unchanged. NOTE(review): because these are globals, re-running earlier
# pprintdf cells after this one will silently use these values.
control_path = "../../data/afproject"
# Only the "N/full" channel is enabled; the gray channels
# ("N/gray_r", "N/gray_g", "N/gray_b") are currently disabled.
channels = ("N/full",)
datasets = ["fish_mito"]

In [17]:
# Rounded norm_rf score per (algorithm, channel) row and dataset column,
# using the channels/datasets currently set at module level.
pprintdf(do_rf)

Unnamed: 0_level_0,dataset,fish_mito
algorithm,channel,Unnamed: 2_level_1
Control with Clustal Omega,N/full,0.0909
Global with Needleman-Wunsch,N/full,0.2273
Local with Smith–Waterman,N/full,0.0909
Resized Structural Similarity Index Measure,N/full,0.8636
Windowed MultiScale Structural Similarity Index Measure,N/full,0.9545


In [18]:
# 1 - source_edges_in_ref per (algorithm, channel) row and dataset column,
# using the channels/datasets currently set at module level.
pprintdf(do_branch_score)

Unnamed: 0_level_0,dataset,fish_mito
algorithm,channel,Unnamed: 2_level_1
Control with Clustal Omega,N/full,0.0435
Global with Needleman-Wunsch,N/full,0.1087
Local with Smith–Waterman,N/full,0.0435
Resized Structural Similarity Index Measure,N/full,0.413
Windowed MultiScale Structural Similarity Index Measure,N/full,0.4565
