In [47]:
import dendropy
import numpy as np
import statistics
from collections import Counter
from collections import defaultdict

from io import StringIO
from re import *

import json
import datetime as dt

## Prune full tree to first epoch

In [121]:
tree = dendropy.Tree.get(path="full_epidemic_tree.mcc", schema='nexus')

In [122]:
for node in tree.preorder_node_iter():
    if node.distance_from_root() == 0:
        root = node

In [123]:
keep = []
all_seqs = []
for node in tree.leaf_node_iter():
    if node.distance_from_root() <= 0.3866:
        keep.append(node.taxon.label)
    all_seqs.append(node.taxon.label)
    

In [124]:
print(len(keep))
print(len(all_seqs))

214
1284


In [125]:
tree.retain_taxa_with_labels(keep)

In [126]:
count = 0
for i in tree.leaf_iter():
    count += 1
print(count)

214


  


In [229]:
tree.write(path="pruned_tree.nexus", schema="nexus")

## Branch length

- max_H = Sum of branch lengths between root and farthest leaf
- min_H = sum of branch lengths between root and closest leaf
- a_BL_mean = mean length of all branches
- a_BL_median = median
- a_BL_var = variance
- e_BL_mean = mean length of external branches
- e_BL_median = median len of external branches
- e_BL_var = variance
- i_BL_mean = internal branch len mean
- i_BL_median
- i_BL_var

In [127]:
#max_h and min_h
path_lens = []
for i in tree.leaf_node_iter():
    path_lens.append(i.distance_from_root())
    
print(max(path_lens))
print(min(path_lens))

0.3859675963699254
0.11747444568536669


In [128]:
# all branches
all_branch = []
for i in tree.preorder_node_iter():
    if i.edge_length:
        all_branch.append(i.edge_length)
    
print(np.mean(all_branch))
print(np.median(all_branch))
print(np.var(all_branch))

0.035080050967117304
0.021939933283573243
0.0013770629364285776


In [129]:
#external branches
e_branch_lens = []
for i in tree.leaf_node_iter():
    e_branch_lens.append(i.edge_length)
    
print(np.mean(e_branch_lens))
print(np.median(e_branch_lens))
print(np.var(e_branch_lens))

0.051805151396388485
0.04441308703686353
0.001384042360694057


In [130]:
# internal branches
i_branch_lens = []
for i in tree.preorder_node_iter():
    if not i.taxon and i.edge_length:
        i_branch_lens.append(i.edge_length)
        
print(np.mean(i_branch_lens))
print(np.median(i_branch_lens))
print(np.var(i_branch_lens))

0.018197166571532235
0.0072969106972364806
0.0008026179459713262


In [131]:
# ratios
print(np.mean(i_branch_lens)/np.mean(e_branch_lens))
print(np.median(i_branch_lens)/np.median(e_branch_lens))
print(np.var(i_branch_lens)/np.var(e_branch_lens))

0.35126171975246506
0.16429640865044878
0.5799085120262046


## Topology set

In [340]:
node_to_all_children = defaultdict(list)
  
for node in tree.postorder_node_iter():
    if not node.taxon:
        node_to_all_children[node].extend(node._child_nodes)
        for child in node._child_nodes:
            node_to_all_children[node].extend(node_to_all_children[child])
    else:
        node_to_all_children[node] = []




In [346]:
differences = []

for node in tree.preorder_node_iter():
    if not node.taxon:
        direct_children = node._child_nodes
        left = direct_children[0]
        right = direct_children[1]
        left_count = 0
        right_count = 0
        if not left.taxon :
            for query in node_to_all_children[left]:
                if query.taxon:
                    left_count += 1
        else:
            left_count += 1
            
        if not right.taxon:
            for query in node_to_all_children[right]:
                if query.taxon:
                    right_count += 1
        else:
            right_count += 1

        differences.append(abs(left_count - right_count))
        
print(sum(differences))

2168


In [345]:
#staircaseness

uneven = 0
ratio_list = []

for node in tree.preorder_node_iter():
    if not node.taxon:
        direct_children = node._child_nodes
        left = direct_children[0]
        right = direct_children[1]
        left_count = 0
        right_count = 0
        if not left.taxon :
            for query in node_to_all_children[left]:
                if query.taxon:
                    left_count += 1
        else:
            left_count += 1
            
        if not right.taxon:
            for query in node_to_all_children[right]:
                if query.taxon:
                    right_count += 1
        else:
            right_count += 1

                
        if left_count != right_count:
            uneven += 1
        
        
        if left_count < right_count:
            ratio = left_count/right_count
        else:
            ratio = right_count/left_count
       


        ratio_list.append(ratio)
            
print(uneven/len(all_internals))
print(np.mean(ratio_list))

0.704225352112676
0.5386550674701448


In [349]:
leaf_paths = []
def get_full_path(lst,node, count):
    if node == root:
        lst.append(count)
        return
    else:
        count += 1
        get_full_path(lst, node.parent_node, count)

            
for leaf in tree.leaf_iter():
    get_full_path(leaf_paths,leaf,0)
    
print(sum(leaf_paths))

3204


  # This is added back by InteractiveShellApp.init_path()


In [168]:
#Width ones

all_paths = []
for node in tree.preorder_node_iter():
    get_full_path(all_paths, node, 0)

depths = []
for i in all_paths:
    depth = i+1
    depths.append(depth)
max_depth = max(depths)
    
widths = Counter(depths)
max_width = max(widths.items(), key = lambda k : k[1])[0]

WD_ratio = max_width/max_depth
print(WD_ratio)


0.75
18


In [197]:
index = 0
diffs = []
for depth, count in widths.items():
    if index > 0:
        width_diff = abs(count - widths[index-1])
        diffs.append(width_diff)
    index += 1

print(max(diffs))

12


In [350]:
##ladder ones

def go_up_ladder(node, node_set, ladder, ladder_list):
    
    if node == root:
        return
            
    if len(node.sibling_nodes()) == 1:
        if node.taxon:
            if not node.sibling_nodes()[0].taxon:
                if node.parent_node not in node_set:
                    ladder.append(node.parent_node)
                    node_set.add(node.parent_node)
                    go_up_ladder(node.parent_node, node_set, ladder, ladder_list)
                else:
                    ladder_list.append(ladder)
                    return
            else:
                ladder_list.append(ladder)
                return
        else:
            if node.sibling_nodes()[0].taxon:
                if node.parent_node not in node_set:
                    ladder.append(node.parent_node)
                    go_up_ladder(node.parent_node,node_set, ladder, ladder_list)
                    node_set.add(node.parent_node)
                else:
                    ladder_list.append(ladder)
                    return
            else:
                ladder_list.append(ladder)
                return
    else:
        print('here')
        ladder_list.append(ladder)
        return

In [351]:
node_set = set()
ladder_list = []
for leaf in tree.leaf_iter():
    go_up_ladder(leaf, node_set, [], ladder_list)

  This is separate from the ipykernel package so we can avoid doing imports until


In [241]:
print(max([len(i) for i in ladder_list])/len(keep))


0.014018691588785047


In [352]:
ladders = [i for i in ladder_list if len(i) > 0]
print(len(ladders))

70


In [235]:
in_ladders = []
for lst in ladder_list:
    for node in lst:
        if not node.taxon:
            in_ladders.append(node)
print(len(in_ladders))
all_internals = []
for i in tree.preorder_node_iter():
    if not i.taxon:
        all_internals.append(i)
print(len(all_internals))
print(len(in_ladders)/len(all_internals))

96
213
0.4507042253521127


## LTT set

In [360]:
full_len = 0.3859675963699254


In [361]:
edge_dict = {}
all_starts = set()

for edge in tree.preorder_edge_iter():
    if edge.tail_node:
        end = full_len - (edge.tail_node.distance_from_root())
        start = full_len - (edge.head_node.distance_from_root())
        
        edge_dict[edge] = (start,end)
        
        all_starts.add(start)
        all_starts.add(end) #needed for the heavy sampling bit at the end
        
all_starts = sorted(all_starts)

In [362]:
bins = []
for count,i in enumerate(all_starts):
    try:
        end = all_starts[count+1]
        bins.append((i,end))
    except IndexError:
        pass
    

In [363]:
bin_dict = {}

for bin_pair in bins:
    bin_dict[bin_pair] = 0
    for edge, timings in edge_dict.items():
        start = timings[0]
        end = timings[1]
        if start <= bin_pair[0] and end > bin_pair[0]:
            bin_dict[bin_pair] += 1
    

In [401]:
t_max_L = max(bin_dict.items(), key = lambda k : k[1])[0][0]
max_L = max(bin_dict.items(), key = lambda k : k[1])[1]

print(max_L)
print(t_max_L)

95
0.21369863013683843


In [367]:
## slopes

peak_y = 95
peak_x = 0.21369863013683843

start_y = 2
start_x = 0

end_y = 2
end_x = 0.3859675963699254

In [368]:
slope_1 = (peak_y-start_y)/(peak_x-start_x)
slope_2 = (peak_y-end_y)/(end_x-peak_x)

print(slope_1)
print(slope_2)
print(slope_1/slope_2)

435.19230769260884
539.8534746773089
0.8061304189117982


In [369]:
sampling_times = []
branching_times = []
for node in tree.preorder_node_iter():
    if node.taxon:
        sampling_times.append(node.distance_from_root())
    else:
        branching_times.append(node.distance_from_root())

sampling_times = sorted(sampling_times)
branching_times = sorted(branching_times)

In [370]:
sampling_diffs = []
for count,i in enumerate(sampling_times):
    try:
        next_one = sampling_times[count + 1]
        sampling_diffs.append(next_one - i)
    except IndexError:
        pass

branching_diffs = []
for count,i in enumerate(branching_times):
    try:
        next_one = branching_times[count + 1]
        branching_diffs.append(next_one - i)
    except IndexError:
        pass
    
print(np.mean(sampling_diffs))
print(np.mean(branching_diffs))

0.0012605312238711675
0.0016789798767443802


In [376]:
bin_size = full_len/20

new_bin = 0
new_bin_list = []
new_tup_list = []
for i in range(20):
    new_bin_list.append(new_bin)
    new_tup_list.append((new_bin, new_bin+bin_size))

    new_bin += bin_size
    
    
    
print(new_bin_list)

[0, 0.019298379818496268, 0.038596759636992536, 0.057895139455488805, 0.07719351927398507, 0.09649189909248135, 0.11579027891097762, 0.1350886587294739, 0.15438703854797017, 0.17368541836646645, 0.19298379818496272, 0.212282178003459, 0.23158055782195527, 0.25087893764045155, 0.2701773174589478, 0.28947569727744404, 0.3087740770959403, 0.32807245691443654, 0.3473708367329328, 0.36666921655142903]


In [394]:
new_lins = defaultdict(list)
average_lins = {}

for new_bin in new_tup_list:
    for old_bin in bins:
        start1 = old_bin[0]
        start2 = new_bin[0]
        end1 = old_bin[1]
        end2 = new_bin[1]
        
        if start1 >= start2 and end1 <= end2:
            frac = (end1-start1)/bin_size
        elif start1 >= start2 and end1 > end2 and start1 < end2:
            frac = (end2-start1)/bin_size
        elif start1 < start2 and end1 <= end2 and end1 > start2:
            frac = (end1-start2)/bin_size
        else:
            continue
        
        new_lins[new_bin].append(frac*bin_dict[old_bin])
                
for new_bin, totals in new_lins.items():
    average_lins[new_bin] = np.mean(totals)


In [372]:
new_bin_dict = {}

for bin_start in new_bin_list:
    new_bin_dict[bin_start] = 0
    for edge, timings in edge_dict.items():
        start = timings[0]
        end = timings[1]
        if start <= bin_start and end > bin_start:
            new_bin_dict[bin_start] += 1