In [36]:
import pandas as pd
import tskit as tsk
from IPython.display import Markdown, display
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool

# Execute the helper script inside this notebook's namespace (-i), so the names
# it defines become available to the cells below.
# NOTE(review): presumably this is where isRecipMonophyletic comes from — confirm.
%run -i "../isRecipMonophyletic.py"

# Tree sequence analysed by every subsequent cell.
ts = tsk.load("../../data/real/cgal379.trees")
# Passed as the last argument of isRecipMonophyletic in the cells below.
ancestral_pop_id = -1 # -1 if no ancestral pop


In [37]:
# Fixed colour palette: one entry per population, reused cyclically when there
# are more populations than colours.
predefined_colors = ["red", "blue", "green", "purple", "orange", "yellow", "brown", "pink"] # Maximum 8 populations...
palette_size = len(predefined_colors)

# Map each sample node id to the colour of the population it belongs to.
node_colours = {}
for node in ts.nodes():
    if not node.is_sample():
        continue  # only sample nodes get a colour
    node_colours[node.id] = predefined_colors[node.population % palette_size]



In [38]:
# One row per sample node (in ts.samples() order) holding its population id.
# ts.node(u) is used rather than ts.tables.nodes[u]: the `tables` property is
# documented as currently returning a copy of the tables, so evaluating it inside
# the comprehension could copy the whole table collection once per sample.
pop_by_node = pd.DataFrame({
    "pop": [ts.node(sample_id).population for sample_id in ts.samples()],
})

In [39]:
# Create standalone pandas DataFrame for easier manipulation of data than tskit's tables...

# Build the list of trees once and derive every column from it, instead of
# walking the tree sequence again for each column.
tree_list = ts.aslist()

# One row per tree, indexed by the tree's index in the sequence:
#   span    - value of tree.span
#   bounds  - (left, right) ends of tree.interval
#   treeObj - the tskit Tree object itself
pd_sequence = pd.DataFrame(
    {
        'span': [tree.span for tree in tree_list],
        'bounds': [(tree.interval.left, tree.interval.right) for tree in tree_list],
        'treeObj': tree_list,
    },
    index=[tree.index for tree in tree_list],
)

### Calculate one value

In [19]:
# Apply the reciprocal-monophyly test to every tree and store the outcome of each
# test in the 'monophyletic' column (aligned on the tree index).
pd_sequence['monophyletic'] = pd_sequence['treeObj'].map(
    lambda tree: isRecipMonophyletic(tree, pop_by_node, ancestral_pop_id)
)

In [16]:
# Span-weighted share of the sequence whose trees passed the test.
# Note: `percentage` is a fraction in [0, 1]; it is scaled by 100 only for display.
is_monophyletic = pd_sequence['monophyletic'].eq(True)
monophyletic_span = pd_sequence.loc[is_monophyletic, 'span'].sum()
total_span = pd_sequence['span'].sum()
percentage = monophyletic_span / total_span

# Render the result as a markdown heading.
display(Markdown(f"### Result : the sequence is monophyletic in **{percentage * 100!s} %** of the sequence."))

### Result : the sequence is monophyletic in **0.0 %** of the sequence.

### Bootstrap approach

In [41]:
n_threads = 2       # number of worker processes in the pool
n_samples = 10      # number of bootstrap replicates
bootstrap_seed = 0  # base seed; replicate i is resampled with seed bootstrap_seed + i

def f(_) : # Define util to be run in parallel
    """Run one bootstrap replicate and return its monophyletic fraction.

    Parameters
    ----------
    _ : int
        Index of the replicate (the value mapped from ``range(n_samples)``).
        It is used as an offset to ``bootstrap_seed`` so that every replicate
        draws a different, reproducible resample.

    Returns
    -------
    float
        Summed span of the resampled trees that pass the monophyly test divided
        by the summed span of all resampled trees (a fraction, not scaled by 100).
    """
    replicate = _
    # An explicit per-replicate seed is required: without random_state, pandas
    # draws from NumPy's global RNG, whose state is inherited unchanged by every
    # forked worker, so different workers would produce identical resamples.
    bootstrap = pd_sequence.sample(
        n=len(pd_sequence),
        replace=True,
        random_state=bootstrap_seed + replicate,
    ) # Resample the trees with replacement
    bootstrap['monophyletic'] = bootstrap.apply(lambda x: isRecipMonophyletic(x.treeObj, pop_by_node, ancestral_pop_id), axis=1) # Runs the test
    passed = bootstrap['monophyletic'] == True
    return bootstrap.loc[passed, 'span'].sum() / bootstrap['span'].sum()

# Run the n_samples replicates across n_threads worker processes; the context
# manager shuts the pool down once map() has returned so no workers are leaked.
with Pool(n_threads) as p:
    percentages = p.map(f, range(n_samples))

In [42]:
# Turn the list of bootstrap fractions into an array and report its 5th and 95th
# percentiles, i.e. the bounds enclosing the central 90 % of the replicates.
percentile_bounds = [5, 95]
percentages = np.array(percentages)

np.percentile(percentages, q=percentile_bounds)

array([0., 0.])

### Export results