In [1]:
import pandas as pd
import tskit as tsk
from IPython.display import Markdown, display
import numpy as np
import matplotlib.pyplot as plt
from multiprocessing import Pool

%run -i "../isRecipMonophyletic.py"

# Load the tree sequence file; `ts` is the input from which the per-sample and per-tree tables below are built.
ts = tsk.load("../../data/sim/IM_m0_T1_chr1_1e4.trees")


In [2]:
# Population ID of every sample node, one row per sample in the order given by ts.samples()
# (rows are positional: row i describes the i-th sample).
# ts.node(leaf) is used instead of ts.tables.nodes[leaf]: evaluating `ts.tables` inside the
# comprehension fetches the whole table collection again for every sample (and some tskit
# versions copy it on each access), which is needless work for reading a single field.
pop_by_node = pd.DataFrame({
    "pop": [ts.node(leaf).population for leaf in ts.samples()],
})

In [3]:
# Per-tree summary table, indexed by tree index, that is easier to manipulate than tskit's own tables:
#   span    -> tree.span of each tree
#   bounds  -> (left, right) coordinates of the tree's interval
#   treeObj -> the tree objects returned by ts.aslist()
pd_sequence = pd.DataFrame(
    data=dict(
        span=[t.span for t in ts.trees()],
        bounds=[(t.interval.left, t.interval.right) for t in ts.trees()],
        treeObj=ts.aslist(),
    ),
    index=[t.index for t in ts.trees()],
)

### Calculate a single value for the whole sequence

In [4]:
# Apply the reciprocal-monophyly test to every stored tree and keep the outcome
# in a new 'monophyletic' column (one value per tree).
pd_sequence['monophyletic'] = pd_sequence['treeObj'].map(
    lambda tree: isRecipMonophyletic(tree, pop_by_node)
)

In [5]:
# Span-weighted share of the sequence covered by monophyletic trees:
# summed span of the trees flagged True divided by the summed span of all trees.
# Note that the stored value is a fraction; it is scaled by 100 only for display.
percentage = (
    pd_sequence.loc[pd_sequence['monophyletic'].eq(True), 'span'].sum()
    / pd_sequence['span'].sum()
)

# Render the result as a Markdown heading
display(Markdown(
    f"### Result : the sequence is monophyletic in **{str(percentage * 100)} %** of the sequence."
))

### Result : the sequence is monophyletic in **12.4586 %** of the sequence.

### Bootstrap approach

In [6]:
n_threads = 2       # number of worker processes in the pool
n_samples = 10      # number of bootstrap replicates to draw
bootstrap_seed = 0  # base seed: replicate i is drawn with seed bootstrap_seed + i


def f(seed):
    """Compute the monophyletic fraction of one bootstrap replicate.

    A replicate is made of len(pd_sequence) trees drawn with replacement from
    pd_sequence. The reciprocal-monophyly test is run on every drawn tree and the
    span-weighted fraction of trees that pass is returned.

    Parameters
    ----------
    seed : int
        Seed handed to DataFrame.sample. It must differ between replicates:
        without an explicit seed every draw comes from the global NumPy
        generator, and worker processes created by fork start from identical
        copies of that generator, so different workers would return the very
        same "random" replicates.

    Returns
    -------
    float
        Summed span of the monophyletic trees divided by the summed span of
        all trees in the replicate.
    """
    bootstrap = pd_sequence.sample(n=len(pd_sequence), replace=True, random_state=seed)
    # Run the test on each resampled tree
    bootstrap['monophyletic'] = bootstrap.apply(
        lambda row: isRecipMonophyletic(row.treeObj, pop_by_node), axis=1
    )
    is_monophyletic = bootstrap['monophyletic'] == True
    return bootstrap.loc[is_monophyletic, 'span'].sum() / bootstrap['span'].sum()


# NOTE(review): f lives in the notebook namespace, so this presumably relies on a start
# method where workers inherit that namespace (fork) - confirm on the target platform.
# The `with` block shuts the worker processes down once all replicates are back,
# instead of leaving the pool open for the rest of the session.
with Pool(n_threads) as p:  # pool of n_threads workers
    # One task per replicate; each task receives its own distinct seed
    percentages = p.map(f, range(bootstrap_seed, bootstrap_seed + n_samples))

In [9]:
# Hold the bootstrap results as a NumPy array
percentages = np.asarray(percentages)

# 5th and 95th percentiles of the bootstrap values, i.e. the bounds of the central 90 % interval
np.percentile(percentages, q=[5, 95])

array([0.11315771, 0.13052051])