In [1]:
import pandas as pd
import tskit as tsk
from IPython.display import Markdown, display
import numpy as np
import matplotlib.pyplot as plt
# from pathos.multiprocessing import ProcessingPool as Pool
from multiprocessing import Pool
from datetime import datetime
import json
import functools

%run -i "../isRecipMonophyletic.py"

# filename = "Cgale_87_dated.trees"
# filepath = "../../data/real"

filename = "cgal379.trees"
filepath = "../../data/real/"

startTime = datetime.now()

ts = tsk.load(filepath + filename)
ancestral_pop_id = -1 # -1 if no ancestral pop


In [2]:
# Define a list of predefined colors
predefined_colors = ["red", "blue", "green", "purple", "orange", "yellow", "brown", "pink"] # Maximum 8 populations...

# Initialize node_colours dictionary
node_colours = {}

for node_index, node in enumerate(ts.tables.nodes):
	if (node.flags & tsk.NODE_IS_SAMPLE) != 0: # If node is a sample
		
		# Assign color from predefined list, cycling through colors if necessary
		color = predefined_colors[node.population % len(predefined_colors)]
		node_colours[node_index] = color



In [3]:
pop_groups = [
    [0, 1],
    [2, 3]
]

pop_by_node = pd.DataFrame({
    "pop": [ts.tables.nodes[leaf].population for leaf in ts.samples()], # Get the population of each leaf
})

In [4]:
# Create standalone pandas DataFrame for easier manipulation of data than tskit's tables...

pd_sequence = pd.DataFrame(
    {
        'span': [tree.span for tree in ts.trees()],
        'bounds': [(tree.interval.left, tree.interval.right) for tree in ts.trees()],
        'treeObj' : ts.aslist()
    },
    index=[tree.index for tree in ts.trees()]
)

### Bootstrap approach

In [5]:
n_threads = 80
n_samples = 80

def f(i) : # Define util to be run in parallel
    np.random.seed(i + np.random.randint(0, 10000))
    bootstrap = pd_sequence.sample(n=int(len(pd_sequence) * 0.9), replace=True) # Use pandas sample method to take random trees in the sequence for monophyly test
    bootstrap['monophyletic'] = bootstrap.apply(lambda x: isRecipMonophyletic(x.treeObj, pop_by_node, ancestral_pop_id, pop_groups), axis=1) # Runs the test
    return bootstrap[bootstrap['monophyletic'] == True]['span'].sum() / bootstrap['span'].sum() # Gets the percentage of the sample trees that is monophyletic

pool = Pool(n_threads) # Create a pool of 2 workers
percentages = np.array(pool.map(f, range(n_samples))) * 100 # Run bootstrap twice in parallel

### Monophyly test for the output

In [6]:
# Runs the reciprocally monophyletic test on each tree and saves the boolean result in the monophyletic column

pd_sequence['monophyletic'] = pd_sequence.apply(lambda x: isRecipMonophyletic(ts.at_index(x.name), pop_by_node, ancestral_pop_id), axis=1)

TypeError: isRecipMonophyletic() missing 1 required positional argument: 'pop_groups'

### Export results

In [7]:
endTime = datetime.now()

In [8]:
# Define output file format

output = {
    "file": filename,
    "test": "reciprocal_monophyly",
    "description": "Reciprocal monophyly test on the tree sequence",
    "analysis_settings": {
        "ancestral_pop_id": ancestral_pop_id,
        "bootstrap_samples": n_samples,
    },
    "perf" : {
        "threads": n_threads,
        "start": startTime,
        "end": endTime,
        "duration": endTime - startTime
    },
    "result": {
        "percentage": percentages.mean(),
        "stddev": percentages.std(),
        "confidence_interval": {
            "bounds": [5, 95],
            "lower": np.percentile(percentages, 5),
            "upper": np.percentile(percentages, 95)
        },
        "raw": {
            "positive": (pd_sequence['monophyletic'] == True).sum(),
            "negative": (pd_sequence['monophyletic'] == False).sum(),
            "total_trees": len(pd_sequence),
            "details": pd_sequence[['bounds', 'span', 'monophyletic']].to_dict(orient='records')
        }
    }
}

KeyError: 'monophyletic'

In [None]:
json.dump(output, default=str, indent=4, fp=open(filepath + filename + ".recip_monophyletic.json", "w"))