# Treemerge test

Mamie Wang 2021/01/22


https://github.com/ekmolloy/trees-in-the-desert-tutorial

Example usage:

python ./python/treemerge.py \
    -s ../trees-in-the-desert-tutorial/data/nj-on-agid-tre.txt \
    -t ../trees-in-the-desert-tutorial/data/astral-subset-1-outof-4-tre.txt \
       ../trees-in-the-desert-tutorial/data/astral-subset-2-outof-4-tre.txt \
       ../trees-in-the-desert-tutorial/data/astral-subset-3-outof-4-tre.txt \
       ../trees-in-the-desert-tutorial/data/astral-subset-4-outof-4-tre.txt \
    -m ../trees-in-the-desert-tutorial/data/agid-mat.txt \
    -x ../trees-in-the-desert-tutorial/data/agid-mat.txt_taxlist \
    -o treemerge-on-astral-and-agid-tre.txt \
    -w . \
    -p ./paup4a168_centos64

In [11]:
## Modify the script to save individual subtrees as newick string txt
## tree_rec.write(path="output.tre", schema="newick")
## save the distance matrix in phylip format
import sys, os

sys.path.append("/gpfs/ysm/project/kleinstein/mw957/repos/spectral-tree-inference/spectraltree")

import generation
import reconstruct_tree
import time
import utils
import pandas as pd
import argparse


# Settings shared by the simulation and reconstruction cells below.
m = 300  # first positional argument to generation.simulate_sequences (presumably the sequence length -- TODO confirm)
kappa = 2  # not referenced by any other visible cell; presumably an HKY transition/transversion ratio -- TODO confirm
mutation_rate=0.05  # forwarded to simulate_sequences as mutation_rate
threshold = 20  # forwarded to deep_spectral_tree_reconstruction as `threshhold`
verbose = False  # forwarded to deep_spectral_tree_reconstruction

# Reference ("true") tree used for simulation and for the final comparisons.
# Presumably a balanced binary tree with 256 leaves -- TODO confirm against utils.balanced_binary.
tree = utils.balanced_binary(256)

In [14]:
# Save the reference tree in Newick format; the same path is read back in
# later when the merged tree is compared against it.
filename = '/gpfs/ysm/scratch60/morgan_levine/mw957/subtrees/true_tree.txt'
tree.write(path=filename, schema="newick")

In [None]:
# Simulate DNA sequences along the reference tree with the Jukes_Cantor
# sequence model from the `generation` module.
observations, taxa_meta = generation.simulate_sequences(
    m,
    tree_model=tree,
    seq_model=generation.Jukes_Cantor(),
    mutation_rate=mutation_rate,
    alphabet="DNA",
)

# Reconstruction object built from NeighborJoining and the JC similarity matrix.
spectral_method = reconstruct_tree.SpectralTreeReconstruction(
    reconstruct_tree.NeighborJoining,
    reconstruct_tree.JC_similarity_matrix,
)

# NOTE(review): the HKY similarity function is passed here although the
# object above was constructed with the JC one -- confirm this is intended.
# The keyword `threshhold` is spelled exactly as in the original call.
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    observations,
    reconstruct_tree.HKY_similarity_matrix,
    taxa_metadata=taxa_meta,
    threshhold=threshold,
    min_split=5,
    verbose=verbose,
)

In [5]:
# Save the reconstructed (STDR) tree in Newick format.
# Original note: 17 subtrees (presumably the number of subtrees produced by
# the deep spectral reconstruction -- TODO confirm).
filename = '/gpfs/ysm/scratch60/morgan_levine/mw957/subtrees/STDR_tree.txt'
tree_rec.write(path=filename, schema="newick")

In [2]:
# Score the reconstructed tree against the reference tree; compare_trees
# returns the two values that are printed below.
RF, F1 = reconstruct_tree.compare_trees(tree_rec, tree)

for label, score in (("RF = ", RF), ("F1% = ", F1)):
    print(label, score)

RF =  6
F1% =  99.41176470588236


In [3]:
# Save the reconstructed (STDR) tree in Newick format. This cell repeats the
# earlier save of tree_rec and writes to the same path.
# Original note: 17 subtrees (presumably the number of subtrees produced by
# the deep spectral reconstruction -- TODO confirm).
filename = '/gpfs/ysm/scratch60/morgan_levine/mw957/subtrees/STDR_tree.txt'
tree_rec.write(path=filename, schema="newick")

In [4]:
# Compute the JC (presumably Jukes-Cantor) distance matrix for the simulated
# sequences and wrap it in a DataFrame for export.
distance = reconstruct_tree.JC_distance_matrix(observations, taxa_meta)
distance_pd = pd.DataFrame(data=distance)

# Taxon labels, in the iteration order of taxa_meta.
taxa_list = [taxon.label for taxon in taxa_meta]

In [5]:
# Write the taxon labels to a plain-text file, one label per line
# (presumably the file later handed to treemerge via -x -- confirm).
with open('/gpfs/ysm/scratch60/morgan_levine/mw957/subtrees/taxa.txt', 'w') as taxa_file:
    taxa_file.writelines("%s\n" % label for label in taxa_list)

In [6]:
# Use the taxon labels as row labels so each exported row starts with its taxon name.
distance_pd.index = taxa_list

In [7]:
# Write the labelled distance matrix as tab-separated text with no column
# header row; the row labels (taxon names) are kept as the first field.
distance_pd.to_csv("/gpfs/ysm/scratch60/morgan_levine/mw957/subtrees/JC_distance.txt", 
                   sep = "\t", header = False)

Add a line with the tree size (the number of taxa) at the beginning of the saved distance matrix file (`JC_distance.txt`) so that it is in PHYLIP format, then run treemerge:

In [None]:
# Shell command (not Python): merge the 17 subtree files with treemerge.
# Inputs written by the cells above: STDR_tree.txt (-s), JC_distance.txt (-m)
# and taxa.txt (-x). The output, treemerge-on-STDR-subtree.txt (-o), is read
# back in by a later cell.
# NOTE(review): the file arguments are relative paths, so this presumably has
# to be run from the subtrees output directory -- confirm.
python /gpfs/ysm/project/kleinstein/mw957/repos/treemerge/python/treemerge.py \
-s STDR_tree.txt \
-t subtree-09884766903202061374.txt \
subtree-21251064927194554684.txt \
subtree-24188037843447968671.txt \
subtree-28736709711893042751.txt \
subtree-39830484667598160137.txt \
subtree-44477749596981731588.txt \
subtree-44796723752216619513.txt \
subtree-48652436967575695961.txt \
subtree-51038809888865761993.txt \
subtree-51796522975420271178.txt \
subtree-57517081323465643407.txt \
subtree-58139183252633199315.txt \
subtree-65427439841688407262.txt \
subtree-67521679634100653874.txt \
subtree-70380690168101102136.txt \
subtree-70702737740724630041.txt \
subtree-88510064808045309115.txt \
-m JC_distance.txt \
-x taxa.txt \
-o treemerge-on-STDR-subtree.txt \
-w . \
-p /gpfs/ysm/project/kleinstein/mw957/repos/treemerge/paup4a168_centos64 

In [9]:
# read in the tree merged based on STDR trees
import dendropy

In [25]:
# Reload the reference tree from the Newick file written earlier.
tree_true = dendropy.Tree.get(path = "/gpfs/ysm/scratch60/morgan_levine/mw957/subtrees/true_tree.txt",
        schema="newick")

# Load the treemerge output into the same taxon namespace as the reference
# tree, so both trees refer to the same taxon objects when compared.
# NOTE(review): this path is rooted at /home/mw957/scratch60 while the other
# files use /gpfs/ysm/scratch60/morgan_levine/mw957 -- presumably the same
# directory; confirm.
tree_new = dendropy.Tree.get(path = "/home/mw957/scratch60/subtrees/treemerge-on-STDR-subtree.txt", 
                             schema="newick",
                             taxon_namespace=tree_true.taxon_namespace)

In [26]:
# Score the treemerge result against the reloaded reference tree; compare_trees
# returns the two values that are printed below.
RF, F1 = reconstruct_tree.compare_trees(tree_new, tree_true)

for label, score in (("RF = ", RF), ("F1% = ", F1)):
    print(label, score)

RF =  28
F1% =  97.25490196078431
