In [None]:
import importlib
import time
import matplotlib as mpl
from matplotlib import pyplot as plt
import pandas as pd
from pathlib import Path

import ete3
import Bio.Phylo

import importlib

import src.analysis_utils as au
import src.load_functions as lf
import src.distance_matrix_experiment as dme
import src.fransua_plot_eteTree.plot_eteTree
from src.fransua_plot_eteTree.plot_eteTree import plot_tree as plot_ete3_tree

import rpy2
import rpy2.robjects
import rpy2.robjects.packages

The tree distance computations use an `R` package via `rpy2`. Before the first run, the `R` package `TreeDist` must be installed---see "**Install R packages**" below.

In [None]:
# Load the R packages used by the tree-distance computations through rpy2.
# `importr` is called for 'TreeDist' first and 'ape' second.
r_packages = rpy2.robjects.packages
treedist = r_packages.importr('TreeDist')
ape = r_packages.importr('ape')

# Install R packages

This section only needs to be run if `TreeDist` is not yet installed in R.

In [None]:
# Check whether the R package `TreeDist` is installed. The call is the last
# expression of the cell, so the notebook displays its result.
rpy2.robjects.packages.isinstalled('TreeDist')

In [None]:
# Install the R package `TreeDist` through R's `utils` package.
# The installation is skipped when the package is already present, so that
# re-running the whole notebook does not trigger a reinstall.
r_utils = rpy2.robjects.packages.importr('utils')
if not rpy2.robjects.packages.isinstalled('TreeDist'):
    # `install.packages` fetches from CRAN, so pick a CRAN mirror explicitly
    # (the first entry of R's mirror list) rather than relying on a prompt.
    r_utils.chooseCRANmirror(ind=1)
    # Bioconductor mirror selection, kept as originally configured.
    r_utils.chooseBioCmirror(ind=6)
    r_utils.install_packages('TreeDist')

# Load resources

In [None]:
# Create the resources manager and pull out the objects used by the cells below.
res = lf.ResourcesManager()
ethnologue_tree, languages_all, wiki_articles_df = (
    res.ethnologue_tree,
    res.indoeuropean_languages_with_data,
    res.wiki_articles_df,
)

# Print trees

Scripts to print the Ethnologue tree and the UPGMA/NJ trees for given parameters

In [None]:
# Draw the Ethnologue tree restricted to the `num` languages with the most articles.
num = 50
ethnologue_tree_ete3 = res.get_ethnologue_with_most_articles(num)

plt.figure(figsize=(8,16))
plot_ete3_tree(
    ethnologue_tree_ete3,
    axe=plt.gca(),
    # Leaf label: language name (falling back to the code itself) followed by the code.
    label_func=lambda code: f'{ethnologue_tree.codes_to_names.get(code, code)} ({code})',
)

# Smallest value of the 'Articles' column among the rows whose 'Wiki' entry
# belongs to the selected languages; it is reported in the title.
top_languages = res.get_languages_with_most_articles(num)
is_top_language = wiki_articles_df['Wiki'].isin(top_languages)
min_articles = wiki_articles_df[is_top_language]['Articles'].min()

plt.title(f"Ethnologue tree of the top {num} languages w.r.t. # of wiki articles (>= {min_articles})")
plt.show()

In [None]:
# Reconstruct NJ and UPGMA trees from the PD-distance matrices and plot them
# side by side, one figure per (metric, dim, distance) combination.
experiment_name = 'ethnologue_10k_d2'
mats_folder = f'data/pd-distance-matrices/{experiment_name}'
n = 10000
maxdim = 2  # NOTE(review): presumably the `d2` of the experiment name; not used for file names
number_of_languages = 50


def _bold(text):
    """Wrap `text` in the mathtext markup used for bold parts of the titles."""
    return '$\\bf{' + f'{text}' + '}$'


# Loop-invariant helpers: the set of languages to keep and the leaf labelling.
languages_to_plot = res.get_languages_with_most_articles(number_of_languages)
label_func = lambda x: ethnologue_tree.codes_to_names.get(x, x)

for metric in ('cosine', ):  # ('euclidean', 'cosine'):
    for dim in (1, ):  # (0,1,2):
        for distance in ('bars_statistics', ):  # ('bottleneck', 'sliced_wasserstein', 'persistence_image', 'bars_statistics'):
            # The `.d<k>` suffix selects the persistence-diagram dimension, so it
            # must follow the loop variable `dim` (the value shown in the title),
            # not the constant `maxdim`.
            filename = f'{mats_folder}/pddmat.{experiment_name}.n{n}.{metric}.{distance}.d{dim}.txt'
            distance_matrix, languages = lf.load_strictly_lower_triangular_matrix(
                filename,
                restrict_labels=languages_to_plot
            )

            trees = {
                algorithm: au.build_tree_from_labeled_matrix(distance_matrix, languages, algorithm)
                for algorithm in ['nj', 'upgma']
            }

            fig, axs = plt.subplots(1, 2, figsize=(14, 14))
            # Escape underscores as mathtext spaces so the name renders inside `$...$`.
            distance_str = distance.replace("_", r"\ ")
            fig.suptitle(f'{_bold(metric)} metric, dimension {_bold(dim)}, PD-distance: {_bold(distance_str)}')
            # Sorted by algorithm name so the left/right panel order is stable.
            for ax, (tree_type, tree) in zip(axs, sorted(trees.items())):
                plot_ete3_tree(tree, axe=ax, label_func=label_func)
                ax.set_title(f'tree type: {_bold(tree_type)}')
            # Optional: uncomment to save the figure next to the matrices.
            # plt.savefig(f'{mats_folder}/trees/tree.{experiment_name}.n{n}.{metric}.{distance}.d{dim}.pdf', bbox_inches='tight')
            plt.show()

# Generate reconstructed tree distances data

In [None]:
# Compare every reconstructed tree with the Ethnologue tree and store all
# distances, together with the parameters that produced them, in one csv file.
# `experiment_name` is defined here so the cell does not depend on the value
# left behind by an earlier cell, and all paths are derived from it.
experiment_name = 'ethnologue_10k_d2'
data_folder = Path(f'data/pd-distance-matrices/{experiment_name}/')
output_folder = Path(f'data/tree-distances/{experiment_name}/')
embedding_dim = 300
embedding_num_of_words = 10000
filename_prefix = f'pddmat.{experiment_name}.n{embedding_num_of_words}'

# `DataFrame.to_csv` does not create missing directories; create the output
# folder up front so the computation below is not lost to a write error.
output_folder.mkdir(parents=True, exist_ok=True)

#=========================================================================

table_list_of_dict = []
tree_distance_calculator = au.TreeDistanceCalculator(treedist_r_package=treedist, ape_r_package=ape)
time_start = time.perf_counter()
for num in (81, 50, 30):
    # Reference tree: Ethnologue restricted to the `num` languages with the most articles.
    languages_considered = res.get_languages_with_most_articles(num)
    ethnologue_tree_ete3 = ete3.Tree(ethnologue_tree.get_newick_tree(languages_considered))
    ethnologue_tree_r = tree_distance_calculator.convert_tree_ete_to_Rphylo(ethnologue_tree_ete3)
    for embedding_metric in ['euclidean', 'cosine']:
        for persistent_diagram_metric in ['bars_statistics', 'bottleneck', 'persistence_image', 'sliced_wasserstein']:
            for persistent_diagram_dim in [0, 1, 2]:
                filepath = data_folder / f'{filename_prefix}.{embedding_metric}.{persistent_diagram_metric}.d{persistent_diagram_dim}.txt'
                matrix, languages = lf.load_strictly_lower_triangular_matrix(filepath, restrict_labels=languages_considered)
                for tree_algorithm in ['nj', 'upgma']:
                    # Parameters identifying this row of the result table.
                    metadata = {
                        'embedding_dim': embedding_dim,
                        'embedding_num_of_words': embedding_num_of_words,
                        'embedding_metric': embedding_metric,
                        'persistent_diagram_metric': persistent_diagram_metric,
                        'persistent_diagram_dim': persistent_diagram_dim,
                        'number_of_languages': num,
                        'tree_algorithm': tree_algorithm
                    }

                    tree = au.build_tree_from_labeled_matrix(matrix, languages, tree_algorithm)
                    distance_data = tree_distance_calculator.compute_distances(ethnologue_tree_r, tree)
                    # One row = parameters merged with the computed distances.
                    table_list_of_dict.append(metadata | distance_data)

df_tree_comparisons = pd.DataFrame(table_list_of_dict)
time_taken = time.perf_counter() - time_start
print(f'Time to compute: {time_taken: .2f} s')

df_tree_comparisons.to_csv(output_folder / 'tree_distances_df.csv')

# Generate permutation test data

For each combination of parameters, compute the distances for `number_of_permutations` random permutations of the leaves, and save them as a data frame in a csv file.

**CREATE THE NECESSARY OUTPUT DIRECTORIES BEFORE RUNNING**, e.g. `data/random_permutations_data_frames/ethnologue_10k_d2/`

In [None]:
# Permutation test data: for every parameter combination, repeatedly permute
# the leaves of the reconstructed tree, compute its distances to the Ethnologue
# tree, and write one csv file per combination.
number_of_permutations = 10000  # on a laptop, 10'000 permutations take around 30 s for one combination of parameters, under an hour for all 72

experiment_name = 'ethnologue_10k_d2'
data_folder = Path(f'data/pd-distance-matrices/{experiment_name}/')
embedding_dim = 300
embedding_num_of_words = 10000
filename_prefix = f'pddmat.{experiment_name}.n{embedding_num_of_words}'
output_folder = Path(f'data/random_permutations_data_frames/{experiment_name}/')
output_prefix = f'permutations.{experiment_name}'
output_sufix = f'len{number_of_permutations}.csv'

# `DataFrame.to_csv` does not create missing directories; create the output
# folder up front so the first write does not fail after a long computation.
output_folder.mkdir(parents=True, exist_ok=True)

#=========================================================================

tree_distance_calculator = au.TreeDistanceCalculator(treedist_r_package=treedist, ape_r_package=ape)
time_start = time.perf_counter()
for num in (30, 50, 81):
    # Reference tree: Ethnologue restricted to the `num` languages with the most articles.
    languages_considered = res.get_languages_with_most_articles(num)
    ethnologue_tree_ete3 = ete3.Tree(ethnologue_tree.get_newick_tree(languages_considered))
    ethnologue_tree_r = tree_distance_calculator.convert_tree_ete_to_Rphylo(ethnologue_tree_ete3)
    for embedding_metric in ['euclidean', 'cosine']:
        for persistent_diagram_metric in ['bars_statistics', 'bottleneck', 'persistence_image', 'sliced_wasserstein']:
            for persistent_diagram_dim in [0, 1, 2]:
                filepath = data_folder / f'{filename_prefix}.{embedding_metric}.{persistent_diagram_metric}.d{persistent_diagram_dim}.txt'
                matrix, languages = lf.load_strictly_lower_triangular_matrix(filepath, restrict_labels=languages_considered)

                for tree_algorithm in ['nj', 'upgma']:
                    tree = au.build_tree_from_labeled_matrix(matrix, languages, tree_algorithm)
                    # NOTE(review): the distances of the unpermuted tree are computed
                    # but never stored or used in this cell -- confirm whether they
                    # should be saved alongside the permutation results.
                    distance_data = tree_distance_calculator.compute_distances(ethnologue_tree_r, tree)

                    time_start_local = time.perf_counter()
                    permutations_table = []
                    for _ in range(number_of_permutations):
                        # Called for its effect on `tree` (the return value is
                        # discarded), so successive permutations compound.
                        # NOTE(review): no random seed is set here; the output is
                        # presumably not reproducible across runs -- confirm.
                        au.ete3_permute_leafs(tree)
                        distance_data_perm = tree_distance_calculator.compute_distances(ethnologue_tree_r, tree)
                        permutations_table.append(distance_data_perm)
                    permutations_df = pd.DataFrame(permutations_table)
                    output_filename = output_folder / (f'{output_prefix}.langs{num}.{embedding_metric}.'
                                                       f'{persistent_diagram_metric}.d{persistent_diagram_dim}.'
                                                       f'{tree_algorithm}.{output_sufix}')
                    permutations_df.to_csv(output_filename)

                    time_taken_local = time.perf_counter() - time_start_local
                    print(f'{output_filename}    [{time_taken_local: .2f} s]')

time_taken = time.perf_counter() - time_start
print(f'Time to compute: {time_taken: .2f} s')