# Ancestral Sequences and the Phylogenetic Tree
Program to prune the phylogenetic tree file of all species down to only those modern species that will be used for ASR (ancestral sequence reconstruction). It also looks at the nodes the ancestral sequences will occupy, to keep track of them.

Milo Thordarson (anth2886@student.uu.se)

## Setup

In [87]:
from ete3 import Tree
from Bio import SeqIO
import pandas

# Variables that will need changing for each dataset
# We need the SNP file, as its record ids carry the annotation of which sequences are ancient
# (ids containing "Ancient" are treated as ancient samples)
# tree_file_name is read with ete3's Tree(); outgroup is the leaf name used to root that tree
tree_file_name = "treefile_pette"
snp_file_name = "snpAlignment_Spyrou2022_modern_ancient.fasta2"
outgroup = "outgroup_Y.pseudo"

## Defining Functions
### Obtaining a list of the ancient species
NOTE: this is highly dependent on the kind of data and the naming scheme, which is why, for example, there is a hard-coded name change in this function. Be sure to **change this function to suit the needs of the current dataset.**

In [85]:
def get_ancient_species_list(name_file, tree):
    """Return the ancient sample names, spelled the way the tree spells them.

    NOTE: the tag handling and the manual rename below are specific to the
    naming scheme of the current dataset; adapt them for other data.

    Args:
        name_file: iterable of records with an ``id`` attribute (here, the
            parsed SNP fasta). Only records whose id contains "Ancient" are
            used, and the last 8 characters of the id (the tag) are dropped.
        tree: object providing ``iter_leaf_names()`` (here, an ete3 Tree).

    Returns:
        list: one name per ancient record, in input order. A name that has no
        match in the tree and no manual rename is returned unchanged, so the
        caller should verify that every returned name is a leaf of the tree.
    """
    # Get ids from the snp file that are ancient, removing the last 8
    # characters that are the tag
    ancient_ids = [record.id[:-8] for record in name_file if "Ancient" in record.id]
    print(f'Number of ancient samples to start: {len(ancient_ids)}')

    # Leaf names in tree order (for the substring search) and as a set (for
    # cheap exact-match checks)
    tree_names = list(tree.iter_leaf_names())
    exact_names = set(tree_names)

    ancient_species_list = []
    for sample_id in ancient_ids:
        if sample_id in exact_names:
            ancient_species_list.append(sample_id)
            continue
        # The tree name may carry extra text around the sample id, so take the
        # first leaf name that contains the id (a substring match, not
        # strictly a prefix match)
        containing_name = next(
            (leaf for leaf in tree_names if sample_id in leaf), None
        )
        if containing_name is not None:
            ancient_species_list.append(containing_name)
        # Known exception where the id is not contained in the tree name, so
        # it is renamed manually
        elif sample_id == "COL1":
            ancient_species_list.append("COLC1-COLC2a_COLC2b")
        else:
            # No match at all: keep the id as-is and let the caller detect it
            ancient_species_list.append(sample_id)
    return ancient_species_list

### Create informative names for the tree nodes

In [86]:
def informative_nodes(tree):
    """Give every internal node a name of the form ``intrnl<N>``.

    Internal (non-leaf) nodes are numbered from 0 in the order in which
    ``tree.traverse()`` yields them; leaves keep their names. The nodes are
    renamed in place and the same tree object is returned.
    """
    internal_nodes = (node for node in tree.traverse() if not node.is_leaf())
    for index, internal in enumerate(internal_nodes):
        internal.name = f"intrnl{index}"
    return tree

### Annotation of position of ancient nodes, distances from the ancient samples to those nodes and issues that make reconstruction hard
The interpretation of the ASR sequences will be different if the ancient sequence we are looking at is a direct sister to another taxon: that means it is not an ancestor at a node, but lies along the evolution of the modern branch, and therefore will not be reconstructed. If the sister taxon is another ancient sequence, then ASR will reconstruct the ancestor of both those sequences and the modern ones, which makes interpretation harder.

In [93]:
def _join_or_dash(values):
    """Join names with ';', or return '-' when there is nothing to report."""
    return ";".join(values) if values else "-"


def create_annotation_dataframe(ancient_sample_names, tree):
    """Tabulate where each ancient sample sits in the tree.

    Exactly one row is produced per ancient sample, whatever the number of
    siblings it has. (Appending once per sibling only lines up with the rows
    when every parent is strictly binary.)

    Args:
        ancient_sample_names: names of the ancient samples; each must be the
            name of a node findable with ``tree.search_nodes(name=...)``.
        tree: tree whose nodes provide ``dist``, ``up``, ``name``,
            ``is_leaf()`` and ``get_children()`` (here, an ete3 Tree).

    Returns:
        pandas.DataFrame with the columns:
            name:         the ancient sample name.
            dist:         branch length (``dist``) of the sample.
            parent:       for a leaf sibling, the name of the sample's parent
                          node; for an internal sibling, the name of that
                          sibling node. Several distinct names are joined
                          with ';'. With no siblings it is the parent's name.
            anct_problem: leaf siblings that are themselves ancient samples,
                          joined with ';', or '-' if there are none.
            sis_problem:  leaf siblings that are not ancient samples, joined
                          with ';', or '-' if there are none.

    Raises:
        IndexError: if a sample name is not found in the tree.
    """
    names = list(ancient_sample_names)
    ancient_lookup = set(names)

    # One entry per ancient sample in each of these lists
    distances = []
    parents = []
    ancient_sisters = []
    sisters = []

    for node_name in names:
        matches = tree.search_nodes(name=node_name)
        if not matches:
            raise IndexError(f"ancient sample {node_name!r} not found in tree")
        node = matches[0]
        parent = node.up
        distances.append(node.dist)

        anchor_names = []
        ancient_sister_names = []
        modern_sister_names = []
        for child in parent.get_children():
            # Make sure you don't count the ancient sample we're already looking at
            if child is node:
                continue
            if child.is_leaf():
                anchor = parent.name
                if child.name in ancient_lookup:
                    ancient_sister_names.append(child.name)
                else:
                    modern_sister_names.append(child.name)
            else:
                # An internal sibling is recorded under its own name rather
                # than the parent's name
                anchor = child.name
            if anchor not in anchor_names:
                anchor_names.append(anchor)

        parents.append(_join_or_dash(anchor_names or [parent.name]))
        # If no problems were found, code that with a dash
        ancient_sisters.append(_join_or_dash(ancient_sister_names))
        sisters.append(_join_or_dash(modern_sister_names))

    return pandas.DataFrame(
        data={
            'name': names,
            'dist': distances,
            'parent': parents,
            'anct_problem': ancient_sisters,
            'sis_problem': sisters,
        }
    )

def annotate_tree():
    """Placeholder that is not implemented yet; takes no arguments and returns None."""
    return None

### Pruning the tree of ancient samples

In [68]:
def prune_ancient_samples(ancient_sample_names, tree):
    """Remove the ancient samples from the tree, keeping every other leaf.

    ``tree.prune`` is given the leaves to retain (not the ones to drop) and
    is called with ``preserve_branch_length=True``. The tree passed in is
    itself pruned and returned; no copy is made.
    """
    leaves_to_keep = []
    for leaf_name in tree.iter_leaf_names():
        if leaf_name not in ancient_sample_names:
            leaves_to_keep.append(leaf_name)
    tree.prune(leaves_to_keep, preserve_branch_length=True)
    return tree

## Running the code

In [94]:
# First, read in the treefile as a Tree in ete3 and snp file as a list from a fasta.
# The fasta is opened in a with-block so the file handle is closed once parsed.
full_tree = Tree(tree_file_name)
with open(snp_file_name) as snp_handle:
    snp_fasta = list(SeqIO.parse(snp_handle, 'fasta'))

# Then root the tree using the outgroup
full_tree.set_outgroup(outgroup)

print(f'Number of leaves in tree file: {len(full_tree)}')
print(f'Number of leaves in snp file: {len(snp_fasta)}')

ancient = get_ancient_species_list(snp_fasta, full_tree)

# Double check that we are left with a list of the same length of ancient samples, and that all the names are found in the treefile
# (the leaf names are collected once instead of once per ancient sample)
tree_leaf_names = set(full_tree.iter_leaf_names())
print(f'Number of ancient samples: {len(ancient)}, all ancient names found in tree names: {all(x in tree_leaf_names for x in ancient)}')

# Get informative names of the nodes
# (the nodes are renamed in place, so this is the same object as full_tree)
full_tree_node_names = informative_nodes(full_tree)

# Displaying it sorted by the distance, so we can see which ancient sequences are closest to being the real ancestral sequence.
ancient_df = create_annotation_dataframe(ancient, full_tree_node_names)
display(ancient_df.sort_values(by=['dist'], ascending=True))

# The annotation must be built before this point: pruning modifies the tree
# in place, so the ancient leaves are gone from full_tree afterwards.
pruned_tree = prune_ancient_samples(ancient, full_tree_node_names)

# Then create a pruned tree file in newick format
pruned_tree.write(format=1, outfile="pruned_tree_outgroup_dist.treefile")

# TODO: report the problem counts here rather than counting inside the loops:
# the number of ancient-sister problems, the number of sister problems, the
# total of problematic ancient samples and the total of useful ancient samples.

Number of leaves in tree file: 251
Number of leaves in snp file: 251
Number of ancient samples to start: 47
Number of ancient samples: 47, all ancient names found in tree names: True


Unnamed: 0,name,dist,parent,anct_problem,sis_problem
42,OBS124,2e-06,intrnl246,OBS116,-
0,BSK001-003.A0101.A0102.A0103-malt,2e-06,intrnl78,-,-
39,London_EastSmithfield_8124_8291_11972,2e-06,intrnl145,-,-
36,STN019.A0101,2e-06,intrnl237,-,-
35,STN008.A0101,2e-06,intrnl228,-,-
34,STN021.A0101,2e-06,intrnl247,STN013.A0101,-
33,STN020.A0101,2e-06,intrnl247,-,-
32,STN014.A0101,2e-06,intrnl213,-,-
31,STN013.A0101,2e-06,intrnl247,STN021.A0101,-
30,STN007.A0101,2e-06,intrnl220,-,-
