# Writing out annotations for the tree, for viewing on iTOL
When processing annotations, it is simplest to work with the tree that uses ids as labels, and then re-label the nodes afterwards.



In [22]:
from opentree import OT, taxonomy_helpers, util

In [23]:
# Look up the OTT id for the name 'Aves'; it is passed as `group_ott_id` further down.
aves = OT.get_ottid_from_name('Aves')

## Download the Open Tree of Life Taxonomy (OTT)

You can download the OTT by going to https://tree.opentreeoflife.org/about/taxonomy-version/ott3.2
or by running the following command.

Set the `loc` argument to wherever you want to store the taxonomy files.

In [24]:
# Request OTT version 3.2 under `loc`. The call is the last expression in the
# cell, so its return value (the taxonomy directory path) is shown as output.
taxonomy_helpers.download_taxonomy_file(version = 3.2, loc = '../..')

Taxonomy already available at ../../ott3.2

'../../ott3.2'

In [25]:
# Collect the OTT ids of every taxon of rank 'family' inside the Aves group,
# reading them from the local copy of the taxonomy table.
bird_families = taxonomy_helpers.get_ott_ids_group_and_rank(
    group_ott_id=aves,
    rank='family',
    taxonomy_file='../../ott3.2/taxonomy.tsv',
)

Gathering ott ids from group with ott id 81461.


In [26]:
# Request the induced synthetic tree for the bird families, then save the
# name-labelled version as a newick file.
ret = taxonomy_helpers.labelled_induced_synth(
    ott_ids=bird_families,
    label_format="name",
    inc_unlabelled_mrca=True,
)
ret['labelled_tree'].write(
    path="labelled_bird_families.tre",
    schema="newick",
)


In [27]:
# Taxon labels of all leaves of the original (id-labelled) tree.
synth_tips = []
for leaf in ret['original_tree'].leaf_node_iter():
    synth_tips.append(leaf.taxon.label)

In [28]:
# Sort the tip labels in place, then show how many tips lie past index 130.
synth_tips.sort()
max(len(synth_tips) - 130, 0)

20

In [29]:
# Register every node of the original tree under its label: the node's own
# label when it has one, otherwise the label of its taxon.
node_annotation = {}
for node in ret['original_tree']:
    if node.label:
        node_annotation[node.label] = {}
        continue
    if not node.taxon:
        # Neither a node label nor a taxon: print the node for inspection.
        print(node)
        continue
    if node.taxon.label:
        node_annotation[node.taxon.label] = {}

In [30]:
# Give every registered node a fresh record holding all annotation fields.
for node_label in node_annotation:
    node_annotation[node_label] = {
        'families': [],
        'studies': [],
        'strict_support': [],
        'support': [],
        'conflict': [],
        'total_descendents': 0,
    }

In [31]:
# The node id is the last whitespace-separated token of each label. Fetch the
# synth node info for all ids in one request and index the records by node id.
nids = [label.split()[-1] for label in node_annotation]
resp = OT.synth_node_info(nids).response_dict
nid_resp = {node_info['node_id']: node_info for node_info in resp}

In [32]:
# Summarise, for every node, how many input sources mention, support or
# conflict with it, based on the node-info records in `nid_resp`.
#
# The taxonomy is reported as a source alongside the input trees. It is left
# out of every count, so 'studies', 'strict_support' and 'support' are
# comparable with one another.
TAXONOMY_SOURCE = 'ott3.2draft9'
for node in node_annotation:
    nid = node.split()[-1]
    node_info = nid_resp[nid]
    # `or {}` covers both a missing key and a key that is present but empty/None.
    supporting = node_info.get('source_id_map') or {}
    strict_support = node_info.get('supported_by') or {}
    ppo = node_info.get('partial_path_of') or {}
    conflict = node_info.get('conflicts_with') or {}

    # Every non-taxonomy source listed for this node.
    node_annotation[node]['studies'] = len(set(supporting) - {TAXONOMY_SOURCE})
    # Sources listed under 'supported_by'.
    node_annotation[node]['strict_support'] = len(set(strict_support) - {TAXONOMY_SOURCE})
    # General support: 'supported_by' together with 'partial_path_of'.
    gen_support = (set(strict_support) | set(ppo)) - {TAXONOMY_SOURCE}
    node_annotation[node]['support'] = len(gen_support)
    node_annotation[node]['conflict'] = len(conflict)


In [33]:
# Record the number of descendant tips ('num_tips') for every leaf of the
# family tree and print the overall sum.
tip_counts = [nid_resp[tip.split()[-1]]['num_tips'] for tip in synth_tips]
desc = [int(count) for count in tip_counts]
for tip, count in zip(synth_tips, desc):
    node_annotation[tip]['total_descendents'] = count
total_descendents = sum(tip_counts)

print(total_descendents)

10357


In [34]:
# Largest conflict and support counts over all nodes. The leading 0 keeps the
# result at 0 when there are no nodes.
support = [annotation['support'] for annotation in node_annotation.values()]
conflicts = [annotation['conflict'] for annotation in node_annotation.values()]
max_conf = max([0] + conflicts)
max_support = max([0] + support)

print(max_conf)
print(max_support)


17
20


In [35]:
# Write an iTOL DATASET_STYLE file that colours each branch red, with intensity
# and opacity growing with the number of conflicting sources at the node.
CONFLICT_SCALE = 10  # conflict count drawn as full red; higher counts saturate
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("conflict")
# The context manager closes the file even if a label lookup fails part-way.
with open("conflict_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        node_label = ret['label_map'][node].replace(' ','_')
        # Clamp to 1 so the colour channel stays within 0-255 and alpha within 0-1.
        relconf = min(node_annotation[node]['conflict'] / CONFLICT_SCALE, 1.0)
        r = round(255 * relconf)  # integer channel value
        g = 0
        b = 0
        alpha = round(min(0.5 + relconf, 1.0), 3)
        color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(node_label, color))

In [36]:
# Write an iTOL DATASET_STYLE file that colours each branch green, with
# intensity and opacity growing with the number of supporting sources.
# Nodes with no support are drawn in faint black instead.
SUPPORT_SCALE = 20  # support count drawn as full green; higher counts saturate
startstr = """DATASET_STYLE
SEPARATOR TAB

#label is used in the legend table (can be changed later)
DATASET_LABEL\t{}

#dataset color (can be changed later)
COLOR\t#ffff00

DATA\n""".format("Support")
# The context manager closes the file even if a label lookup fails part-way.
with open("support_anno.txt", 'w') as fi:
    fi.write(startstr)
    for node in node_annotation:
        node_label = ret['label_map'][node].replace(' ','_')
        if node_annotation[node]['support']:
            # Clamp to 1 so the colour channel stays within 0-255 and alpha within 0-1.
            relsupport = min(node_annotation[node]['support'] / SUPPORT_SCALE, 1.0)
            r = 0
            g = round(255 * relsupport)  # integer channel value
            b = 0
            alpha = round(min(0.5 + relsupport, 1.0), 3)
            color = "rgba({}, {}, {}, {})".format(r, g, b, alpha)
        else:
            color = "rgba(0, 0, 0, 0.15)"
        fi.write("{}\tbranch\tclade\t{}\t1\tnormal\n".format(node_label,color))

In [37]:
import math

# Write an iTOL DATASET_HEATMAP file holding, for every tip of the family tree,
# the base-10 logarithm of its number of descendant tips.
startstr = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
SEPARATOR SPACE
#SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL LOG_number_of_descendants

#dataset color (can be changed later)
COLOR #ff0000

#define labels for each individual field column
FIELD_LABELS number_desc

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1;



#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#Each dataset can have a legend, which is defined using LEGEND_XXX fields below
#For each row in the legend, there should be one shape, color and label.
#Optionally, you can define an exact legend position using LEGEND_POSITION_X and LEGEND_POSITION_Y. To use automatic legend positioning, do NOT define these values
#Optionally, shape scaling can be present (LEGEND_SHAPE_SCALES). For each shape, you can define a scaling factor between 0 and 1.
#Shape should be a number between 1 and 6, or any protein domain shape definition.
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Dataset legend
#LEGEND_POSITION_X,100
#LEGEND_POSITION_Y,100
#LEGEND_SHAPES,1,2,3
#LEGEND_COLORS,#ff0000,#00ff00,#0000ff
#LEGEND_LABELS,value1,value2,value3
#LEGEND_SHAPE_SCALES,1,1,0.5

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
#MARGIN 0

#width of the individual boxes
#STRIP_WIDTH 25

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL 0


#show dashed lines between leaf labels and the dataset
#DASHED_LINES 1

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
#SHOW_TREE 1

#define the color for the NULL values in the dataset. Use the letter X in the data to define the NULL values
#COLOR_NAN #000000

#automatically create and display a legend based on the color gradients and values defined below
#AUTO_LEGEND 1


#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN #0000ff
COLOR_MAX #ff0000

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
#USE_MID_COLOR 1
#COLOR_MID #ffff00

#By default, color gradients will be calculated based on dataset values. You can force different values to use in the calculation by setting the values below:
#USER_MIN_VALUE 0
#USER_MID_VALUE 500
#USER_MAX_VALUE 1000

#border width; if set above 0, a border of specified width (in pixels) will be drawn around individual cells
#BORDER_WIDTH,0

#border color; used only when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff


#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA\n
"""
# The context manager closes the file even if a label lookup fails part-way.
with open("log_descendent_heatmap.txt", 'w') as fi:
    fi.write(startstr)
    for node in synth_tips:
        # The file is space-separated, so labels must not contain spaces.
        node_label = ret['label_map'][node].replace(' ','_')
        desc = int(node_annotation[node]['total_descendents'])
        if desc <= 0:
            # The logarithm is undefined here; use a sentinel just below log10(1) == 0.
            val = -0.1
        else:
            # log10 is exact for powers of ten, unlike math.log(x, 10).
            val = math.log10(desc)
        fi.write("{} {}\n".format(node_label, val))

In [38]:
import math

# Write an iTOL DATASET_HEATMAP file holding, for every tip of the family tree,
# its raw number of descendant tips.
startstr = """DATASET_HEATMAP
#In heatmaps, each ID is associated to multiple numeric values, which are displayed as a set of colored boxes defined by a color gradient
#lines starting with a hash are comments and ignored during parsing
#=================================================================#
#                    MANDATORY SETTINGS                           #
#=================================================================#
#select the separator which is used to delimit the data below (TAB,SPACE or COMMA).This separator must be used throughout this file (except in the SEPARATOR line, which uses space).
#SEPARATOR TAB
SEPARATOR SPACE
#SEPARATOR COMMA

#label is used in the legend table (can be changed later)
DATASET_LABEL number_of_descendants

#dataset color (can be changed later)
COLOR #ff0000

#define labels for each individual field column
FIELD_LABELS number_desc

#=================================================================#
#                    OPTIONAL SETTINGS                            #
#=================================================================#


#Heatmaps can have an optional Newick formatted tree assigned. Its leaf IDs must exactly match the dataset FIELD_LABELS.
#The tree will be used to sort the dataset fields, and will be displayed above the dataset. It can have branch lengths defined.
#All newlines and spaces should be stripped from the tree, and COMMA cannot be used as the dataset separator if a FIELD_TREE is provided.
#FIELD_TREE (((f1:0.2,f5:0.5):1,(f2:0.2,f3:0.3):1.2):0.5,(f4:0.1,f6:0.5):0.8):1;



#=================================================================#
#     all other optional settings can be set or changed later     #
#           in the web interface (under 'Datasets' tab)           #
#=================================================================#

#Each dataset can have a legend, which is defined using LEGEND_XXX fields below
#For each row in the legend, there should be one shape, color and label.
#Optionally, you can define an exact legend position using LEGEND_POSITION_X and LEGEND_POSITION_Y. To use automatic legend positioning, do NOT define these values
#Optionally, shape scaling can be present (LEGEND_SHAPE_SCALES). For each shape, you can define a scaling factor between 0 and 1.
#Shape should be a number between 1 and 6, or any protein domain shape definition.
#1: square
#2: circle
#3: star
#4: right pointing triangle
#5: left pointing triangle
#6: checkmark

#LEGEND_TITLE,Dataset legend
#LEGEND_POSITION_X,100
#LEGEND_POSITION_Y,100
#LEGEND_SHAPES,1,2,3
#LEGEND_COLORS,#ff0000,#00ff00,#0000ff
#LEGEND_LABELS,value1,value2,value3
#LEGEND_SHAPE_SCALES,1,1,0.5

#left margin, used to increase/decrease the spacing to the next dataset. Can be negative, causing datasets to overlap.
#MARGIN 0

#width of the individual boxes
#STRIP_WIDTH 25

#always show internal values; if set, values associated to internal nodes will be displayed even if these nodes are not collapsed. It could cause overlapping in the dataset display.
#SHOW_INTERNAL 0


#show dashed lines between leaf labels and the dataset
#DASHED_LINES 1

#if a FIELD_TREE is present, it can be hidden by setting this option to 0
#SHOW_TREE 1

#define the color for the NULL values in the dataset. Use the letter X in the data to define the NULL values
#COLOR_NAN #000000

#automatically create and display a legend based on the color gradients and values defined below
#AUTO_LEGEND 1


#define the heatmap gradient colors. Values in the dataset will be mapped onto the corresponding color gradient.
COLOR_MIN #0000ff
COLOR_MAX #ff0000

#you can specify a gradient with three colors (e.g red to yellow to green) by setting 'USE_MID_COLOR' to 1, and specifying the midpoint color
#USE_MID_COLOR 1
#COLOR_MID #ffff00

#By default, color gradients will be calculated based on dataset values. You can force different values to use in the calculation by setting the values below:
#USER_MIN_VALUE 0
#USER_MID_VALUE 500
#USER_MAX_VALUE 1000

#border width; if set above 0, a border of specified width (in pixels) will be drawn around individual cells
#BORDER_WIDTH,0

#border color; used only when BORDER_WIDTH is above 0
#BORDER_COLOR,#0000ff


#Internal tree nodes can be specified using IDs directly, or using the 'last common ancestor' method described in iTOL help pages
#=================================================================#
#       Actual data follows after the "DATA" keyword              #
#=================================================================#
DATA\n
"""
# The context manager closes the file even if a label lookup fails part-way.
with open("descendent_heatmap.txt", 'w') as fi:
    fi.write(startstr)
    for node in synth_tips:
        # The file is space-separated, so labels must not contain spaces.
        node_label = ret['label_map'][node].replace(' ','_')
        desc = int(node_annotation[node]['total_descendents'])
        fi.write("{} {}\n".format(node_label, desc))

## 