forked from tamuri/BIOL0033
-
Notifications
You must be signed in to change notification settings - Fork 0
/
practical_2.R
303 lines (243 loc) · 11.3 KB
/
practical_2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
# ---------- DOWNLOAD THE PRACTICAL ----------
#
# 1. Download the .zip file for the practical from
#
# https://github.com/tamuri/BIOL0033/archive/master.zip
#
# 2. Extract the files
# ---------- SET THE WORKING DIRECTORY ----------
# This is the place where you put the files for the practical,
# so you should modify the path within brackets below with
# the path to the directory where you have these files on
# your PC
setwd( '~/Documents/2019/BIOL0033' )
# Alternatively, you can also automatically find the path
# to the R script you are using if you load a specific
# function from the `rstudioapi` library. Follow the
# instructions below:
#
# 1. Load the package `rstudioapi`. If you do not have
# it installed, then uncomment and run the
# command below
# install.packages( "rstudioapi" )
library( rstudioapi )
# 2. Get the path to current open R script
path_to_file <- rstudioapi::getActiveDocumentContext()$path
# 3. Now just keep the path to the directory where the
# R script is
wd <- paste( dirname( path_to_file ), "/", sep = "" )
# 4. Set the working directory
setwd( wd )
# ---------- LOAD THE REQUIRED PACKAGES ----------
# `ape` and `phangorn` provide vast array of phylogenetic analyses
library( ape )
library( phangorn )
# Set the outgroup for our sequences
outg <- c( 'cow', 'pig' )
# ---------- ESTIMATE TREE BY MAXIMUM PARSIMONY ----------
# Use the `phangorn` library to read the alignment you generated
# in practical 1
aln <- phangorn::read.phyDat( file = 'nadh6.8apes.aln.fasta', format = 'fasta' )
# Use a random starting tree
# HINT: You might want to set a seed number before generating the random
# tree in case you want to reproduce the same results!
set.seed( 12345 )
rnd_tree <- ape::rtree( n = length( aln ), tip.label = names( aln ) )
ape::plot.phylo( rnd_tree )
# Q: What's the parsimony score for your starting tree?
phangorn::parsimony( tree = rnd_tree, data = aln )
# Optimise the tree using maximum parsimony
pars_tree_unr <- phangorn::optim.parsimony( tree = rnd_tree, data = aln )
pars_tree <- ape::ladderize( ape::root( phy = pars_tree_unr, outgroup = outg, resolve.root = TRUE ) )
ape::plot.phylo( pars_tree )
# Q: What's the parsimony score of the maximum parsimony tree?
# Q: How many operations did it take to find the MP tree?
# Q: What's missing? Look at the Newick representation
write.tree( pars_tree )
# We can look at the changes on the tree for any given site.
# We will be looking at the 12th site!
anc_pars <- phangorn::ancestral.pars( tree = pars_tree, data = aln )
phangorn::plotAnc( tree = pars_tree, data = anc_pars, i = attr( x = anc_pars, which = 'index' )[12],
col = c( "green", "blue", "black", "red" ) )
# EXERCISE 1:
#
# 1. What's the parsimony score for site 12 (assume uniform cost matrix)?
#
# 2. Are the following sites parsimony informative: 5, 12, 57 and 492?
# ---------- ESTIMATE TREE BY MAXIMUM LIKELIHOOD ----------
# Package up a start tree (NJ), alignment and model information into a likelihood object
fit_ini <- phangorn::pml( tree = nj( phangorn::dist.hamming( aln ) ), data = aln, model = 'JC' )
# Optimise the tree+alignment+model using maximum likelihood (ML)
fit <- phangorn::optim.pml( object = fit_ini, optNni = TRUE, model = 'JC' )
# Get the ML tree
ml_tree <- fit$tree
# The maximum log-likelihood
fit$logLik
# The number of parameters in the model
fit$df
# Root the ML tree using the outgroup species
ml_tree <- ape::ladderize( phy = ape::root( phy = ml_tree, outgroup = outg, resolve.root = TRUE ) )
# Plot the ML tree
ape::plot.phylo( ml_tree )
ape::add.scale.bar()
# Plotting without the outgroup
ape::plot.phylo( x = ape::drop.tip( phy = ml_tree, tip = outg ) )
ape::add.scale.bar()
# EXERCISE 2:
#
# 1. What's the ML when using the K80 model? What is the ML estimate of the kappa parameter?
#
# 2. What's the likelihood ratio test statistic? Does the K80 model offer a significantly
# better fit to the data?
# (Chi-sq with 1 degree of freedom has critical values of 5%: 3.84; 1%: 6.63)
#
# 3. Try a more complex model and answer the following.
# E.g., try HKY+G (transition/transversion bias, unequal base frequencies, site rate variation)
#
# Now, create a new initial likelihood object that includes the gamma rate categories:
#
# fit_ini <- pml(nj(dist.hamming(aln)), aln, model='HKY', k=4)
#
# Then, run the following:
#
# optim.pml(fit_ini, optNni=TRUE, model='HKY', optGamma=TRUE)
#
# a) What is the MLE of the transition/transversion bias?
# b) What is the MLE gamma shape parameter? Is this high or low site rate variation?
# c) How does the total tree length compare with the simple JC model above?
# ---------- GETTING BOOTSTRAP SUPPORT VALUES OF TREE (NJ) ----------
# Bootstrap allows us to assign confidence scores to splits in our phylogenetic tree
# Load our real alignment (ape-package requires use of the `read.dna` function)
aln <- ape::read.dna( file = 'nadh6.8apes.aln.fasta', format = 'fasta' )
# Calculate the NJ tree for this alignment
nj_tree <- ape::nj( X = ape::dist.dna( x = aln ) )
nj_tree <- ape::root( phy = nj_tree, outgroup = outg )
# Look at the tree
ape::plot.phylo( x = nj_tree )
# Calculate the bootstrap support for each split in the tree.
# The `boot.phylo` function is provided by the ape package.
# The function pass to `FUN` should be exactly what you did on the real data to estimate the tree
boots <- ape::boot.phylo( phy = nj_tree,
x = aln,
FUN = function( bs_aln ) ape::nj( X = ape::dist.dna( x = bs_aln ) ),
trees = TRUE)
# Plot the tree with bootstrap support
ape::plot.phylo( x = nj_tree, main='nj tree w/ bootstrap support')
ape::add.scale.bar()
# ape::nodelabels( boots$BP ) # SAC-210226: Not used anymore as this prints
# an extra bootstrap value for the root
# on the tree, which should not be printed.
# Use command below:
ape::drawSupportOnEdges( value = boots$BP )
# SAC-210226: It is also recommended that node labels are added
# before rooting the tree. Therefore:
nj_tree_unr <- ape::nj( X = ape::dist.dna( x = aln ) )
nj_tree_unr$node.label <- boots$BP
nj_tree_bs <- ape::root( phy = nj_tree, outgroup = outg,
edgelabel = TRUE )
ape::plot.phylo( x = nj_tree_bs, main='nj tree w/ bootstrap support' )
ape::drawSupportOnEdges( value = nj_tree_bs$node.label )
# HINT: Save the tree and look at it in `Figtree` - it might be clearer!
nj_tree$node.label <- boots$BP
ape::write.tree( phy = nj_tree, file='nadh6.8apes.bs.tree')
# Get the majority-rule consensus tree
con1 <- ape::consensus( boots$trees, p = 0.5, check.labels = TRUE)
ape::plot.phylo( x = ape::root( phy = con1, outgroup = outg, resolve.root = TRUE ),
main = 'majority-rule consensus' )
# Get the strict consensus tree
con2 <- ape::consensus( boots$trees, p = 1, check.labels = TRUE )
ape::plot.phylo( x = ape::root( phy = con2, outgroup = outg, resolve.root = TRUE ),
main = 'strict consensus' )
# EXERCISE 3:
#
# 1. Why are the majority-rule and strict consensus trees different?
#
# 2. Which split has the lowest support according to bootstrap analysis?
# ---------- GETTING BOOTSTRAP SUPPORT VALUES OF TREE (ML) ----------
# Load the alignment (ape-package requires use of the `read.dna` function)
aln <- ape::read.dna( file = 'nadh6.8apes.aln.fasta', format = 'fasta' )
# A function to estimate the ML tree for this alignment
# See section 'ESTIMATE TREE BY MAXIMUM LIKELIHOOD' above for details
# (model is JC in this example, but you can change)
get_ml_tree <- function(x) {
x_pd <- phangorn::as.phyDat( x = x )
fit_ini <- phangorn::pml( tree = ape::nj( phangorn::dist.hamming( x = x_pd ) ),
data = x_pd, model = 'JC' )
fit <- phangorn::optim.pml( object = fit_ini, optNni = TRUE, model = 'JC' )
return ( fit$tree )
}
# Run the function to get the tree for the real data
ml_tree <- get_ml_tree( x = aln )
# Look at the tree
ape::plot.phylo( x = ml_tree )
# Calculate the bootstrap support for each split in the tree.
# The `boot.phylo` function is provided by the ape package.
# The function passed to `FUN` should be exactly what you did
# on the real data to estimate the tree.
# Here, we pass the `get_ml_tree` function we defined above:
boots <- ape::boot.phylo( phy = ml_tree,
x = aln,
FUN = function( bs_aln ) get_ml_tree( x = bs_aln ),
trees = TRUE )
# Plot the tree with bootstrap support
ape::plot.phylo( x = ml_tree, main = 'ml tree w/ bootstrap support' )
ape::add.scale.bar()
# SAC-220207: Not used anymore as this prints an extra bootstrap value for the root
# on the tree, which should not be printed.
#ape::nodelabels( boots$BP )
ape::drawSupportOnEdges( boots$BP )
# Save the tree with BS values and look at it in `Figtree`.
ml_tree$node.label <- boots$BP
ape::write.tree( phy = ml_tree, file = 'nadh6.8apes.ml.bs.tree' )
# ---------- LRT OF MOLECULAR CLOCK ----------
# Test the molecular clock hypothesis
aln <- phangorn::read.phyDat( file = 'nadh6.8apes.aln.fasta', format = 'fasta')
# Optimise the branch lengths under the molecular clock
H0_ini <- phangorn::pml( tree = phangorn::upgma( phangorn::dist.hamming(x = aln ) ),
data = aln, model = 'JC' )
H0_opt <- phangorn::optim.pml( object = H0_ini, optNni = TRUE,
optRooted = TRUE, model = 'JC' )
# Optimise the branch lengths without the molecular clock
H1_ini <- phangorn::pml( tree = phangorn::upgma( phangorn::dist.hamming( x = aln ) ),
data = aln, model = 'JC' )
H1_opt <- phangorn::optim.pml( object = H1_ini, optNni = TRUE,
optRooted=FALSE, model = 'JC' )
# Q: Plot the clock tree and unconstrained tree
# EXERCISE 4:
#
# Perform the likelihood ratio test and answer the questions below:
#
# 1. How do we calculate the test statistic? What is the value?
#
# 2. Calculate the degrees of freedom between the simpler and more complex model.
#
# 3. What is the p-value given the value of test statistic?
# HINT: you can use `pchisq`!
#
# 4. Do we accept or reject the hypothesis of the molecular clock?
# --------- EXTRA: USE PHYML ONLINE TO GET THE ML TREE WITH BOOTSTRAP ----------
# EXERCISE 4:
#
# Visit http://www.atgc-montpellier.fr/phyml/ in your browser
#
# First, convert the FASTA-format alignment to PHYLIP-format
aln <- ape::read.dna( file = 'nadh6.8apes.aln.fasta', format = 'fasta' )
ape::write.dna( x = aln, file = 'nadh6.8apes.aln.phylip', format = 'interleaved' )
# Have a look at the PHYLIP file and compare with FASTA format
# Then, upload the PHYLIP file as input data
#
# Select the options:
# a) Let PhyML automatically select the substitution model
# b) Get bootstrap support for the tree
# c) Leave the rest as defaults
#
# Finally, submit the job and wait for results in your email
# 1. What was the best model according to the PhyML selection criteria?
#
# 2. What was the maximum log-likelihood of the data given the model?
#
# 3. Is there any nucleotide bias?
#
# 4. How does the PhyML tree & support values compare with the tree you estimated above?
# (You can use FigTree to open the tree file - add the bootstrap support 'label' to the nodes)
# Which split has lowest bootstrap support?