analysis.R
source("../../../../../../GitHub/polyDFE/postprocessing.R")
###############################################################################
# investigate polyDFE output
###############################################################################
# parse polyDFE output
est = parseOutput("central_chimp_A.txt")
# est has 4 different runs of polyDFE
length(est)
# each run contains information about
# the input file that polyDFE was run on
# the DFE model used, the best likelihood found and the corresponding gradient
# the values of all parameters (including those that were fixed and not estimated)
# the parameters that were estimated, the number of sampled sequences in the data n
# various expectations under the parameters and the estimated alpha
names(est[[1]])
# the input file used
est[[1]]$input
# the model used
est[[1]]$model
# the likelihood and gradient
est[[1]][c("lk", "grad")]
# the values of all parameters
est[[1]]$values
# the parameters polyDFE estimated
est[[1]]$estimated
# the values of the estimated parameters
est[[1]]$values[est[[1]]$estimated]
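# a minimal sketch: the same extraction applied to every run in est
lapply(est, function(e) e$values[e$estimated])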
# est contains, for both neutral and selected sites,
rownames(est[[1]]$expec)
# the expected SFS, divergence counts (if relevant) and misattributed polymorphism
colnames(est[[1]]$expec)
# the estimated alpha
est[[1]]$alpha
# combine the runs from all three DFE model files
est = c(parseOutput("central_chimp_A.txt"), parseOutput("central_chimp_C.txt"),
parseOutput("central_chimp_Del.txt"))
# what are the gradients?
grad = sapply(est, function(e) e$grad)
names(grad) = sapply(est, getModelName)
grad
# which gradients are larger than 0.001?
grad[which(grad > 0.001)]
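# a large gradient suggests that the corresponding run may not have fully
# converged, so re-running polyDFE with different starting values can help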
###############################################################################
# discretize the DFE
###############################################################################
getDiscretizedDFE(est[[1]])
getDiscretizedDFE(est[[1]], c(-100, -10, -1, 0, 1))
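# the breaks above are assumed to delimit intervals of the scaled selection
# coefficient S = 4*Ne*s, i.e. S < -100, [-100, -10), [-10, -1), [-1, 0),
# [0, 1) and S >= 1, with one probability mass returned per interval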
# we can easily calculate the discretized DFE for different models
dfe = t(sapply(est, getDiscretizedDFE, c(-100, -10, -1, 0, 1)))
# give names to the rows in dfe to use in the barplot legend
rownames(dfe) = sapply(est, getModelName)
dfe
# initialize plotting area
pdf("fig1.pdf", width = 12.6, height = 5)
par(mfrow = c(1, 2), mar = c(2, 2, 4, 0) + 0.1, xpd = TRUE)
barplot(dfe, beside = TRUE, legend.text = TRUE,
angle = rep(c(0, 45, -45), each = 4), density = 20, col = 1:4)
# applying log scale on the y-axis enables an easier visual comparison
# for that, we need to replace 0s with NAs
dfe[which(dfe == 0)] = NA
col = rainbow(n = nrow(dfe), end = 0.9)
barplot(dfe, beside = TRUE, log = 'y', col = col)
# draw x axis
axis(1, lwd.tick = 0, labels = FALSE)
# add legend
legend(5, 9, legend = rownames(dfe), fill = col, bty = "n", ncol = 3)
dev.off()
###############################################################################
# calculate alpha
###############################################################################
est = parseOutput("central_chimp_C.txt")
# extract alpha for all the models in est
sapply(est, function(e) e$alpha)
# when a deleterious DFE is inferred, alpha_dfe is always 0
est = parseOutput("central_chimp_Del.txt")
sapply(est, function(e) e$alpha["alpha_dfe"])
# calculate both alpha_div and alpha_dfe using R
est = c(parseOutput("central_chimp_A.txt"), parseOutput("central_chimp_C.txt"),
parseOutput("central_chimp_Del.txt"))
# first, parse divergence data from the polyDFE input file
div = parseDivergenceData("central_chimp_sfs")
# extract and calculate alpha for all the models in est
alpha = sapply(est, function(e) c(e$alpha["alpha_dfe"],
"R alpha_dfe" = estimateAlpha(e),
e$alpha["alpha_div"],
"R alpha_div" = estimateAlpha(e, div = div)))
summary(alpha["alpha_dfe", ] - alpha["R alpha_dfe", ])
summary(alpha["alpha_div", ] - alpha["R alpha_div", ])
# control the correction for misattributed polymorphism
# calculate alpha with and without the correction for all the models in est
alpha = sapply(est, function(e) c("+ corr" = estimateAlpha(e, div = div),
"- corr" = estimateAlpha(e, div = div, poly = FALSE)))
summary(alpha["+ corr", ] - alpha["- corr", ])
# control the S_sup limits
est = c(parseOutput("central_chimp_A.txt"), parseOutput("central_chimp_C.txt"))
alpha = sapply(est, function(e) c("supLimit = 0" = estimateAlpha(e),
"supLimit = 5" = estimateAlpha(e, supLimit = 5),
"supLimit = 10" = estimateAlpha(e, supLimit = 10)))
summary(alpha["supLimit = 0", ] - alpha["supLimit = 5", ])
summary(alpha["supLimit = 5", ] - alpha["supLimit = 10", ])
###############################################################################
# hypothesis testing
###############################################################################
# compare sequentially models in two different files
compareModels("central_chimp_A.txt", "central_chimp_Del.txt")
# compare models within the same file
est = parseOutput("central_chimp_A.txt")
compareModels(est[3], est[4])$LRT
# enforce nestedness
compareModels("central_chimp_C.txt", "central_chimp_Del.txt")$LRT
compareModels("central_chimp_C.txt", "central_chimp_Del.txt", nested = TRUE)
###############################################################################
# r can account for polarization error
###############################################################################
# when r is not estimated, eps_an is estimated to be non-zero
# when r is estimated, eps_an is estimated to be zero
# AIC and LRT support this
aic = list()
for (f in c("central_chimp_A.txt", "central_chimp_C.txt", "central_chimp_Del.txt"))
{
est = parseOutput(f)
without_r = compareModels(est[1], est[2])
with_r = compareModels(est[3], est[4])
aic[[f]] = rbind("- r" = c(without_r$AIC[, "AIC model 1"], without_r$AIC[, "AIC model 2"],
without_r$LRT[, "p-value"], est[[2]]$values["eps_an"]),
"+ r" = c(with_r$AIC[, "AIC model 1"], with_r$AIC[, "AIC model 2"],
with_r$LRT[, "p-value"], est[[4]]$values["eps_an"]))
}
aic
###############################################################################
# model averaging
###############################################################################
est = c(parseOutput("central_chimp_A.txt"),
parseOutput("central_chimp_C.txt"),
parseOutput("central_chimp_Del.txt"))
# obtain the AIC weights for all the models in est
aic = getAICweights(est)
aic
# calculate model-averaged alpha
alpha_div = sum(sapply(1:length(est),
function(i) aic[i, "weight"] * est[[i]]$alpha["alpha_div"]))
alpha_dfe = sum(sapply(1:length(est),
function(i) aic[i, "weight"] * est[[i]]$alpha["alpha_dfe"]))
alpha_div
alpha_dfe
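# equivalently, a sketch of the same averaging written as a single weighted sum
sum(aic[, "weight"] * sapply(est, function(e) e$alpha["alpha_div"]))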
###############################################################################
# should divergence data be used?
###############################################################################
# delta AIC and alpha values are impacted by the type of data used
est_div = c(parseOutput("central_chimp_A.txt"),
            parseOutput("central_chimp_C.txt"),
            parseOutput("central_chimp_Del.txt"))
est_no_div = c(parseOutput("central_chimp_A_no_div.txt"),
               parseOutput("central_chimp_C_no_div.txt"),
               parseOutput("central_chimp_Del_no_div.txt"))
est = list(est_div, est_no_div)
for (i in 1:2)
{
# calculate aic weights
aic = getAICweights(est[[i]])
# set model names to the rows
rownames(aic) = sapply(est[[i]], getModelName)
aic = cbind(aic, t(sapply(est[[i]], function(e) e$alpha)))
# re-order by delta AIC
aic = aic[order(aic[, "delta AIC"]), ]
# visualize aic and estimated alpha
print(aic)
# calculate model-averaged alpha_dfe
print(sum(apply(aic, 1, function(a) a["weight"] * a["alpha_dfe"])))
}
# compare visually the SFS
# parse the observed SFS
sfs = parseSFSData("central_chimp_sfs")
sfs
# parse expected SFS from polyDFE for best models according to AIC
est = parseOutput("central_chimp_C.txt")[[3]]
sfs_div = est$expec
sfs_no_div = parseOutput("central_chimp_A_no_div.txt")[[3]]$expec
# normalize the expected SFS by multiplying by the number of sites analyzed
sfs_div = sfs_div * sfs[, "length_sfs"]
sfs_no_div = sfs_no_div * sfs[, "length_sfs"]
# sample size
n = max(grep("E[P", colnames(sfs_div), fixed = TRUE))
# calculate range of the y-axis for the plot
m = min(sfs[, 1:n], sfs_div[, 1:n], sfs_no_div[, 1:n])
M = max(sfs[, 1:n], sfs_div[, 1:n], sfs_no_div[, 1:n])
# initialize plot, make it log-scale on the y-axis
pdf("fig2.pdf", width = 12.6, height = 4)
par(mfrow = c(1, 1), xpd = FALSE, mar = c(4, 4, 0, 0) + 0.1)
plot(1, 1, type = "n", ylim = c(m, M), xlim = c(1, n),
xlab = "i", ylab = "SFS", log = "y")
# create background colors to easily differentiate between entries in the SFS
xleft = seq(from = 1, to = n, by = 2)
rect(xleft = xleft - 0.5, ybottom = m * 0.001,
xright = xleft + 0.5, ytop = M * 100,
col = gray(0.9), border = NA)
# re-draw box around plot
box()
# plot both the neutral and selected SFS
for (i in 1:2)
{
# first, the expected SFS when divergence data was used
points(1:n - 0.15, sfs_div[i, 1:n], col = "red", pch = 19 + i)
# then the expected SFS when divergence data was not used
points(1:n + 0.15, sfs_no_div[i, 1:n], col = "blue", pch = 19 + i)
# then the observed SFS
segments(x0 = 1:n - 0.35, x1 = 1:n + 0.35, y0 = sfs[i, 1:n], col = "black")
}
# add legend
legend("topright", bty = "n", pch = c(NA, 21, 21, 20, 21), lwd = c(1, NA, NA, NA, NA),
col = c("black", "red", "blue", "black", "black"),
legend = c("obs", "expec with div", "expec without div", "neutral", "selected"))
dev.off()
# perform goodness-of-fit test
sfs_both = rbind(sfs_div, sfs_no_div)
fit = sapply(1:nrow(sfs_both),
function(i)
unlist(chisq.test(x = sfs_both[i, 1:n],
p = sfs[(i + 1) %% 2 + 1, 1:n],
rescale.p = TRUE)[c("statistic", "p.value")]))
colnames(fit) = paste(rep(c("div", "no div"), each = 2), rownames(sfs_div))
fit["p.value", ]
# check overall fit
c(div = sum(fit["statistic.X-squared", 1:2]),
no_div = sum(fit["statistic.X-squared", 3:4]))
# use model-averaged expected SFS
est = list(est_div, est_no_div)
expec_sfs = list()
for (i in 1:2)
{
# calculate aic weights
aic = getAICweights(est[[i]])
# calculate model-averaged sfs
expec = lapply(1:length(est[[i]]),
function(j) aic[j, "weight"] * est[[i]][[j]]$expec)
expec_sfs[[i]] = sapply(1:n,
function(i)
rowSums(sapply(expec, function(l) l[, i])))
# normalize the expected SFS by multiplying by the number of sites analyzed
expec_sfs[[i]] = expec_sfs[[i]] * sfs[, "length_sfs"]
}
# perform goodness-of-fit test on the model-averaged expected SFS
sfs_both = rbind(expec_sfs[[1]], expec_sfs[[2]])
fit = sapply(1:nrow(sfs_both),
function(i)
unlist(chisq.test(x = sfs_both[i, 1:n],
p = sfs[(i + 1) %% 2 + 1, 1:n],
rescale.p = TRUE)[c("statistic", "p.value")]))
colnames(fit) = paste(rep(c("div", "no div"), each = 2), rownames(sfs_div))
fit["p.value", ]
# check overall fit
c(div = sum(fit["statistic.X-squared", 1:2]),
no_div = sum(fit["statistic.X-squared", 3:4]))