-
Notifications
You must be signed in to change notification settings - Fork 10
/
gene_mutation_rates.R
366 lines (318 loc) · 16.1 KB
/
gene_mutation_rates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#' Use dNdScv with tissue-specific covariates to calculate gene-level mutation rates
#'
#' This function calculates gene-level neutral mutation rates based on counts
#' of nonsynonymous and synonymous mutations per gene under the dNdScv package's model,
#' as described in \href{https://doi.org/10.1016/j.cell.2017.09.042}{Martincorena et al.}
#'
#' @param cesa CESAnalysis object
#' @param covariates Tissue-specific mutation rate covariates. Typically, supply the
#' covariates object from your refset (e.g., ces.refset.hg19$covariates$bladder), or the
#' object name ("bladder"). Run list_ces_covariates() to see choices. For hg19 data
#' only, set to "hg19" to use dNdScv's non-tissue-specific covariates. If no appropriate
#' covariates data are available, set to NULL to run without. Finally, you can also
#' supply custom covariates data in the form of a matrix or prcomp object (see website
#' for details).
#' @param samples Which samples to include in the current run. Defaults to all samples. Can be a
#' vector of Unique_Patient_Identifiers, or a data.table containing rows from the CESAnalysis
#' sample table.
#' @param dndscv_args Custom arguments to pass to dNdScv. (The arguments \code{mutations},
#' \code{gene_list}, \code{cv}, and \code{refdb} are supplied by cancereffectsizeR and can't be
#' substituted.)
#' @param save_all_dndscv_output Default FALSE; when TRUE, saves all dndscv output, not
#' just what's needed by cancereffectsizeR. (Full output can be very large, in the gigabytes.)
#' @return CESAnalysis object with gene-level mutation rates calculated
#' @export
# don't change this function at all without being sure you're not messing up tests
gene_mutation_rates <- function(cesa, covariates = NULL, samples = character(), dndscv_args = list(),
save_all_dndscv_output = FALSE) {
if (! is(cesa, "CESAnalysis")) {
stop("cesa expected to be a CESAnalysis object", call. = F)
}
cesa = copy_cesa(cesa)
cesa = update_cesa_history(cesa, match.call())
sample_info = select_samples(cesa, samples)
# select MAF records to use for gene mutation rate calculation (via dNdScv)
# for now, records from TGS samples are kept out; in the future, we could check out dNdScv's panel sequencing features
curr_run_grp_samples = sample_info$Unique_Patient_Identifier
sample_info = sample_info[coverage %in% c("exome", "genome")]
if(sample_info[, .N] == 0) {
stop("Cannot run dNdSCv because there are no whole-exome or whole-genome samples among the input samples.")
}
# Don't allow rates to be overwritten (too confusing for now to risk overlapping sample group specification in sequential calls)
if (! all(is.na(sample_info$gene_rate_grp))) {
msg = paste0("Gene mutation rates have already been calculated for some or all samples specified. If you ",
"want to re-run, run clear_gene_rates() first.")
stop(pretty_message(msg, emit = F))
}
dndscv_samples = sample_info$Unique_Patient_Identifier
if (! is(dndscv_args, "list") || uniqueN(names(dndscv_args)) != length(dndscv_args)) {
stop("dndscv_args should a named list of arguments to pass.")
}
reserved_args = c("mutations", "gene_list", "cv", "refdb")
if(any(reserved_args %in% names(dndscv_args))) {
stop("The following arguments are passed to dNdScv automatically and cannot be specified via ",
"dndscv_args:\n", paste(reserved_args, collapse = ", "), '.')
}
RefCDS = .ces_ref_data[[cesa@ref_key]]$RefCDS.dndscv
gr_genes = .ces_ref_data[[cesa@ref_key]]$gr_genes.dndscv
if(is.null(RefCDS)) {
RefCDS = .ces_ref_data[[cesa@ref_key]]$RefCDS
gr_genes = .ces_ref_data[[cesa@ref_key]]$gr_genes
full_gr_genes = gr_genes
} else {
full_gr_genes = .ces_ref_data[[cesa@ref_key]]$gr_genes
}
using_cds_rates = "gene" %in% names(GenomicRanges::mcols(gr_genes))
skip_covariate_validation = FALSE
if(is.null(covariates)){
skip_covariate_validation = TRUE
cv = NULL
genes_in_pca = NULL
warning("Calculating gene mutation rates with no covariate data; stop and re-run with covariates if available.\n",
"(Check with list_ces_covariates(); for hg19 only, you can also specify \"default\" for dNdScv default\n",
"covariates.)", call. = F, immediate. = T)
} else if (is(covariates, "character") && (covariates[1] == "default" || covariates[1] == "hg19")) {
skip_covariate_validation = ! using_cds_rates # will need to handle gene/CDS issue if using gene-based dNdScv covariates with CDS-based refset
if(cesa@advanced$genome_info$build_name == 'hg19') {
pretty_message("Loading dNdScv default covariates for hg19 (stop and re-run with tissue-specific covariates if available)...")
data("covariates_hg19", package = "dndscv", envir = environment())
cv = covs # dNdScv object from data() call
genes_in_pca <- rownames(cv)
} else {
stop("There is no default covariates data for this genome build, so you'll need to supply your own\n",
"or run without by setting covariates = NULL.", call. = F)
}
} else if (is(covariates, "prcomp") || is(covariates, 'matrix')) {
# To-do: validate custom covariates
cv = covariates
if(is(cv, "prcomp")) {
cv = cv$rotation
}
genes_in_pca = rownames(cv)
} else if (! is(covariates, "character") || length(covariates) != 1) {
stop("covariates expected to be 1-length character. Check available covariates with list_ces_covariates()")
} else {
covariates = paste0("covariates/", covariates)
if(! check_for_ref_data(cesa, covariates)) {
stop("Covariates could not be found. Check available covariates with list_ces_covariates().")
}
this_cov_pca = get_ref_data(cesa, covariates)
cv = this_cov_pca$rotation
genes_in_pca = rownames(cv)
}
# When gr_genes is protein-based, need to track which pids go with each gene identifier
if(using_cds_rates) {
# If rates are by CDS, synch-up gene-based covariates by assuming same covariates across each gene.
# In the future, CDS-based (or, maybe better, coordinate-based) covariates may be supported.
pid_to_gene = unique(as.data.table(GenomicRanges::mcols(gr_genes)))
setnames(pid_to_gene, 'names', 'pid')
}
# Validate covariates
if (! skip_covariate_validation) {
if (using_cds_rates) {
# For compatibility with cancereffectsizeR covariates that split isoforms for CDKN2A
# (The isoforms use the same covariates, so we can just copy one.)
current_cv_genes = rownames(cv)
if (pid_to_gene["CDKN2A", .N, on = "gene"] > 0 && ! "CDKN2A" %in% current_cv_genes & "CDKN2A.p14arf" %in% current_cv_genes) {
to_copy = cv["CDKN2A.p14arf", , drop = F]
rownames(to_copy) = "CDKN2A"
cv = rbind(cv, to_copy)
}
present_genes = intersect(pid_to_gene$gene, rownames(cv)) # some may be missing, which gets dealt with later
present_pid_to_gene = pid_to_gene[present_genes, on = "gene"]
cv = cv[present_pid_to_gene$gene, ]
rownames(cv) = genes_in_pca = present_pid_to_gene$pid
}
if(is.null(genes_in_pca)) {
stop("Covariates matrix must have gene names as rownames.")
}
if (length(genes_in_pca) != uniqueN(genes_in_pca)) {
stop("Some gene names are repeated in covariates matrix rownames.")
}
bad_genes = setdiff(genes_in_pca, get_ref_data(cesa, "gene_names"))
# Deal with ces.refset.hg19's special handling of CDKN2A
if ("CDKN2A" %in% bad_genes && cesa@ref_key == 'ces.refset.hg19' &&
! any(c("CDKN2A.p14arf", "CDKN2A.p16INK4a") %in% genes_in_pca)) {
to_insert = cv[c("CDKN2A", "CDKN2A"), ]
rownames(to_insert) = c("CDKN2A.p14arf", "CDKN2A.p16INK4a")
cv = cv[! rownames(cv) == "CDKN2A",]
cv = rbind(cv, to_insert)
bad_genes = setdiff(bad_genes, "CDKN2A")
genes_in_pca = rownames(cv)
}
# On CDS refsets, pid and gene names are not really properly validated yet
prop_bad = length(bad_genes)/length(genes_in_pca)
if(prop_bad > 0.2 && ! using_cds_rates) {
if(prop_bad == 1) {
stop('None of the gene names in the reference data are present in the covariates data.')
}
if(length(bad_genes) > 30) {
bad_genes = c(bad_genes[1:30], '...')
}
warning("Rownames of covariates matrix include genes not present in reference data:\n",
paste(bad_genes, collapse = ', '), '.')
}
if(length(bad_genes) > 0 && ! using_cds_rates) {
good_genes = intersect(setdiff(genes_in_pca, bad_genes), names(RefCDS))
genes_in_pca = good_genes
cv = cv[genes_in_pca, ]
}
}
mutations = cesa@maf[Unique_Patient_Identifier %in% dndscv_samples & variant_type == "snv",
.(Unique_Patient_Identifier, Chromosome, Start_Position, Reference_Allele, Tumor_Allele)]
if(mutations[, .N] == 0) {
stop("Can't run dNdScv because there are no usable SNV mutations in the input samples.")
}
# Run in separate function for quick unit tests of gene_mutation_rates
dndscv_args = c(list(mutations = mutations, gene_list = genes_in_pca, cv = cv, refdb = RefCDS,
gr_genes = gr_genes),
dndscv_args)
dndscv_output = do.call(run_dndscv, dndscv_args)
# Will want to get CIs on gene mutation rates. (Encapsulating use of predict for testing purposes.)
fit = get_dndscv_model_fit(dndscv_output)
theta = dndscv_output$nbreg$theta # parameter controlling variability in rates across genes
lower = exp(fit$fit - 1.96 * fit$se.fit)
upper = exp(fit$fit + 1.96 * fit$se.fit)
# Note exp(fit$fit) will be equivalent to dndscv_output$genemuts$exp_syn_cv
mle_rate = dndscv_output$genemuts$exp_syn_cv
# Get RefCDS data on number of synonymous mutations possible at each site
# Per dNdScv docs, L matrices list "number of synonymous, missense, nonsense and splice sites in each CDS at each trinucleotide context"
message("Extracting gene mutation rates from dNdScv output...")
dndscv_sample_num = uniqueN(dndscv_output$annotmuts$sampleID)
actual_syn_counts = dndscv_output$genemuts$n_syn
dndscv_gene_names = dndscv_output$genemuts$gene_name
nsyn_sites = sapply(RefCDS[dndscv_gene_names], function(x) colSums(x[["L"]])[1])
if(dndscv_output$nbreg$theta>1){
# see page e4 of dNdScv paper (Martincorena 2017, Cell)
mutrates_vec <- ((actual_syn_counts + theta - 1) /
(1 + (theta / mle_rate))) /
nsyn_sites / dndscv_sample_num
lower_rates = ((actual_syn_counts + theta - 1) /
(1 + (theta / lower))) /
nsyn_sites / dndscv_sample_num
upper_rates = ((actual_syn_counts + theta - 1) /
(1 + (theta / upper))) /
nsyn_sites / dndscv_sample_num
} else{
# theta should definitely be >1, so none of this should really happen
mutrates_vec = pmax(mle_rate,
(actual_syn_counts + theta - 1) / (1 +(theta / mle_rate))) /
nsyn_sites / dndscv_sample_num
# No CIs when theta<1
upper_rates = rep(NA, length(mutrates_vec))
lower_rates = upper_rates
}
curr_rate_group = as.integer(max(c(0, na.omit(cesa@samples$gene_rate_grp))) + 1)
rate_grp_colname = paste0("rate_grp_", curr_rate_group)
mutrates_dt = data.table(gene = dndscv_output$genemuts$gene_name, rate = mutrates_vec,
low = lower_rates, high = upper_rates)
ci_low_name = paste0(rate_grp_colname, '_95_low')
ci_high_name = paste0(rate_grp_colname, '_95_high')
setnames(mutrates_dt, c("rate", "low", "high"), c(rate_grp_colname, ci_low_name, ci_high_name))
# Genes in gr_genes that are not present in the covariates data (a few are missing,
# usually), won't get rates calculated by dNdScv. Here, we assign them the rate of the
# nearest gene, as measured by center-to-center distance. (And, in the case of CDS
# rates, this same logic works.)
if(using_cds_rates || is.null(full_gr_genes$gene)) {
missing_genes = setdiff(GenomicRanges::mcols(full_gr_genes)["names"][,1], mutrates_dt$gene)
} else {
missing_genes = setdiff(GenomicRanges::mcols(full_gr_genes)["gene"][,1], mutrates_dt$gene)
}
tmp = as.data.table(gr_genes)
gene_intervals = tmp[, .(chr = as.character(seqnames[1]), start = min(start), end = max(end)), by = "names"]
setnames(gene_intervals, "names", "gene")
gene_intervals[, center := start + trunc((end - start)/2)]
gene_intervals = gene_intervals[order(chr, center)]
present_genes = gene_intervals[! gene %in% missing_genes]
not_present = gene_intervals[gene %in% missing_genes]
# find nearest genes by chr/center to the missing genes from the present genes,
# and eliminate the rare tie by taking the first match for each gene
nearest_genes = present_genes[not_present, on = c("chr", "center"), roll = "nearest"]
nearest_genes = nearest_genes[! duplicated(i.gene)]
setkey(mutrates_dt, "gene")
nearest_rates = mutrates_dt[nearest_genes$gene, -"gene"]
missing_rates = cbind(gene = nearest_genes$i.gene, nearest_rates)
mutrates_dt = rbind(mutrates_dt, missing_rates)
# convert dNdScv's main output to data.table
setDT(dndscv_output$sel_cv)
cesa@samples[curr_run_grp_samples, gene_rate_grp := curr_rate_group, on = "Unique_Patient_Identifier"]
if(using_cds_rates) {
setnames(mutrates_dt, 'gene', 'pid')
mutrates_dt[pid_to_gene, gene := gene, on = 'pid']
setcolorder(mutrates_dt, c('pid', 'gene'))
setnames(dndscv_output$sel_cv, 'gene_name', 'pid')
dndscv_output$sel_cv[mutrates_dt, gene := gene, on = 'pid']
}
# keep just the main gene-level selection output from dNdScv, unless user wanted everything
if(! save_all_dndscv_output) {
dndscv_output = dndscv_output$sel_cv
}
if (curr_rate_group > 1) {
if (using_cds_rates) {
cesa@mutrates = cesa@mutrates[mutrates_dt[, -"gene"], on = "pid"]
} else {
cesa@mutrates = cesa@mutrates[mutrates_dt, on = "gene"]
}
} else {
cesa@mutrates = mutrates_dt
}
dndscv_output = list(dndscv_output)
names(dndscv_output) = rate_grp_colname
cesa@dndscv_out_list = c(cesa@dndscv_out_list, dndscv_output)
return(cesa)
}
#' This little function called by gene_mutation_rates() is separated for testing purposes.
#'
#' @keywords internal
get_dndscv_model_fit = function(dndscv_output) {
predict(dndscv_output$nbreg, se.fit = T)
}
#' Internal function to run dNdScv
#'
#' @keywords internal
run_dndscv = function(mutations, gene_list, cv, refdb, gr_genes, ...) {
# hacky way of forcing an object of name gr_genes into the dndscv::dndscv function environment,
# since the object is required by dndscv but there's no argument to supply your own copy of it
our_env = new.env(parent = environment(dndscv::dndscv))
our_env$gr_genes = gr_genes
our_dndscv = dndscv::dndscv
environment(our_dndscv) = our_env
message("Running dNdScv...")
withCallingHandlers(
{
dndscv_raw_output = our_dndscv(mutations = mutations, gene_list = gene_list, cv = cv, refdb = refdb, ...)
}, error = function(e) {
if (startsWith(conditionMessage(e), "bad 'file' argument")) {
stop("You need to update dNdScv. Try running \"remotes::update_packages(packages = \"dndscv\")\".")
}
}, warning = function(w) {
dndscv_msg = conditionMessage(w)
# Recurrent mutations are presumed real in CES
# As of 09/03/20, dNdScv's MNV check is flawed (and CES has already handled them in load_maf())
if (startsWith(dndscv_msg, "Same mutations observed in different sampleIDs") ||
startsWith(dndscv_msg, "Mutations observed in contiguous sites")) {
invokeRestart("muffleWarning")
}
}
)
return(dndscv_raw_output)
}
#' Clear regional mutation rates
#'
#' Remove all gene/coding region mutation rates, usually in order to re-run with different
#' parameters without having to create a new CESAnalysis.
#'
#' @param cesa CESAnalysis
#' @return The CESAnalysis with rates cleared
#' @export
clear_gene_rates = function(cesa = NULL) {
if (! is(cesa, "CESAnalysis")) {
stop("cesa should be a CESAnalysis")
}
cesa = copy_cesa(cesa)
cesa@dndscv_out_list = list()
cesa@mutrates = data.table()
cesa@samples[, gene_rate_grp := NA_integer_]
cesa = update_cesa_history(cesa, match.call())
return(cesa)
}