The goal of mmsig is to provide a flexible and easily interpretable mutational signature analysis tool. mmsig was developed for hematological malignancies, but can be extended to any cancer with a well-known mutational signature landscape.
mmsig is based on an expectation maximization algorithm for mutational signature fitting and applies cosine similarities for dynamic error suppression as well as bootstrapping-based confidence intervals and assessment of transcriptional strand bias.
Citation: Rustad, E.H., Nadeu, F., Angelopoulos, N. et al. mmsig: a fitting approach to accurately identify somatic mutational signatures in hematological malignancies. Commun Biol 4, 424 (2021). https://doi.org/10.1038/s42003-021-01938-0
You can install the development version from GitHub with:
# install.packages("devtools")
devtools::install_github("evenrus/mmsig")
This is a basic example which shows mmsig usage
library(mmsig)
#> Loading required package: BSgenome.Hsapiens.UCSC.hg19
#> Loading required package: BSgenome
#> Loading required package: BiocGenerics
#> Loading required package: parallel
#>
#> Attaching package: 'BiocGenerics'
#> The following objects are masked from 'package:parallel':
#>
#> clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
#> clusterExport, clusterMap, parApply, parCapply, parLapply,
#> parLapplyLB, parRapply, parSapply, parSapplyLB
#> The following objects are masked from 'package:stats':
#>
#> IQR, mad, sd, var, xtabs
#> The following objects are masked from 'package:base':
#>
#> anyDuplicated, append, as.data.frame, basename, cbind, colnames,
#> dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
#> grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
#> order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
#> rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
#> union, unique, unsplit, which.max, which.min
#> Loading required package: S4Vectors
#> Loading required package: stats4
#>
#> Attaching package: 'S4Vectors'
#> The following object is masked from 'package:base':
#>
#> expand.grid
#> Loading required package: IRanges
#> Loading required package: GenomeInfoDb
#> Loading required package: GenomicRanges
#> Loading required package: Biostrings
#> Loading required package: XVector
#>
#> Attaching package: 'Biostrings'
#> The following object is masked from 'package:base':
#>
#> strsplit
#> Loading required package: rtracklayer
data(mm_5_col)
data(signature_ref)
# remove canonical AID (SBS84) for genome-wide analysis
# remove the platinum signature (SBS35) because the patients are not platinum exposed
sig_ref <- signature_ref[c("sub", "tri", "SBS1", "SBS2", "SBS5", "SBS8",
"SBS9", "SBS13", "SBS18", "SBS-MM1")]
head(sig_ref)
#> sub tri SBS1 SBS2 SBS5 SBS8 SBS9 SBS13 SBS18
#> 1 C>A ACA 0.000886 5.80e-07 0.01200 0.04410 0.000558 0.001820 0.05150
#> 2 C>A ACC 0.002280 1.48e-04 0.00944 0.04780 0.004090 0.000721 0.01580
#> 3 C>A ACG 0.000177 5.23e-05 0.00185 0.00462 0.000426 0.000264 0.00243
#> 4 C>A ACT 0.001280 9.78e-05 0.00661 0.04700 0.003050 0.000348 0.02140
#> 5 C>A CCA 0.000312 2.08e-04 0.00743 0.04010 0.004800 0.001400 0.07400
#> 6 C>A CCC 0.001790 9.53e-05 0.00614 0.03880 0.001920 0.000968 0.01960
#> SBS-MM1
#> 1 0.00000382
#> 2 0.00000696
#> 3 0.00005140
#> 4 0.00003070
#> 5 0.02199082
#> 6 0.00000258
# subset three samples to reduce run time
mm_5_col_subset <- mm_5_col[mm_5_col$sample %in% c("MEL_PD26412a", "MEL_PD26411c", "PD26414a"),]
head(mm_5_col_subset)
#> sample chr pos ref alt
#> 96207 MEL_PD26411c chr1 1606928 G C
#> 96208 MEL_PD26411c chr1 2900399 C T
#> 96209 MEL_PD26411c chr1 3003910 G A
#> 96210 MEL_PD26411c chr1 3085231 A G
#> 96211 MEL_PD26411c chr1 3435711 A G
#> 96212 MEL_PD26411c chr1 4074739 A T
# Bootstrapping large datasets with many iterations can significantly increase runtime.
set.seed(1)
sig_out <- mm_fit_signatures(muts.input=mm_5_col_subset,
sig.input=sig_ref,
input.format = "vcf",
sample.sigt.profs = NULL,
strandbias = TRUE,
bootstrap = TRUE,
iterations = 20, # 1000 iterations recommended for stable results
refcheck=TRUE,
cos_sim_threshold = 0.01,
force_include = c("SBS1", "SBS5"),
dbg=FALSE)
#> | | | 0% | |======================= | 33% | |=============================================== | 67% | |======================================================================| 100%
plot_signatures(sig_out$estimate,
samples = T,
sig_order = c("SBS1", "SBS2", "SBS13", "SBS5", "SBS8", "SBS9",
"SBS18", "SBS-MM1", "SBS35"))
bootSigsPlot(sig_out$bootstrap)
head(sig_out$strand_bias_mm1)
#> group transcribed untranscribed ratio p_poisson MM1_flag
#> 1 MEL_PD26411c 192 119 1.6134454 0.00004127438 *
#> 2 MEL_PD26412a 193 148 1.3040541 0.01705720483 *
#> 3 PD26414a 29 30 0.9666667 1.00000000000