diff --git a/DifferentialMutationAnalysis.R b/DifferentialMutationAnalysis.R index 0072201..2ddf4bf 100644 --- a/DifferentialMutationAnalysis.R +++ b/DifferentialMutationAnalysis.R @@ -23,6 +23,9 @@ # # DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0) # +# If you have the following three R packages: "data.table", "plyr", and +# "Matrix" you can set the flag usePackages to TRUE to significantly decrease +# file read time ############################################################################### #helper function to parse MAF file @@ -47,13 +50,13 @@ bins = function(v, p=100) { #compute unidirectional EMD between mutationas and variations uEMD = function(tBins, nBins, p=100) { - sum = 0 - move = 0 - for(i in p:2){ - move = move+tBins[i]-nBins[i] - sum = sum+max(move,0) - } - sum + sum = 0 + move = 0 + for(i in p:2){ + move = move+tBins[i]-nBins[i] + sum = sum+max(move,0) + } + sum } #generate random uEMDs to compute FDRs @@ -80,23 +83,35 @@ computeQ = function(FDRs, uEMDscores) { DifferentialMutationAnalysis = function(mafFile, geneType="all", p=5, outDir = "Output/", protFile = "Data/protNames.txt", - natBinFile = "Data/natDistBinned.txt") { + natBinFile = "Data/natDistBinned.txt", + usePackages = FALSE) { + + if(usePackages){ + library("data.table", quietly=TRUE) + library("plyr", quietly=TRUE) + library("Matrix", quietly=TRUE) + } #A list of protein names protNames = read.table(protFile, stringsAsFactors=FALSE)$V1 #load ranked binned natural variation count data - nRankBinned = read.table(natBinFile) + if(!usePackages){ + nRankBinned = read.table(natBinFile) + } + else{ + nRankBinned = fread(natBinFile, data.table=FALSE) + } #determine if we want to find all cancer genes or just oncogenes or TSGs if(geneType=="onco"){ - tCount = parseMaf(protNames, mafFile, "Missense_Mutation") + tCount = parseMaf(protNames, mafFile, usePackages, "Missense_Mutation") } if(geneType=="TSG"){ - tCount = parseMaf(protNames, mafFile, "Nonsense_Mutation") + tCount = parseMaf(protNames, mafFile, usePackages, "Nonsense_Mutation") } else{ - tCount = parseMaf(protNames, mafFile) + tCount = parseMaf(protNames, mafFile, usePackages) } #rank normalize mutations diff --git a/parseMaf.R b/parseMaf.R index bc91013..3ba795f 100644 --- a/parseMaf.R +++ b/parseMaf.R @@ -7,31 +7,62 @@ # 2. some gene names are poorly annotated #This simple parser does not address these issues -parseMaf = function(protNames, mafFile, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) { - ids = c() - tCount = c() - for(line in readLines(mafFile)){ - #find the protein name - split = strsplit(line, "\t")[[1]] - protInd = match(split[1], protNames) - - #check if protein name is in list - if(is.na(protInd)){next} - - #check mutation type - if(!any(sapply(mutTypes, function(x) grepl(x,line)))){next} - - #find sample id and add to list - id = split[grepl("TCGA", split)][1] - idInd = match(id, ids) - if(is.na(idInd)){ - ids = c(ids, id) - idInd = length(ids) - tCount = rbind(tCount, rep(0, length(protNames))) - } - - #add to count for protein/sample pair - tCount[idInd, protInd] = tCount[idInd, protInd]+1 - } - tCount +parseMaf = function(protNames, mafFile, usePackages, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) { + + tryCatch({ + + if(!usePackages){stop("usePackages")} + + #Read maf file skipping comment lines + mut = fread(mafFile, skip="Hugo_Symbol", header=TRUE, fill = TRUE) + + #Trim proteins and mutation types + mut = mut[mut$Hugo_Symbol %in% protNames & mut$Variant_Classification %in% mutTypes,] + + #If ids are TCGA barcodes, trim them for uniqueness + if(grepl("TCGA", mut$Tumor_Sample_Barcode[1])){ + mut$Tumor_Sample_Barcode = sapply(mut$Tumor_Sample_Barcode, function(x) paste(strsplit(x, "-")[[1]][1:4], collapse="-")) + } + ids = unique(mut$Tumor_Sample_Barcode) + + #Count mutations per patients per gene + tCount = count(mut, vars=c("Hugo_Symbol","Tumor_Sample_Barcode")) + + #Convert counts to matrix + tCount = spMatrix(length(ids), length(protNames), + match(tCount$Tumor_Sample_Barcode, ids), match(tCount$Hugo_Symbol, protNames), + tCount$freq) + + tCount + + }, error = function(err) { + print("Switching to slow read, either flag usePackages is set to FALSE or could not find one of the following columns: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode") + ids = c() + tCount = c() + for(line in readLines(mafFile)){ + #find the protein name + split = strsplit(line, "\t")[[1]] + protInd = match(split[1], protNames) + + #check if protein name is in list + if(is.na(protInd)){next} + + #check mutation type + if(!any(sapply(mutTypes, function(x) grepl(x,line)))){next} + + #find sample id and add to list + id = split[grepl("TCGA", split)][1] + idInd = match(id, ids) + if(is.na(idInd)){ + ids = c(ids, id) + idInd = length(ids) + tCount = rbind(tCount, rep(0, length(protNames))) + } + + #add to count for protein/sample pair + tCount[idInd, protInd] = tCount[idInd, protInd]+1 + } + return(tCount) + }) + }