fread update

Updated with option to use R packages to speed up file read time
Singh-Lab · Sep 5, 2017 · 2104cef · 2104cef
1 parent 12be873
commit 2104cef
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 39 deletions.
diff --git a/DifferentialMutationAnalysis.R b/DifferentialMutationAnalysis.R
@@ -23,6 +23,9 @@
 # 
 # DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0)
 #
+# If you have the following three R packages: "data.table", "plyr", and
+# "Matrix" you can set the flag usePackages to TRUE to significantly decrease
+# file read time
 ###############################################################################
 
 #helper function to parse MAF file
@@ -47,13 +50,13 @@ bins = function(v, p=100) {
 
 #compute unidirectional EMD between mutationas and variations
 uEMD = function(tBins, nBins, p=100) {
-    sum = 0
-    move = 0
-    for(i in p:2){
-        move = move+tBins[i]-nBins[i]
-        sum = sum+max(move,0)
-    }
-    sum
+  sum = 0
+  move = 0
+  for(i in p:2){
+    move = move+tBins[i]-nBins[i]
+    sum = sum+max(move,0)
+  }
+  sum
 }
 
 #generate random uEMDs to compute FDRs
@@ -80,23 +83,35 @@ computeQ = function(FDRs, uEMDscores) {
 DifferentialMutationAnalysis = function(mafFile, geneType="all", p=5,
                                         outDir = "Output/", 
                                         protFile = "Data/protNames.txt", 
-                                        natBinFile = "Data/natDistBinned.txt") {
+                                        natBinFile = "Data/natDistBinned.txt",
+                                        usePackages = FALSE) {
+
+  if(usePackages){
+    library("data.table", quietly=TRUE)
+    library("plyr", quietly=TRUE)
+    library("Matrix", quietly=TRUE)
+  }
 
   #A list of protein names
   protNames = read.table(protFile, stringsAsFactors=FALSE)$V1
 
   #load ranked binned natural variation count data
-  nRankBinned = read.table(natBinFile)
+  if(!usePackages){
+    nRankBinned = read.table(natBinFile)
+  }
+  else{
+    nRankBinned = fread(natBinFile, data.table=FALSE)
+  }
 
   #determine if we want to find all cancer genes or just oncogenes or TSGs
   if(geneType=="onco"){
-    tCount = parseMaf(protNames, mafFile, "Missense_Mutation")
+    tCount = parseMaf(protNames, mafFile, usePackages, "Missense_Mutation")
   }
   if(geneType=="TSG"){
-    tCount = parseMaf(protNames, mafFile, "Nonsense_Mutation")
+    tCount = parseMaf(protNames, mafFile, usePackages, "Nonsense_Mutation")
   }
   else{
-    tCount = parseMaf(protNames, mafFile)
+    tCount = parseMaf(protNames, mafFile, usePackages)
   }
 
   #rank normalize mutations

diff --git a/parseMaf.R b/parseMaf.R
@@ -7,31 +7,62 @@
 # 2. some gene names are poorly annotated
 #This simple parser does not address these issues
 
-parseMaf = function(protNames, mafFile, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) {
-	ids = c()
-	tCount = c()
-	for(line in readLines(mafFile)){
-	    #find the protein name
-	    split = strsplit(line, "\t")[[1]]
-	    protInd = match(split[1], protNames) 
-
-	     #check if protein name is in list
-	    if(is.na(protInd)){next}
-
-	    #check mutation type
-	    if(!any(sapply(mutTypes, function(x) grepl(x,line)))){next} 
-
-	    #find sample id and add to list
-	    id = split[grepl("TCGA", split)][1] 
-	    idInd = match(id, ids)
-	    if(is.na(idInd)){
-	        ids = c(ids, id)
-	        idInd = length(ids)
-	        tCount = rbind(tCount, rep(0, length(protNames)))
-	    }
-
-	    #add to count for protein/sample pair
-	    tCount[idInd, protInd] = tCount[idInd, protInd]+1
-	}
-	tCount
+parseMaf = function(protNames, mafFile, usePackages, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) {
+
+  tryCatch({
+
+    if(!usePackages){stop("usePackages")}
+
+    #Read maf file skipping comment lines
+    mut = fread(mafFile, skip="Hugo_Symbol", header=TRUE, fill = TRUE)
+
+    #Trim proteins and mutation types
+    mut = mut[mut$Hugo_Symbol %in% protNames & mut$Variant_Classification %in% mutTypes,]
+
+    #If ids are TCGA barcodes, trim them for uniqueness
+    if(grepl("TCGA", mut$Tumor_Sample_Barcode[1])){
+      mut$Tumor_Sample_Barcode = sapply(mut$Tumor_Sample_Barcode, function(x) paste(strsplit(x, "-")[[1]][1:4], collapse="-"))
+    }
+    ids = unique(mut$Tumor_Sample_Barcode)
+
+    #Count mutations per patients per gene
+    tCount = count(mut, vars=c("Hugo_Symbol","Tumor_Sample_Barcode"))
+
+    #Convert counts to matrix
+    tCount =  spMatrix(length(ids), length(protNames),
+                       match(tCount$Tumor_Sample_Barcode, ids), match(tCount$Hugo_Symbol, protNames),
+                       tCount$freq)
+
+    tCount
+
+  }, error = function(err) {
+    print("Switching to slow read, either flag usePackages is set to FALSE or could not find one of the following columns: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode")
+    ids = c()
+    tCount = c()
+    for(line in readLines(mafFile)){
+      #find the protein name
+      split = strsplit(line, "\t")[[1]]
+      protInd = match(split[1], protNames) 
+
+       #check if protein name is in list
+      if(is.na(protInd)){next}
+
+      #check mutation type
+      if(!any(sapply(mutTypes, function(x) grepl(x,line)))){next} 
+
+      #find sample id and add to list
+      id = split[grepl("TCGA", split)][1] 
+      idInd = match(id, ids)
+      if(is.na(idInd)){
+        ids = c(ids, id)
+        idInd = length(ids)
+        tCount = rbind(tCount, rep(0, length(protNames)))
+      }
+
+      #add to count for protein/sample pair
+      tCount[idInd, protInd] = tCount[idInd, protInd]+1
+    }
+    return(tCount)
+  })
+
 }