Skip to content

Commit

Permalink
fread update
Browse files Browse the repository at this point in the history
Updated with option to use R packages to speed up file read time
  • Loading branch information
PFPrzytycki committed Sep 5, 2017
1 parent 12be873 commit 2104cef
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 39 deletions.
39 changes: 27 additions & 12 deletions DifferentialMutationAnalysis.R
Expand Up @@ -23,6 +23,9 @@
#
# DifferentialMutationAnalysis("Data/BRCA_sample.maf", p=0)
#
# If you have the following three R packages: "data.table", "plyr", and
# "Matrix" you can set the flag usePackages to TRUE to significantly decrease
# file read time
###############################################################################

#helper function to parse MAF file
Expand All @@ -47,13 +50,13 @@ bins = function(v, p=100) {

#compute unidirectional EMD between mutationas and variations
uEMD = function(tBins, nBins, p=100) {
sum = 0
move = 0
for(i in p:2){
move = move+tBins[i]-nBins[i]
sum = sum+max(move,0)
}
sum
sum = 0
move = 0
for(i in p:2){
move = move+tBins[i]-nBins[i]
sum = sum+max(move,0)
}
sum
}

#generate random uEMDs to compute FDRs
Expand All @@ -80,23 +83,35 @@ computeQ = function(FDRs, uEMDscores) {
DifferentialMutationAnalysis = function(mafFile, geneType="all", p=5,
outDir = "Output/",
protFile = "Data/protNames.txt",
natBinFile = "Data/natDistBinned.txt") {
natBinFile = "Data/natDistBinned.txt",
usePackages = FALSE) {

if(usePackages){
library("data.table", quietly=TRUE)
library("plyr", quietly=TRUE)
library("Matrix", quietly=TRUE)
}

#A list of protein names
protNames = read.table(protFile, stringsAsFactors=FALSE)$V1

#load ranked binned natural variation count data
nRankBinned = read.table(natBinFile)
if(!usePackages){
nRankBinned = read.table(natBinFile)
}
else{
nRankBinned = fread(natBinFile, data.table=FALSE)
}

#determine if we want to find all cancer genes or just oncogenes or TSGs
if(geneType=="onco"){
tCount = parseMaf(protNames, mafFile, "Missense_Mutation")
tCount = parseMaf(protNames, mafFile, usePackages, "Missense_Mutation")
}
if(geneType=="TSG"){
tCount = parseMaf(protNames, mafFile, "Nonsense_Mutation")
tCount = parseMaf(protNames, mafFile, usePackages, "Nonsense_Mutation")
}
else{
tCount = parseMaf(protNames, mafFile)
tCount = parseMaf(protNames, mafFile, usePackages)
}

#rank normalize mutations
Expand Down
85 changes: 58 additions & 27 deletions parseMaf.R
Expand Up @@ -7,31 +7,62 @@
# 2. some gene names are poorly annotated
#This simple parser does not address these issues

parseMaf = function(protNames, mafFile, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) {
ids = c()
tCount = c()
for(line in readLines(mafFile)){
#find the protein name
split = strsplit(line, "\t")[[1]]
protInd = match(split[1], protNames)

#check if protein name is in list
if(is.na(protInd)){next}

#check mutation type
if(!any(sapply(mutTypes, function(x) grepl(x,line)))){next}

#find sample id and add to list
id = split[grepl("TCGA", split)][1]
idInd = match(id, ids)
if(is.na(idInd)){
ids = c(ids, id)
idInd = length(ids)
tCount = rbind(tCount, rep(0, length(protNames)))
}

#add to count for protein/sample pair
tCount[idInd, protInd] = tCount[idInd, protInd]+1
}
tCount
parseMaf = function(protNames, mafFile, usePackages, mutTypes=c("Missense_Mutation", "Nonsense_Mutation")) {

tryCatch({

if(!usePackages){stop("usePackages")}

#Read maf file skipping comment lines
mut = fread(mafFile, skip="Hugo_Symbol", header=TRUE, fill = TRUE)

#Trim proteins and mutation types
mut = mut[mut$Hugo_Symbol %in% protNames & mut$Variant_Classification %in% mutTypes,]

#If ids are TCGA barcodes, trim them for uniqueness
if(grepl("TCGA", mut$Tumor_Sample_Barcode[1])){
mut$Tumor_Sample_Barcode = sapply(mut$Tumor_Sample_Barcode, function(x) paste(strsplit(x, "-")[[1]][1:4], collapse="-"))
}
ids = unique(mut$Tumor_Sample_Barcode)

#Count mutations per patients per gene
tCount = count(mut, vars=c("Hugo_Symbol","Tumor_Sample_Barcode"))

#Convert counts to matrix
tCount = spMatrix(length(ids), length(protNames),
match(tCount$Tumor_Sample_Barcode, ids), match(tCount$Hugo_Symbol, protNames),
tCount$freq)

tCount

}, error = function(err) {
print("Switching to slow read, either flag usePackages is set to FALSE or could not find one of the following columns: Hugo_Symbol, Variant_Classification, Tumor_Sample_Barcode")
ids = c()
tCount = c()
for(line in readLines(mafFile)){
#find the protein name
split = strsplit(line, "\t")[[1]]
protInd = match(split[1], protNames)

#check if protein name is in list
if(is.na(protInd)){next}

#check mutation type
if(!any(sapply(mutTypes, function(x) grepl(x,line)))){next}

#find sample id and add to list
id = split[grepl("TCGA", split)][1]
idInd = match(id, ids)
if(is.na(idInd)){
ids = c(ids, id)
idInd = length(ids)
tCount = rbind(tCount, rep(0, length(protNames)))
}

#add to count for protein/sample pair
tCount[idInd, protInd] = tCount[idInd, protInd]+1
}
return(tCount)
})

}

0 comments on commit 2104cef

Please sign in to comment.