In [1]:
# One-time data preparation, kept commented out for reference: it sourced
# CreateGrantData.R and cached the resulting objects in grant_Data.Rdata.
# Sys.setlocale("LC_TIME", "English")
# library(parallel)
# setDefaultCluster(makeCluster(4))
# source('CreateGrantData.R')
# save(training,testing,pre2008,fullSet,reducedSet,file = "grant_Data.Rdata")
# Restore the cached objects into the workspace (presumably training, testing,
# pre2008, fullSet and reducedSet, per the save() call above -- confirm the
# file on disk was produced by that call).
load("grant_Data.Rdata")

In [2]:
library(caret)
library(doMC)
registerDoMC(4)
library(plyr)
library(reshape2)

Loading required package: lattice
Loading required package: ggplot2
Loading required package: foreach
Loading required package: iterators
Loading required package: parallel


In [3]:
## Build the data partitions used to compare two resampling strategies.

## Rows of `training` selected by `pre2008` form one pool; all remaining
## training rows are stacked with `testing` to form the 2008 pool.
pre2008Data <- training[pre2008, ]
year2008Data <- rbind(training[-pre2008, ], testing)

## Draw a seeded 25% sample of the 2008 pool, balanced on the outcome class.
set.seed(552)
test2008 <- createDataPartition(year2008Data$Class, p = 0.25)[[1]]

## The sampled 2008 rows are set aside as the holdout; the pre-2008 rows plus
## the unsampled 2008 rows (in that order) make up the modeling data.
holdout2008 <- year2008Data[test2008, ]
allData <- rbind(pre2008Data, year2008Data[-test2008, ])

In [13]:
# Remove the zero-variance columns from the predictor list before scaling.
ZeroVarCol <- c("Sponsor260D", "Sponsor281A", "Sponsor315C")
# Keep only the predictor names that have no match among the dropped columns.
fullSet <- fullSet[is.na(match(fullSet, ZeroVarCol))]

In [16]:
# Near-zero-variance diagnostics for the full predictor set on the pre-2008
# data. With saveMetrics = TRUE the result is a per-predictor table
# (freqRatio, percentUnique, zeroVar, nzv) rather than column positions.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
nzv_info <- nearZeroVar(pre2008Data[, fullSet], saveMetrics = TRUE)
head(nzv_info)

Unnamed: 0,freqRatio,percentUnique,zeroVar,nzv
NumCI,3.232558,0.18091361,False,False
NumDR,28.462222,0.0452284,False,True
NumECI,4.470483,0.18091361,False,False
NumEA,1657.25,0.03015227,False,True
NumHV,330.6,0.0452284,False,True
NumPS,16.21039,0.0452284,False,False


In [None]:
## Use a common tuning grid for both approaches: 4 sigma values crossed with
## 12 cost values (2^-3 through 2^8).
svmrGrid <- expand.grid(sigma = c(.00007, .00009, .0001, .0002),
                        C = 2^(-3:8))

## Evaluate the model using overall 5-fold cross-validation (number = 5).
## twoClassSummary together with classProbs = TRUE makes the ROC metric
## available for model selection.
## NOTE(review): search = "random" is set here, but train() below is given an
## explicit tuneGrid, so the random search presumably does not generate the
## candidates -- confirm against the caret documentation before relying on it.
ctrl0 <- trainControl(method = "cv",
                      number = 5,
                      search = "random",
                      summaryFunction = twoClassSummary,
                      classProbs = TRUE)

## Seeded so the fold assignment is reproducible.
set.seed(914)
svmFit0 <- train(pre2008Data[, reducedSet], pre2008Data$Class,
                 method = "svmRadial",
                 tuneGrid = svmrGrid,
                 ## Full argument name; `preProc` only worked through
                 ## partial argument matching.
                 preProcess = c("center", "scale"),
                 metric = "ROC",
                 trControl = ctrl0)
svmFit0

In [None]:
### Now fit the single 2008 test set
## `index` lists the rows used for fitting in the one resampling iteration:
## the first nrow(pre2008Data) rows of allData. The remaining rows of allData
## are the ones held out for evaluation. seq_len() is used instead of 1:n so
## an empty pre2008Data cannot produce the index c(1, 0).
ctrl00 <- trainControl(method = "LGOCV",
                       summaryFunction = twoClassSummary,
                       classProbs = TRUE,
                       index = list(TestSet = seq_len(nrow(pre2008Data))))


set.seed(914)
svmFit00 <- train(allData[, reducedSet], allData$Class,
                  method = "svmRadial",
                  tuneGrid = svmrGrid,
                  ## Full argument name; `preProc` only worked through
                  ## partial argument matching.
                  preProcess = c("center", "scale"),
                  metric = "ROC",
                  trControl = ctrl00)
svmFit00

In [None]:
## Combine the two sets of results and plot

## For each fit, keep only the resampling results at the selected sigma so the
## ROC profile can be drawn against the cost parameter C alone. Both sigma
## values come from the same tuning grid, so the exact comparison is safe.
grid0 <- subset(svmFit0$results, sigma == svmFit0$bestTune$sigma)
## Build the legend label from the fold count stored in the fit, so it always
## reflects the resampling settings that were actually used.
grid0$Model <- paste0(svmFit0$control$number, "-Fold Cross-Validation")

grid00 <- subset(svmFit00$results, sigma == svmFit00$bestTune$sigma)
grid00$Model <- "Single 2008 Test Set"

plotData <- rbind(grid00, grid0)

## Drop grid points for which no ROC value was obtained.
plotData <- plotData[!is.na(plotData$ROC), ]
xyplot(ROC ~ C, data = plotData,
       groups = Model,
       type = c("g", "o"),
       scales = list(x = list(log = 2)),
       auto.key = list(columns = 1))

```r
### Section 12.1 Case Study: Predicting Successful Grant Applications

## Load the prepared grant data. The code below uses `training`, `testing`,
## `pre2008` and `fullSet`, so this file is expected to provide them.
load("grantData.RData")

library(caret)
library(doMC)
registerDoMC(12)
library(plyr)
library(reshape2)

## Two ways of splitting and resampling the data are compared below, using a
## support vector machine fit on the full set of predictors.

## Rows of `training` selected by `pre2008` form one pool; all remaining
## training rows are stacked with `testing` to form the 2008 pool.
pre2008Data <- training[pre2008, ]
year2008Data <- rbind(training[-pre2008, ], testing)

## Draw a seeded 25% sample of the 2008 pool, balanced on the outcome class.
set.seed(552)
test2008 <- createDataPartition(year2008Data$Class, p = 0.25)[[1]]

## The sampled 2008 rows are set aside as the holdout; the pre-2008 rows plus
## the unsampled 2008 rows (in that order) make up the modeling data.
holdout2008 <- year2008Data[test2008, ]
allData <- rbind(pre2008Data, year2008Data[-test2008, ])

## Use a common tuning grid for both approaches: 4 sigma values crossed with
## 12 cost values (2^-3 through 2^8).
svmrGrid <- expand.grid(sigma = c(.00007, .00009, .0001, .0002),
                        C = 2^(-3:8))

## Evaluate the model using overall 10-fold cross-validation (method = "cv"
## with caret's default number of folds). twoClassSummary together with
## classProbs = TRUE makes the ROC metric available for model selection.
ctrl0 <- trainControl(method = "cv",
                      summaryFunction = twoClassSummary,
                      classProbs = TRUE)

## Seeded so the fold assignment is reproducible.
set.seed(477)
svmFit0 <- train(pre2008Data[, fullSet], pre2008Data$Class,
                 method = "svmRadial",
                 tuneGrid = svmrGrid,
                 ## Full argument name; `preProc` only worked through
                 ## partial argument matching.
                 preProcess = c("center", "scale"),
                 metric = "ROC",
                 trControl = ctrl0)
svmFit0

### Now fit the single 2008 test set
## `index` lists the rows used for fitting in the one resampling iteration:
## the first nrow(pre2008Data) rows of allData. The remaining rows of allData
## are the ones held out for evaluation. seq_len() is used instead of 1:n so
## an empty pre2008Data cannot produce the index c(1, 0).
ctrl00 <- trainControl(method = "LGOCV",
                       summaryFunction = twoClassSummary,
                       classProbs = TRUE,
                       index = list(TestSet = seq_len(nrow(pre2008Data))))


set.seed(476)
svmFit00 <- train(allData[, fullSet], allData$Class,
                  method = "svmRadial",
                  tuneGrid = svmrGrid,
                  ## Full argument name; `preProc` only worked through
                  ## partial argument matching.
                  preProcess = c("center", "scale"),
                  metric = "ROC",
                  trControl = ctrl00)
svmFit00

## Merge the tuning profiles of both fits and compare them in a single plot

## For each fit, keep only the result rows evaluated at its selected sigma and
## tag them with the name of the resampling scheme.
grid00 <- svmFit00$results[
  which(svmFit00$results$sigma == svmFit00$bestTune$sigma),
]
grid00$Model <- "Single 2008 Test Set"

grid0 <- svmFit0$results[
  which(svmFit0$results$sigma == svmFit0$bestTune$sigma),
]
grid0$Model <- "10-Fold Cross-Validation"

## Stack the two profiles (single test set first) and drop rows without a ROC
## value.
plotData <- subset(rbind(grid00, grid0), !is.na(ROC))

## ROC against cost on a log2 x-axis, one line per resampling scheme.
xyplot(ROC ~ C,
       data = plotData,
       groups = Model,
       type = c("g", "o"),
       scales = list(x = list(log = 2)),
       auto.key = list(columns = 1))

```