In [2]:
library(tidyverse)
library(dplyr)
library(rpart)
source('functions.R')
library(rpart)

# 05c Trees
I begin by training the model on the entire dataset

In [3]:
# Load the wrangled training and holdout data sets from their .rds files.
train <- readRDS("04a-wrangledTrain.rds")
holdout <- readRDS("04b-wrangledHoldout.rds")

In [4]:
# Fit a regression tree for log(price), using every other column of `datafr`
# as a candidate predictor, and hand back the fitted rpart model.
myTrain <- function(datafr) {
    rpart(log(price) ~ ., data = datafr)
}

In [5]:
#' @description
#' Find the group position for each element of a vector.
#'
#' Each element of `values` is assigned the index (position in `groupValues`)
#' of the group value it matches within `tolerance`. If an element lies within
#' `tolerance` of more than one group value, the nearest group value wins.
#'
#' @param values numeric vector whose elements should each equal (up to
#'   `tolerance`) one of the entries of `groupValues`
#' @param groupValues numeric vector of group values; must be unique
#' @param tolerance absolute tolerance used when matching a value to a group
#'   value (strict inequality: |value - groupValue| < tolerance)
#'
#' @return numeric vector of the same length as `values` holding, for each
#'   element, the position of its matching group in `groupValues`; 0 marks an
#'   element that matched no group (a message is printed in that case).
#'   If `groupValues` contains duplicates, a message is printed and the single
#'   value 0 is returned.
FindUniquePos <- function(values, groupValues, tolerance = 1.e-5) {
    ngroup <- length(groupValues) # number of groups (terminal nodes)
    if (length(unique(groupValues)) < ngroup) {
        cat("Won't work: non-unique group values\n")
        return(0)
    }
    npred <- length(values) # number of cases to bin into a group label
    group <- rep(0, npred) # initialize as group 0 (= unmatched)
    bestDist <- rep(Inf, npred) # distance to the best group found so far
    # seq_len() keeps the loop empty when there are no groups (1:0 would not)
    for (ig in seq_len(ngroup)) {
        dist <- abs(values - groupValues[ig])
        # match within tolerance, and only overwrite an earlier match when
        # this group value is strictly closer; NA distances never match
        closer <- !is.na(dist) & dist < tolerance & dist < bestDist
        group[closer] <- ig # group label according to position in groupValues
        bestDist[closer] <- dist[closer]
    }
    if (any(group == 0)) cat("Warning: some values not matched to groupValues\n")
    return(group)
}

The standard k-fold function needs to be modified to handle the interval scores computed from the prediction intervals that the trees generate.

In [7]:
#' K-fold cross-validation of regression-tree prediction intervals.
#'
#' For each fold, fits `rpart(log(price) ~ .)` on the remaining rows, derives
#' 50% and 80% prediction intervals for the held-out rows from the quantiles of
#' log(price) within each terminal node of the fitted tree, back-transforms
#' them with exp(), scores them with intervalScore() and prints the scores.
#'
#' Relies on FindUniquePos() (defined in this file) and on intervalScore(),
#' which is not defined here (presumably it comes from functions.R); the only
#' thing this code uses from its result is the `summary` element.
#'
#' @param Kfold number of folds
#' @param seed random seed set before the rows are permuted
#' @param datafr data frame with a `price` column (modelled on the log scale)
#'   and the predictor columns
#'
#' @return invisibly, a list with one score matrix per fold (rows: 50% and 80%
#'   intervals; columns: "level", "avgleng", "IS", "cover"). Each matrix is
#'   also printed as it is computed.
kFoldTree <- function(Kfold, seed, datafr) {
  set.seed(seed)
  n <- nrow(datafr)
  iperm <- sample(n) # random row order; kept local instead of a global
  nhold <- round(n / Kfold)
  # With too many folds the last fold would start past row n and `ilow:ihigh`
  # would silently count backwards, so fail early instead.
  if (nhold < 1 || (Kfold - 1) * nhold >= n) {
    stop("Kfold is too large for the number of rows in datafr", call. = FALSE)
  }

  foldScores <- vector("list", Kfold)
  for (k in seq_len(Kfold)) {
    ilow <- (k - 1) * nhold + 1
    ihigh <- if (k == Kfold) n else k * nhold # last fold takes the remainder
    ifold <- iperm[ilow:ihigh]
    holdo <- datafr[ifold, ]
    train <- datafr[-ifold, ]

    RegTree <- rpart(log(price) ~ ., data = train)

    # Summaries of log(price) per terminal node; every tapply() result is
    # ordered the same way, so one position index works for all of them.
    logPrice <- log(train$price)
    meanByTNode <- tapply(logPrice, RegTree$where, mean)
    Q10ByTNode <- tapply(logPrice, RegTree$where, quantile, probs = 0.10)
    Q25ByTNode <- tapply(logPrice, RegTree$where, quantile, probs = 0.25)
    Q75ByTNode <- tapply(logPrice, RegTree$where, quantile, probs = 0.75)
    Q90ByTNode <- tapply(logPrice, RegTree$where, quantile, probs = 0.90)

    # The predicted value is compared with the per-node means to recover
    # which terminal node each held-out row was assigned to.
    meanpredRegTree <- predict(RegTree, newdata = holdo, type = "vector")
    TNodeGroup <- FindUniquePos(meanpredRegTree, meanByTNode)
    # A 0 (or a short result) would make the quantile lookups below shorter
    # than the prediction vector and misalign the interval columns.
    if (length(TNodeGroup) != length(meanpredRegTree) || any(TNodeGroup == 0)) {
      stop("could not match every held-out prediction to a terminal node",
           call. = FALSE)
    }

    Q25predRegTree <- Q25ByTNode[TNodeGroup]
    Q75predRegTree <- Q75ByTNode[TNodeGroup]
    pred50IntRegTree <- exp(cbind(meanpredRegTree, Q25predRegTree, Q75predRegTree))

    Q10predRegTree <- Q10ByTNode[TNodeGroup]
    Q90predRegTree <- Q90ByTNode[TNodeGroup]
    pred80IntRegTree <- exp(cbind(meanpredRegTree, Q10predRegTree, Q90predRegTree))

    ISTree50 <- intervalScore(pred50IntRegTree, holdo$price, 0.5)
    ISTree80 <- intervalScore(pred80IntRegTree, holdo$price, 0.8)
    outTree <- rbind(ISTree50$summary, ISTree80$summary)
    colnames(outTree) <- c("level", "avgleng", "IS", "cover")
    print(outTree)

    foldScores[[k]] <- outTree
  }
  invisible(foldScores)
}

In [8]:
# 3-fold cross-validation of the tree intervals on the training data.
kFoldTree(Kfold = 3, seed = 123, datafr = train)

     level   avgleng       IS     cover
[1,]   0.5  9477.828 21119.11 0.5004229
[2,]   0.8 20321.567 31628.91 0.7993725
     level   avgleng       IS     cover
[1,]   0.5  9795.193 21610.06 0.4985774
[2,]   0.8 20545.379 32405.93 0.8009874
     level   avgleng       IS     cover
[1,]   0.5  9473.537 21142.21 0.4991541
[2,]   0.8 19925.459 31877.84 0.7981760


In [9]:
# Replace the training data with the output of feature_selection().
train <- feature_selection(train)

In [10]:
# Same 3-fold cross-validation, run on the current `train` data frame.
kFoldTree(Kfold = 3, seed = 123, datafr = train)

     level   avgleng       IS     cover
[1,]   0.5  9381.189 21160.99 0.5011150
[2,]   0.8 19972.221 32070.82 0.8004798
     level   avgleng       IS     cover
[1,]   0.5  9180.627 21001.34 0.4973009
[2,]   0.8 19872.041 31655.98 0.8018025
     level   avgleng       IS     cover
[1,]   0.5  9461.281 21262.50 0.5000308
[2,]   0.8 20003.349 32159.89 0.7985297


In [20]:
# Fit the tree on the full training set and score its 50% and 80% prediction
# intervals on the holdout set.
holdo <- holdout
train <- readRDS("04a-wrangledTrain.rds")
RegTree <- rpart(log(price) ~ ., data = train)

# Summaries of log(price) per terminal node; all tapply() results share the
# same node ordering, so one position index works for each of them.
meanByTNode <- tapply(log(train$price), RegTree$where, mean)
Q25ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.25)
Q50ByTNode <- tapply(log(train$price), RegTree$where, median) # not used below
Q75ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.75)
Q10ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.10)
Q90ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.90)

# Recover each holdout row's terminal node by comparing its predicted value
# with the per-node means.
meanpredRegTree <- predict(RegTree, newdata = holdo, type = "vector")
TNodeGroup <- FindUniquePos(meanpredRegTree, meanByTNode)
# A 0 (or a short result) would misalign the interval columns built below.
stopifnot(length(TNodeGroup) == length(meanpredRegTree), all(TNodeGroup > 0))

Q25predRegTree <- Q25ByTNode[TNodeGroup]
Q75predRegTree <- Q75ByTNode[TNodeGroup]
pred50IntRegTree <- exp(cbind(meanpredRegTree, Q25predRegTree, Q75predRegTree))

Q10predRegTree <- Q10ByTNode[TNodeGroup]
Q90predRegTree <- Q90ByTNode[TNodeGroup]
pred80IntRegTree <- exp(cbind(meanpredRegTree, Q10predRegTree, Q90predRegTree))

ISTree50 <- intervalScore(pred50IntRegTree, holdo$price, 0.5)
ISTree80 <- intervalScore(pred80IntRegTree, holdo$price, 0.8)
outTree <- rbind(ISTree50$summary, ISTree80$summary)
colnames(outTree) <- c("level", "avgleng", "IS", "cover")
print(outTree)

     level   avgleng       IS     cover
[1,]   0.5  9386.014 21003.94 0.4994053
[2,]   0.8 19889.704 31773.74 0.8009884


In [16]:
# Refit the tree after feature_selection() and score its 50% and 80%
# prediction intervals on `holdo`.
# NOTE(review): `holdo` is not defined in this cell; it must already exist in
# the session — confirm the intended cell order.
# NOTE(review): `holdo` is not passed through feature_selection(); that is only
# safe if feature_selection() merely drops columns — confirm.
train <- feature_selection(train)
RegTree <- rpart(log(price) ~ ., data = train)

# Summaries of log(price) per terminal node; all tapply() results share the
# same node ordering, so one position index works for each of them.
meanByTNode <- tapply(log(train$price), RegTree$where, mean)
Q25ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.25)
Q50ByTNode <- tapply(log(train$price), RegTree$where, median) # not used below
Q75ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.75)
Q10ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.10)
Q90ByTNode <- tapply(log(train$price), RegTree$where, quantile, probs = 0.90)

# Recover each holdout row's terminal node by comparing its predicted value
# with the per-node means.
meanpredRegTree <- predict(RegTree, newdata = holdo, type = "vector")
TNodeGroup <- FindUniquePos(meanpredRegTree, meanByTNode)
# A 0 (or a short result) would misalign the interval columns built below.
stopifnot(length(TNodeGroup) == length(meanpredRegTree), all(TNodeGroup > 0))

Q25predRegTree <- Q25ByTNode[TNodeGroup]
Q75predRegTree <- Q75ByTNode[TNodeGroup]
pred50IntRegTree <- exp(cbind(meanpredRegTree, Q25predRegTree, Q75predRegTree))

Q10predRegTree <- Q10ByTNode[TNodeGroup]
Q90predRegTree <- Q90ByTNode[TNodeGroup]
pred80IntRegTree <- exp(cbind(meanpredRegTree, Q10predRegTree, Q90predRegTree))

ISTree50 <- intervalScore(pred50IntRegTree, holdo$price, 0.5)
ISTree80 <- intervalScore(pred80IntRegTree, holdo$price, 0.8)
outTree <- rbind(ISTree50$summary, ISTree80$summary)
colnames(outTree) <- c("level", "avgleng", "IS", "cover")
print(outTree)

     level   avgleng       IS     cover
[1,]   0.5  9386.014 21003.94 0.4994053
[2,]   0.8 19889.704 31773.74 0.8009884


In [18]:
# Reload the wrangled training data from disk, replacing the current `train`.
train <- readRDS("04a-wrangledTrain.rds")

In [19]:
# Display the training data frame for a visual check of its columns.
train

Unnamed: 0_level_0,price,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,paint_color,state,type,countryOrigin,isLuxury,age
Unnamed: 0_level_1,<dbl>,<fct>,<fct>,<fct>,<fct>,<int>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<fct>,<dbl>,<dbl>
1,33590,gmc,good,8 cylinders,gas,57923,clean,other,,,white,al,pickup,USA,0,8
2,22590,chevrolet,good,8 cylinders,gas,71229,clean,other,,,blue,al,pickup,USA,0,12
3,39590,chevrolet,good,8 cylinders,gas,19160,clean,other,,,red,al,pickup,USA,0,2
4,30990,toyota,good,8 cylinders,gas,41124,clean,other,,,red,al,pickup,Japan,0,5
5,15000,ford,excellent,6 cylinders,gas,128000,clean,automatic,rwd,full-size,black,al,truck,USA,0,9
6,27990,gmc,good,8 cylinders,gas,68696,clean,other,4wd,,black,al,pickup,USA,0,10
7,35000,toyota,excellent,6 cylinders,gas,43000,clean,automatic,4wd,,grey,al,truck,Japan,0,3
8,29990,chevrolet,good,6 cylinders,gas,17302,clean,other,4wd,,red,al,pickup,USA,0,6
9,38590,chevrolet,good,8 cylinders,gas,30237,clean,other,rwd,,red,al,other,USA,0,11
10,32990,jeep,good,6 cylinders,gas,30041,clean,other,4wd,,silver,al,other,USA,0,5
