# Holdout Method

In [3]:
# install.packages("caret")
library(caret)

Loading required package: ggplot2

Loading required package: lattice

“running command 'timedatectl' had status 1”


In [5]:
# importing credit dataset
credit<- read.csv("Credit_Data.csv", stringsAsFactors = T)
str(credit)

'data.frame':	1000 obs. of  21 variables:
 $ default                   : int  0 1 0 0 1 0 0 0 0 1 ...
 $ account_check_status      : Factor w/ 4 levels "< 0 DM",">= 200 DM / salary assignments for at least 1 year",..: 1 3 4 1 1 4 4 3 4 3 ...
 $ duration_in_month         : int  6 48 12 42 24 36 24 36 12 30 ...
 $ credit_history            : Factor w/ 5 levels "all credits at this bank paid back duly",..: 2 4 2 4 3 4 4 4 4 2 ...
 $ purpose                   : Factor w/ 10 levels "(vacation - does not exist?)",..: 5 5 1 8 3 1 8 4 5 3 ...
 $ credit_amount             : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
 $ savings                   : Factor w/ 5 levels ".. >= 1000 DM ",..: 5 2 2 2 2 5 4 2 1 2 ...
 $ present_emp_since         : Factor w/ 5 levels ".. >= 7 years",..: 1 3 4 4 3 3 1 3 4 5 ...
 $ installment_as_income_perc: int  4 2 2 2 3 2 3 2 2 4 ...
 $ personal_status_sex       : Factor w/ 4 levels "female : divorced/separated/married",..: 4 1 4 4 4 4 4 4 2 3 ...
 $ o

In [6]:
# clean the data
levels(credit$savings)<-make.names(levels(credit$savings))
levels(credit$account_check_status)<-make.names(levels(credit$account_check_status))
levels(credit$present_emp_since)<-make.names(levels(credit$present_emp_since))
levels(credit$personal_status_sex)<-make.names(levels(credit$personal_status_sex))
levels(credit$property)<-make.names(levels(credit$property))
levels(credit$credit_history)<-make.names(levels(credit$credit_history))
levels(credit$foreign_worker)<-make.names(levels(credit$foreign_worker))
levels(credit$housing)<-make.names(levels(credit$housing))
levels(credit$job)<-make.names(levels(credit$job))
levels(credit$other_debtors)<-make.names(levels(credit$other_debtors))
levels(credit$other_installment_plans)<-make.names(levels(credit$other_installment_plans))
levels(credit$purpose)<-make.names(levels(credit$purpose))
levels(credit$telephone)<-make.names(levels(credit$telephone))

In [7]:
# we divide the data into three partitions: train, validation, and test data
# randomizing the data
rand<-order(runif(1000))
credit_train<-credit[rand[1:500],]
credit_validate<- credit[rand[501:750],]
credit_test<- credit[rand[751:1000],]

In [10]:
prop.table(table(credit_train$default))
prop.table(table(credit_validate$default))
prop.table(table(credit_test$default))


    0     1 
0.696 0.304 


    0     1 
0.704 0.296 


    0     1 
0.704 0.296 

In [11]:
# Using createDataPartition() from the caret package to create 
# partition based on stratified sampling
in_train<- createDataPartition(credit$default, p=0.75, list = FALSE)
credit_train<- credit[in_train,]
credit_test<- credit[-in_train,]

In [12]:
prop.table(table(credit_train$default))
prop.table(table(credit_test$default))


        0         1 
0.7106667 0.2893333 


    0     1 
0.668 0.332 

# Cross validation

In [13]:
# creating cross-validation data using createFold() with 10 folds
fold<- createFolds(credit$default, k = 10)
str(fold)

List of 10
 $ Fold01: int [1:100] 3 11 13 16 45 51 55 71 91 103 ...
 $ Fold02: int [1:100] 9 28 48 69 100 101 104 123 129 135 ...
 $ Fold03: int [1:100] 14 15 19 26 31 32 40 44 49 52 ...
 $ Fold04: int [1:100] 4 12 17 18 33 38 50 57 62 76 ...
 $ Fold05: int [1:100] 1 5 8 20 35 36 37 39 42 53 ...
 $ Fold06: int [1:100] 6 21 61 83 89 115 116 117 127 128 ...
 $ Fold07: int [1:100] 7 22 23 34 43 54 65 75 80 82 ...
 $ Fold08: int [1:100] 2 24 25 29 41 46 68 74 90 106 ...
 $ Fold09: int [1:100] 10 27 47 58 60 66 79 86 99 121 ...
 $ Fold10: int [1:100] 30 56 63 72 77 81 87 88 97 107 ...


In [14]:
# assigning the sampled dataset in training and test data
credit01_train<- credit[fold$Fold01,]
credit01_test<- credit[-fold$Fold01,]

In [15]:
str(credit01_train)
str(credit01_test)

'data.frame':	100 obs. of  21 variables:
 $ default                   : int  0 1 0 1 1 0 1 0 0 0 ...
 $ account_check_status      : Factor w/ 4 levels "X..0.DM","X...200.DM...salary.assignments.for.at.least.1.year",..: 4 3 3 1 1 3 3 4 4 4 ...
 $ duration_in_month         : int  12 12 12 24 48 24 36 36 12 6 ...
 $ credit_history            : Factor w/ 5 levels "all.credits.at.this.bank.paid.back.duly",..: 2 4 4 4 2 3 3 4 2 3 ...
 $ purpose                   : Factor w/ 10 levels "X.vacation...does.not.exist..",..: 1 3 5 5 4 8 3 4 5 5 ...
 $ credit_amount             : int  2096 1295 1567 1282 6143 2333 2225 8133 618 932 ...
 $ savings                   : Factor w/ 5 levels "......1000.DM.",..: 2 2 2 3 2 5 2 2 2 2 ...
 $ present_emp_since         : Factor w/ 5 levels "......7.years",..: 4 2 3 3 1 2 1 3 1 3 ...
 $ installment_as_income_perc: int  2 3 1 4 4 4 4 1 4 3 ...
 $ personal_status_sex       : Factor w/ 4 levels "female...divorced.separated.married",..: 4 1 1 1 1 4 4 1 4 1 ...
 $ o

In [19]:
# Performing a full 10-fold CV automatically
# install.packages("C50")
# install.packages("irr") #for kappa()
library(C50)
library(irr)

Loading required package: lpSolve



In [20]:
# creating the 10 fold
set.seed(123)
folds<-createFolds(credit$default, k = 10)


In [21]:
credit$default<-factor(credit$default)

In [24]:
# Defining our own function to pass the lapply function
cv_results<- lapply(folds, function(x){
  credit_train <- credit[x,]
  credit_test <- credit[-x,]
  credit_model <- C5.0(default ~ ., data = credit_train)
  credit_pred <- predict(credit_model, credit_test)
  credit_actual <- credit_test$default
  kappa <- kappa2(data.frame(credit_actual, credit_pred))$value
  return(kappa)
})

In [25]:
# examining the kappa statistics
str(cv_results)

List of 10
 $ Fold01: num 0.127
 $ Fold02: num 0.0595
 $ Fold03: num 0.138
 $ Fold04: num 0.242
 $ Fold05: num 0.111
 $ Fold06: num 0.138
 $ Fold07: num 0.0678
 $ Fold08: num 0.228
 $ Fold09: num 0.0811
 $ Fold10: num 0.19


In [26]:
# calculating the average of the 10 values
mean(unlist(cv_results))

The kappa statistics is fairly low. It corresponds to "poor"
on the interpretation scale—which suggests that the credit scoring model does
not perform much better than random chance. 

# Bootstrap Sampling

This is an alternative to the k-fold CV