In [1]:
library(ggplot2)
library(lattice)
library(class)
library(gmodels)
library(stats)
library(clue)
library(e1071)
library(caret)

In [2]:
# Read wdbc.csv (comma-separated, "." as decimal mark, header row present).
# Strings are kept as character so the label can be recoded explicitly below.
data <- read.csv(
    file = "./wdbc.csv",
    header = TRUE,
    sep = ",",
    dec = ".",
    stringsAsFactors = FALSE
)

# The id column is dropped so that it is not used as a predictor.
data <- subset(data, select = -id)

# Recode the one-letter diagnosis codes into a labelled two-level factor.
data$diagnosis <- factor(
    data$diagnosis,
    levels = c("B", "M"),
    labels = c("Benign", "Malignant")
)

# Preview the first rows.
head(data)

diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,points_mean,symmetry_mean,⋯,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,points_worst,symmetry_worst,dimension_worst
Benign,12.32,12.39,78.85,464.1,0.1028,0.06981,0.03987,0.037,0.1959,⋯,13.5,15.64,86.97,549.1,0.1385,0.1266,0.1242,0.09391,0.2827,0.06771
Benign,10.6,18.95,69.28,346.4,0.09688,0.1147,0.06387,0.02642,0.1922,⋯,11.88,22.94,78.28,424.8,0.1213,0.2515,0.1916,0.07926,0.294,0.07587
Benign,11.04,16.83,70.92,373.2,0.1077,0.07804,0.03046,0.0248,0.1714,⋯,12.41,26.44,79.93,471.4,0.1369,0.1482,0.1067,0.07431,0.2998,0.07881
Benign,11.28,13.39,73.0,384.8,0.1164,0.1136,0.04635,0.04796,0.1771,⋯,11.92,15.77,76.53,434.0,0.1367,0.1822,0.08669,0.08611,0.2102,0.06784
Benign,15.19,13.21,97.65,711.8,0.07963,0.06934,0.03393,0.02657,0.1721,⋯,16.2,15.73,104.5,819.1,0.1126,0.1737,0.1362,0.08178,0.2487,0.06766
Benign,11.57,19.04,74.2,409.7,0.08546,0.07722,0.05485,0.01428,0.2031,⋯,13.07,26.98,86.43,520.5,0.1249,0.1937,0.256,0.06664,0.3035,0.08284


In [3]:
# Randomly permute the rows: the cross-validation helpers below cut the rows
# into contiguous folds, so the row order decides fold membership.
# NOTE(review): no set.seed() call is visible before this point, so the
# permutation -- and every cross-validation figure derived from it -- changes
# from run to run. Consider seeding the RNG if reproducibility matters.
data_shuffled <- data[sample(nrow(data), replace = FALSE), ]

# Renumber the rows 1..n so row names no longer carry the pre-shuffle order.
rownames(data_shuffled) <- seq_len(nrow(data_shuffled))

# Display the number of columns (class label + predictors).
ncol(data)

In [4]:
## kfolds k-nn
# k-fold cross-validation of a k-nearest-neighbour classifier.
#
# Arguments:
#   data      data frame whose first column is the class label and whose
#             remaining columns are the predictors handed to knn(). Rows are
#             split into contiguous folds, so shuffle the rows beforehand.
#   k         number of neighbours passed to knn().
#   folds     number of cross-validation folds (default 10).
#   positive  label treated as the positive class when counting false
#             negatives (default "Malignant").
#
# Side effects:
#   prints one CrossTable() confusion table per fold and, after the last
#   fold, the vector of per-fold false-negative counts.
#
# Returns:
#   the mean accuracy over all folds.
kfolds <- function(data, k, folds=10, positive="Malignant"){
    stopifnot(folds >= 2, nrow(data) >= folds)

    # Fold id (1..folds) for every row, assigned to consecutive row ranges.
    folds_vec <- cut(seq_len(nrow(data)), folds, labels=FALSE)

    # Preallocated per-fold accuracy and false-negative counts.
    res <- numeric(folds)
    fneg <- integer(folds)
    for(i in seq_len(folds)){
        # Split with a logical mask; negative indexing via -which() would
        # silently drop every row if no index matched.
        in_test <- folds_vec == i
        testing <- data[in_test, ]
        training <- data[!in_test, ]

        # Fit the min-max scaler on the training fold only and apply the same
        # transformation to both parts, so the test fold does not influence
        # the scaling.
        minMaxScaler <- caret::preProcess(training, method = "range")
        testing <- predict(minMaxScaler, testing)
        training <- predict(minMaxScaler, training)

        # Classify the test fold from the scaled training fold.
        test_pred <- knn(training[,-1], testing[,-1], training[,1], k=k)
        res[i] <- mean(test_pred == testing[,1])

        # False negatives: truly `positive` rows predicted as something else.
        fneg[i] <- sum(testing[,1] == positive & test_pred != positive)

        CrossTable(x=test_pred, y=testing[,1], prop.chisq=FALSE)
    }
    print(fneg)
    mean(res)
}

In [5]:
# Cross-validate k-NN with k = 21 on the shuffled data, using the default of
# 10 folds; the value shown is the mean accuracy over the folds.
kfolds(data_shuffled, 21)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  57 

 
             | testing[, 1] 
   test_pred |    Benign | Malignant | Row Total | 
-------------|-----------|-----------|-----------|
      Benign |        34 |         1 |        35 | 
             |     0.971 |     0.029 |     0.614 | 
             |     0.971 |     0.045 |           | 
             |     0.596 |     0.018 |           | 
-------------|-----------|-----------|-----------|
   Malignant |         1 |        21 |        22 | 
             |     0.045 |     0.955 |     0.386 | 
             |     0.029 |     0.955 |           | 
             |     0.018 |     0.368 |           | 
-------------|-----------|-----------|-----------|
Column Total |        35 |        22 |        57 | 
             |     0.614 |     0.386 |           | 
-------------|----

In [6]:
# kfolds svm
# k-fold cross-validation of a support vector machine classifier.
#
# Arguments:
#   data    data frame with the class label in a column named `diagnosis`
#           that is also the first column (the formula uses the name, the
#           accuracy check uses the position). Rows are split into
#           contiguous folds, so shuffle the rows beforehand.
#   folds   number of cross-validation folds (default 10).
#   kernel  kernel name passed to e1071::svm() (default "linear").
#   ...     further arguments forwarded to e1071::svm(), e.g. degree, gamma,
#           coef0.
#
# Side effects:
#   prints one CrossTable() confusion table per fold.
#
# Returns:
#   the mean accuracy over all folds.
#
# Settings tried earlier, expressed as calls to this function:
#   kfolds_svm(d, kernel="polynomial", degree=1)               # bad
#   kfolds_svm(d, kernel="sigmoid", coef0=0.001, gamma=0.075)
#   kfolds_svm(d, kernel="radial", coef0=0.001, gamma=0.075)
kfolds_svm <- function(data, folds=10, kernel="linear", ...){
    stopifnot(folds >= 2, nrow(data) >= folds)

    # Fold id (1..folds) for every row, assigned to consecutive row ranges.
    folds_vec <- cut(seq_len(nrow(data)), folds, labels=FALSE)

    # Preallocated per-fold accuracy.
    res <- numeric(folds)
    for(i in seq_len(folds)){
        # Split with a logical mask; negative indexing via -which() would
        # silently drop every row if no index matched.
        in_test <- folds_vec == i
        testing <- data[in_test, ]
        training <- data[!in_test, ]

        # Fit the min-max scaler on the training fold only and apply the same
        # transformation to both parts, so the test fold does not influence
        # the scaling.
        minMaxScaler <- caret::preProcess(training, method = "range")
        testing <- predict(minMaxScaler, testing)
        training <- predict(minMaxScaler, training)

        # scale=FALSE: the data were already rescaled above.
        model <- e1071::svm(diagnosis~., data=training, kernel=kernel,
                            scale=FALSE, ...)
        test_pred <- predict(model, testing)
        CrossTable(x=test_pred, y=testing[,1], prop.chisq=FALSE)
        res[i] <- mean(test_pred == testing[,1])
    }
    mean(res)
}
# Cross-validate the SVM on the shuffled data with the default of 10 folds;
# the value shown is the mean accuracy over the folds.
kfolds_svm(data_shuffled)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  57 

 
             | testing[, 1] 
   test_pred |    Benign | Malignant | Row Total | 
-------------|-----------|-----------|-----------|
      Benign |        33 |         1 |        34 | 
             |     0.971 |     0.029 |     0.596 | 
             |     0.943 |     0.045 |           | 
             |     0.579 |     0.018 |           | 
-------------|-----------|-----------|-----------|
   Malignant |         2 |        21 |        23 | 
             |     0.087 |     0.913 |     0.404 | 
             |     0.057 |     0.955 |           | 
             |     0.035 |     0.368 |           | 
-------------|-----------|-----------|-----------|
Column Total |        35 |        22 |        57 | 
             |     0.614 |     0.386 |           | 
-------------|----

In [7]:
# Author's observation: of the kernels tried in kfolds_svm, the linear kernel
# gave the best results. Re-run the cross-validation with that setting.
kfolds_svm(data_shuffled)


 
   Cell Contents
|-------------------------|
|                       N |
|           N / Row Total |
|           N / Col Total |
|         N / Table Total |
|-------------------------|

 
Total Observations in Table:  57 

 
             | testing[, 1] 
   test_pred |    Benign | Malignant | Row Total | 
-------------|-----------|-----------|-----------|
      Benign |        33 |         1 |        34 | 
             |     0.971 |     0.029 |     0.596 | 
             |     0.943 |     0.045 |           | 
             |     0.579 |     0.018 |           | 
-------------|-----------|-----------|-----------|
   Malignant |         2 |        21 |        23 | 
             |     0.087 |     0.913 |     0.404 | 
             |     0.057 |     0.955 |           | 
             |     0.035 |     0.368 |           | 
-------------|-----------|-----------|-----------|
Column Total |        35 |        22 |        57 | 
             |     0.614 |     0.386 |           | 
-------------|----