# KNN分類 (k-nearest neighbors)

In [1]:
library(class)
library(e1071) # for tune.knn

# iris資料集

In [2]:
data(iris)

# Stratified 70/30 split: draw 35 rows of each species for training.
# The row indices are derived from the Species column instead of hard-coded
# ranges, so the split does not rely on the rows being ordered by species.
rows_by_species <- split(seq_len(nrow(iris)), iris$Species)
index <- unlist(lapply(rows_by_species, sample, size = 35), use.names = FALSE)
training <- iris[index, ]
testing <- iris[-index, ]

# Train/predict with k = 3. Column 5 (Species) is the class label and is
# excluded from the feature columns.
knn.iris <- knn(training[, -5], testing[, -5], training[, 5], k = 3)

# Confusion matrix and overall accuracy on the held-out rows.
# (No argument name on the first vector: `dnn` alone supplies the labels.)
table(knn.iris, testing[, 5], dnn = c("Prediction", "Actual"))
cat("\n  Accuracy:",
    mean(knn.iris == testing[, 5]) * 100, "%")

            Actual
Prediction   setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         14         2
  virginica       0          1        13


  Accuracy: 93.33333 %

# 性別資料集

In [3]:
gender_size <- read.csv("../data_files/gender_size.csv", stringsAsFactors = TRUE)

# Hold out 30% of the rows BEFORE any model selection, so the test rows play
# no part in choosing k (tuning on the full data set would leak test
# information into the reported accuracy).
index <- sample(seq_len(nrow(gender_size)), nrow(gender_size) * 0.7)
training <- gender_size[index, ]
testing <- gender_size[-index, ]

# Use tune.knn to pick the best number of neighbours k from the candidates,
# looking at the training rows only. Column 4 is the class label; the other
# columns are the features.
tune.gender <- tune.knn(training[, -4], training[, 4], k = c(3, 5, 7, 9))
cat('The best k number:', tune.gender$best.model$k)

# Train/predict with the selected k
knn.gender <- knn(training[, -4], testing[, -4], training[, 4],
                  k = tune.gender$best.model$k)

# Test-set results: confusion matrix and overall accuracy
table(knn.gender, testing[, 4], dnn = c("Prediction", "Actual"))
cat("\n  Accuracy:",
    mean(knn.gender == testing[, 4]) * 100, "%")

The best k number: 7

          Actual
Prediction female male
    female     13    2
    male        2   26


  Accuracy: 90.69767 %

In [4]:
# Tune, fit and evaluate a k-nearest-neighbour classifier on one data set.
#
# Arguments:
#   knn_data    two-dimensional table (data frame) whose LAST COLUMN is the
#               class label; every other column is used as a feature.
#   k           candidate numbers of neighbours handed to tune.knn.
#   train_frac  fraction of the rows used for training; the remaining rows
#               form the test set.
#
# Prints the selected k, the test-set confusion matrix and the test-set
# accuracy (in percent). Invisibly returns a list with elements `k`,
# `prediction`, `confusion` and `accuracy`.
best_knn <- function(knn_data, k = c(3, 5, 7, 9), train_frac = 0.7) {
    stopifnot(length(dim(knn_data)) == 2, ncol(knn_data) >= 2)
    last <- ncol(knn_data)

    # Split first: k must be selected on the training rows only, otherwise
    # the test rows influence the model and the accuracy is over-optimistic.
    index <- sample(seq_len(nrow(knn_data)), nrow(knn_data) * train_frac)
    # drop = FALSE keeps a single feature column two-dimensional.
    train_x <- knn_data[index, -last, drop = FALSE]
    train_y <- knn_data[index, last]
    test_x <- knn_data[-index, -last, drop = FALSE]
    test_y <- knn_data[-index, last]

    # The result is stored under a name that does not shadow tune.knn itself.
    tuned <- tune.knn(train_x, train_y, k = k)
    best_k <- tuned$best.model$k
    cat('The best k number:', best_k, '\n')

    result.knn <- knn(train_x, test_x, train_y, k = best_k)

    confusion <- table(result.knn, test_y, dnn = c("Prediction", "Actual"))
    print(confusion)
    accuracy <- mean(test_y == result.knn) * 100
    cat("\n  Accuracy:", accuracy, "%\n\n")

    invisible(list(k = best_k, prediction = result.knn,
                   confusion = confusion, accuracy = accuracy))
}

# 癌症資料集

In [5]:
cancer_data <- read.csv("../data_files/breast_cancer.csv", header = TRUE, sep = ",")

# Keep columns 2-11 and drop the rows whose Bare.Nuclei equals 9999
# (presumably a missing-value code -- confirm against the data file).
# The !is.na() guard prevents NA entries from becoming all-NA rows, which is
# what logical row indexing does with an NA index.
keep <- !is.na(cancer_data$Bare.Nuclei) & cancer_data$Bare.Nuclei != 9999
cancer <- cancer_data[keep, 2:11]
# best_knn() treats the last column as the class label, so Class is expected
# to be the last of the selected columns -- verify against the file layout.
cancer$Class <- as.factor(cancer$Class)

cat("cancer_dataset:\n")
best_knn(cancer)

cancer_dataset:
The best k number: 7 
           Actual
Prediction  benign malignant
  benign       121         5
  malignant      5        74

  Accuracy: 95.12195 %



# 玻璃資料集

In [6]:
glass_data <- read.table("../data_files/glass.txt", header = FALSE, sep = ",")
glass <- glass_data[, -1]  # remove unnecessary first column
colnames(glass) <- c("x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "type")
# The class label must be a factor; it is the last column, as best_knn() expects.
glass$type <- as.factor(glass$type)

cat("glass_dataset:\n")
best_knn(glass)

glass_dataset:
The best k number: 3 
          Actual
Prediction  1  2  3  5  6  7
         1 14  2  5  0  0  0
         2  3 19  1  3  1  0
         3  0  0  2  0  0  0
         5  0  0  0  3  0  0
         6  0  0  0  0  0  0
         7  0  0  0  0  0 12

  Accuracy: 76.92308 %

