# 樸素貝氏分類器 (Naive Bayes classifier)

In [1]:
library(MASS)
library(klaR)

# Iris 資料集

In [2]:
data(iris)

# Stratified split: draw 35 training rows from each consecutive block of
# 50 rows (1-50, 51-100, 101-150); the other 15 rows of each block are
# held out for testing.
index <- unlist(lapply(c(0L, 50L, 100L), function(offset) {
    offset + sample(1:50, 35)
}))
training <- iris[index, ]
testing <- iris[-index, ]

# Fit on the training rows, then classify the held-out rows.
fit_bayes <- NaiveBayes(Species ~ ., data = training)
pre_bayes <- predict(fit_bayes, testing)

# Confusion matrix (rows: predicted class, columns: actual class),
# followed by the percentage of held-out rows classified correctly.
table(pre_bayes$class, testing$Species, dnn = c("Prediction", "Actual"))
cat("\n  Accuracy:",
    sum(pre_bayes$class == testing$Species) / nrow(testing) * 100, "%")

            Actual
Prediction   setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         14         2
  virginica       0          1        13


  Accuracy: 93.33333 %

In [3]:
# Inspect the fitted model: prior probability of each class.
fit_bayes[["apriori"]]

# One table per predictor; for each class, column 1 is the mean and
# column 2 is the standard deviation.
print(fit_bayes[["tables"]])

grouping
    setosa versicolor  virginica 
 0.3333333  0.3333333  0.3333333 

$Sepal.Length
               [,1]      [,2]
setosa     5.014286 0.3264271
versicolor 5.902857 0.5601920
virginica  6.645714 0.5937631

$Sepal.Width
               [,1]      [,2]
setosa     3.474286 0.3509172
versicolor 2.802857 0.3129421
virginica  3.045714 0.2953989

$Petal.Length
               [,1]      [,2]
setosa     1.457143 0.1786974
versicolor 4.211429 0.4837181
virginica  5.594286 0.5246527

$Petal.Width
                [,1]      [,2]
setosa     0.2542857 0.1171797
versicolor 1.3285714 0.1856173
virginica  2.0514286 0.2715656



In [4]:
# Wrapped as a reusable function

#' Fit and evaluate a naive Bayes classifier on a random train/test split.
#'
#' The last column of `rawdata` is used as the class label and every other
#' column as a predictor. Rows are split at random into a training part and
#' a held-out part; the model is fitted on the training part, and the
#' confusion matrix and accuracy on the held-out part are printed.
#'
#' @param rawdata Data frame whose last column holds the class label.
#' @param train_frac Fraction of rows used for training (default 0.7).
#' @return The fitted object returned by `NaiveBayes()`.
fit_NaiveBayes <- function(rawdata, train_frac = 0.7) {
    stopifnot(
        ncol(rawdata) >= 2,
        is.numeric(train_frac), length(train_frac) == 1,
        train_frac > 0, train_frac < 1
    )

    last <- ncol(rawdata)
    n_rows <- nrow(rawdata)
    n_train <- round(n_rows * train_frac)
    # An empty training index would make `-train_idx` select zero rows,
    # and an empty test set makes the accuracy undefined.
    if (n_train < 1 || n_train >= n_rows) {
        stop("train/test split leaves an empty partition", call. = FALSE)
    }
    train_idx <- sample.int(n_rows, n_train)

    # Convert the label once, before splitting, so the training and
    # held-out labels share one set of levels.
    labels <- as.factor(rawdata[[last]])
    train_y <- labels[train_idx]
    test_y <- labels[-train_idx]
    # drop = FALSE keeps a data frame even when only one predictor is left.
    train_x <- rawdata[train_idx, -last, drop = FALSE]
    test_x <- rawdata[-train_idx, -last, drop = FALSE]

    # The x/grouping interface is used instead of a formula such as
    # `rawdata[index, last] ~ .`, whose response expression is looked up in
    # the data first and breaks when a predictor is named `index` or `last`.
    fit_bayes <- NaiveBayes(train_x, train_y)
    pre_bayes <- predict(fit_bayes, test_x)

    print(table(pre_bayes$class, test_y, dnn = c("Prediction", "Actual")))
    hits <- as.character(pre_bayes$class) == as.character(test_y)
    cat("\n  Accuracy:", sum(hits) / length(hits) * 100, "%\n\n")
    return(fit_bayes)
}

# 性別資料集

In [5]:
# Load the gender/size data with text columns read as factors, then fit
# and evaluate the classifier via fit_NaiveBayes().
gender_size <- read.csv(
    file = "../data_files/gender_size.csv",
    stringsAsFactors = TRUE
)
fit_bayes <- fit_NaiveBayes(rawdata = gender_size)

          Actual
Prediction female male
    female     13    2
    male        2   25

  Accuracy: 90.47619 %



In [6]:
# Classify two new observations. cat() prints the integer code of the
# predicted factor level, so 1 means female and 2 means male.
x1 <- data.frame(Height = 175, Weight = 72, Waist = 32)
x2 <- data.frame(Height = 165, Weight = 58, Waist = 28)
cat("\nx1:", predict(fit_bayes, newdata = x1)[["class"]])
cat("\nx2:", predict(fit_bayes, newdata = x2)[["class"]])


x1: 2
x2: 2

# 鐵達尼號資料集

In [7]:
titanic_data <- read.csv("../data_files/titanic.csv", header = TRUE, sep = ",",
                         stringsAsFactors = TRUE)

# Clean the data.
titanic <- na.exclude(titanic_data)  # drop rows containing NA

# 9999 marks a missing age or fare; keep only rows where neither is 9999.
# A logical mask is used because negative indexing with an empty index
# vector (no 9999 present) would select zero rows instead of all rows.
keep <- titanic$age != 9999 & titanic$fare != 9999
titan <- titanic[keep, 2:5]
titan$survival <- titanic[keep, 10]  # class label goes in the last column

# Recode gender (column 1): 0 -> "male", 1 -> "female"; store as a factor.
gender <- as.character(titan[[1]])
gender[gender == "0"] <- "male"
gender[gender == "1"] <- "female"
titan[[1]] <- as.factor(gender)

# Recode survival (column 5): 1 -> "yes", 0 -> "no"; store as a factor.
survived <- as.character(titan[[5]])
survived[survived == "1"] <- "yes"
survived[survived == "0"] <- "no"
titan[[5]] <- as.factor(survived)

fit_bayes <- fit_NaiveBayes(titan)

          Actual
Prediction  no yes
       no  232  75
       yes  16  63

  Accuracy: 76.42487 %

