## Naive Bayes

In [4]:
#Loading required packages
library(tidyverse)
library(ggplot2)
# install.packages("GGally")
library(GGally)
library(caret)
library(psych)
library(rpart)
# install.packages("randomForest")
library(randomForest)
# install.packages("Amelia")
library(Amelia)
library(mice)
library(e1071)
# install.packages("klaR")
library(klaR)

### This example comes from [https://www.edureka.co/blog/naive-bayes-in-r/](https://www.edureka.co/blog/naive-bayes-in-r/)

### For a very nice from-scratch (manual) Python tutorial, see [https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/](https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/)

In [None]:
# Read the diabetes data from the working directory; the first CSV row holds
# the column names.
diabetes <- read.csv("diabetes.csv", header = TRUE)

# Store the response as a factor. The naive Bayes model trained below is a
# classifier, caret::confusionMatrix() expects factors, and the ggplot
# colour/fill mappings need a discrete variable to draw one group per class.
diabetes$Outcome <- factor(diabetes$Outcome)

# Inspect column types (Outcome should now be reported as a factor).
str(diabetes)

In [None]:
# Per-column descriptive statistics from the psych package.
psych::describe(diabetes)

In [None]:
# Treat 0 as "not recorded" and convert it to NA, but only in the measurement
# columns that the imputation step fills in afterwards. Selecting by name
# (rather than by position) keeps this set identical to the imputed set, so no
# column can end up with NAs that are never imputed.
zero_as_missing <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
diabetes[zero_as_missing] <- lapply(
  diabetes[zero_as_missing],
  # which() ignores existing NAs, so re-running the cell is harmless
  function(col) replace(col, which(col == 0), NA)
)

In [None]:
# Missingness map (Amelia): shows which cells of the data frame are NA
Amelia::missmap(diabetes)

In [None]:
# Missing values can be imputed with many techniques (e.g. k-nearest
# neighbours); here mice is used with random-forest imputation ("rf").
impute_cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")

# Random-forest imputation is stochastic: pass a seed so that re-running the
# notebook yields the same imputed values.
mice_mod <- mice(diabetes[, impute_cols], method = "rf", seed = 998)

# Namespace-qualified so the call does not depend on the order in which
# tidyverse (tidyr::complete) and mice were attached.
mice_complete <- mice::complete(mice_mod)

In [None]:
# Overwrite the five measurement columns of the main data frame with their
# completed (imputed) versions.
imputed_cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
diabetes[imputed_cols] <- mice_complete[imputed_cols]



In [None]:
# Redraw the missingness map to check the data frame after imputation
Amelia::missmap(diabetes)

In [None]:
# Data visualisation 1: frequency polygon of Age (bin width 1) with the line
# colour mapped to Outcome.
ggplot(data = diabetes, mapping = aes(x = Age, colour = Outcome)) +
  geom_freqpoly(binwidth = 1) +
  ggtitle("Age Distribution by Outcome")

In [None]:
# Visual 2: histogram of Pregnancies (bin width 1); both bar fill and outline
# colour are mapped to Outcome.
# NOTE(review): the plot object is stored in `c`, the same name as base::c().
# The name is kept in case later cells refer to it.
c <- ggplot(
  data = diabetes,
  mapping = aes(x = Pregnancies, colour = Outcome, fill = Outcome)
) +
  geom_histogram(binwidth = 1) +
  ggtitle("Pregnancy Distribution by Outcome")
c + theme_bw()

In [None]:
# Visual 3: histogram of BMI (bin width 1); both bar fill and outline colour
# are mapped to Outcome. Drawn with the black-and-white theme.
P <- ggplot(
  data = diabetes,
  mapping = aes(x = BMI, colour = Outcome, fill = Outcome)
) +
  geom_histogram(binwidth = 1) +
  ggtitle("BMI Distribution by Outcome")
P + theme_bw()

In [None]:
# Visual 4: frequency polygon of Glucose (bin width 1) with the line colour
# mapped to Outcome.
ggplot(data = diabetes, mapping = aes(x = Glucose, colour = Outcome)) +
  geom_freqpoly(binwidth = 1) +
  ggtitle("Glucose Distribution by Outcome")


In [None]:
# Visual 5: pairwise plot matrix of every column in the data frame.
# ggpairs() is provided by the GGally package (not by ggplot2), so it is
# called through its namespace to work even when GGally is not attached.
GGally::ggpairs(diabetes)

In [None]:
# Model building, step 1: hold out a test set.
# createDataPartition() samples row indices using Outcome as the stratifying
# variable; 75% of the rows go to training and the remaining rows to testing.
library(caret)

# Fixed seed so the partition is the same on every run
set.seed(998)
indxTrain <- createDataPartition(diabetes[["Outcome"]], p = 0.75, list = FALSE)

# Rows selected by the partition vs. all other rows
training <- diabetes[indxTrain, ]
testing <- diabetes[-indxTrain, ]

In [None]:
# Compare the class balance (percentage of rows per Outcome value) in the
# full data, the training set and the test set.

100 * prop.table(table(diabetes[["Outcome"]]))
100 * prop.table(table(training[["Outcome"]]))
100 * prop.table(table(testing[["Outcome"]]))

In [None]:
# x holds the predictor variables, y holds the response variable.
# The response is dropped from x by name rather than by column position, so
# Outcome can never leak into the predictors if the column order changes.
x <- training[, setdiff(names(training), "Outcome"), drop = FALSE]
y <- training$Outcome

In [None]:
# Fit a naive Bayes model through caret, resampled with 10-fold
# cross-validation.
model <- train(
  x = x,
  y = y,
  method = "naive_bayes",
  trControl = trainControl(method = "cv", number = 10)
)
#?naive_bayes

In [None]:
# Model evaluation, step 1: apply the fitted model to the held-out test rows
Predict <- predict(object = model, newdata = testing)

In [None]:
# Model evaluation, step 2: cross-tabulate predicted vs. observed Outcome on
# the test set and report accuracy and the related statistics.

confusionMatrix(data = Predict, reference = testing$Outcome)

In [None]:
# Compute caret's variable-importance scores for the fitted model and plot them
X <- varImp(object = model)
plot(X)