## Import Packages

In [9]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from naive_bayes_functions import train_test_split, naive_bayes_param, predict, calculate_accuracy, str_convert_float

## Iris Data Set (Continuous Features)

### 1 Data Preparation

In [10]:
# Load the Iris table; the first CSV column is used as the row index.
df = pd.read_csv('data/Iris.csv', index_col=0)

# Split into a training frame and a test frame.
# NOTE(review): 0.2 is presumably the test-set share, and the split appears to
# be random with no seed set here, so results may differ between runs -- confirm
# against train_test_split.
train_data, test_data = train_test_split(df, 0.2)

# The label is taken to be the last column. It is removed from the test frame
# only; the training frame keeps it.
label_column = test_data.columns[-1]
test_labels, test_data = test_data[label_column], test_data.drop(columns=[label_column])

train_data.head()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
6,5.4,3.9,1.7,0.4,Iris-setosa


### 2 Implementation and Test of Naive Bayes

In [11]:
# Build the model from the training frame (label column included), then
# predict a label for every row of the label-free test frame.
model = naive_bayes_param(train_data)
predict_labels = predict(model, test_data)

# Print the overall accuracy, then display the confusion matrix
# (actual labels as rows, predicted labels as columns).
accuracy = calculate_accuracy(predict_labels, test_labels)
print(f'Accuracy of My Naive Bayes: {accuracy}')
pd.crosstab(
    index=test_labels,
    columns=predict_labels,
    rownames=[label_column],
    colnames=["prediction"],
)

Accuracy of My Naive Bayes: 0.9


prediction,Iris-setosa,Iris-versicolor,Iris-virginica
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,9,0,0
Iris-versicolor,0,10,1
Iris-virginica,0,2,8


### 3 Compare With Sklearn Naive Bayes

In [12]:
# Reference model: sklearn's Gaussian naive Bayes, fitted on the same training
# rows. sklearn takes features and target separately, so the label column is
# split off the training frame for the call.
gnb = GaussianNB()
gnb.fit(
    X=train_data.drop(columns=[label_column]),
    y=train_data[label_column],
)
predict_labels = gnb.predict(test_data)

# Same report as for the custom model: accuracy, then the confusion matrix.
accuracy = calculate_accuracy(predict_labels, test_labels)
print(f'Accuracy of Sklearn Naive Bayes: {accuracy}')
pd.crosstab(
    index=test_labels,
    columns=predict_labels,
    rownames=[label_column],
    colnames=["prediction"],
)

Accuracy of Sklearn Naive Bayes: 0.9


prediction,Iris-setosa,Iris-versicolor,Iris-virginica
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Iris-setosa,9,0,0
Iris-versicolor,0,10,1
Iris-virginica,0,2,8


## Titanic Data Set (Combination of Continuous and Discrete Features)

### 1 Data Preparation

In [13]:
# Load the raw passenger table and set the target column aside.
label_column = 'Survived'
df = pd.read_csv('data/Titanic.csv')
df_labels = df[label_column]

# Remove the columns that are not used as features, then re-attach the target
# so that it ends up as the last column of the frame.
df = df.drop(columns=['PassengerId', label_column, 'Name', 'Ticket', 'Cabin'])
df[label_column] = df_labels

# Handling missing values: Age gets the column median, Embarked the most
# frequent value.
median_age = df['Age'].median()
mode_embarked = df['Embarked'].mode()[0]
df = df.fillna(value={'Age': median_age, 'Embarked': mode_embarked})

df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,22.0,1,0,7.25,S,0
1,1,female,38.0,1,0,71.2833,C,1
2,3,female,26.0,0,0,7.925,S,1
3,1,female,35.0,1,0,53.1,S,1
4,3,male,35.0,0,0,8.05,S,0


### 2 Split Data Set

In [14]:
# Split the prepared Titanic frame into training and test frames.
# NOTE(review): 0.1 is presumably the test-set share -- confirm against
# train_test_split.
train_data, test_data = train_test_split(df, 0.1)

# Keep the true labels aside and strip them from the test frame; the training
# frame keeps its label column.
test_labels, test_data = test_data[label_column], test_data.drop(columns=[label_column])

### 3 Implementation and Test of Naive Bayes

In [15]:
# Fit the custom model on the Titanic training frame (label column included)
# and predict labels for the held-out passengers.
model = naive_bayes_param(train_data)
predict_labels = predict(model, test_data)

# Accuracy first, then the confusion matrix (actual rows vs. predicted columns).
accuracy = calculate_accuracy(predict_labels, test_labels)
print(f'Accuracy of My Naive Bayes: {accuracy}')
pd.crosstab(
    index=test_labels,
    columns=predict_labels,
    rownames=[label_column],
    colnames=["prediction"],
)

Accuracy of My Naive Bayes: 0.7303370786516854


prediction,0,1
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39,6
1,18,26


### 4 Compare With Sklearn Naive Bayes

In [16]:
# sklearn's naive Bayes estimators need an all-numeric feature matrix, so the
# string-valued columns are converted to numbers first.
#
# The conversion is applied to copies. str_convert_float is called for its
# in-place effect (its return value is unused), and converting train_data /
# test_data directly would change what the custom-model cell sees if it is
# re-run afterwards.
# NOTE(review): the two frames are converted by separate calls -- confirm that
# str_convert_float maps the same string to the same number in both.
train_numeric = train_data.copy()
test_numeric = test_data.copy()
str_convert_float(train_numeric)
str_convert_float(test_numeric)

# NOTE(review): MultinomialNB is designed for count-like features, while Age
# and Fare are continuous, so this is only a rough baseline for comparison.
mnb = MultinomialNB()
mnb.fit(train_numeric.drop(label_column, axis=1), train_numeric[label_column])
predict_labels = mnb.predict(test_numeric)

# Same report as for the custom model: accuracy, then the confusion matrix
# (actual labels as rows, predicted labels as columns).
print(f'Accuracy of Sklearn Naive Bayes: {calculate_accuracy(predict_labels, test_labels)}')
pd.crosstab(test_labels, predict_labels, rownames=[label_column], colnames=["prediction"])

Accuracy of Sklearn Naive Bayes: 0.6404494382022472


prediction,0,1
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,37,8
1,24,20
