### Data Analytics III
<pre>
1. Implement Simple Naive Bayes classification algorithm using Python/R on iris.csv dataset.
2. Compute Confusion Matrix to find TP, TN, FP, FN, Accuracy, Error Rate, Precision, Recall on the given dataset
</pre>

In [157]:
import numpy as np
import pandas as pd
import math
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,recall_score,precision_score
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB


In [158]:
df = pd.read_csv('./Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [159]:
df.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [160]:
df.shape

(150, 6)

In [161]:
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [162]:
df.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [163]:
df.duplicated().sum()

0

In [164]:
label_encoder = preprocessing.LabelEncoder()
df['Species'] = label_encoder.fit_transform(df['Species'])

In [165]:
Y = df['Species']
X = df.drop(['Id','Species'],axis = 1)

In [166]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=25,test_size=0.25)

In [167]:
print(f"x traing dataset size = {x_train.shape}")
print(f"x test dataset size = {x_test.shape}")
print(f"y traing dataset size = {y_train.shape}")
print(f"y test dataset size = {y_test.shape}")

x traing dataset size = (112, 4)
x test dataset size = (38, 4)
y traing dataset size = (112,)
y test dataset size = (38,)


In [168]:
model = GaussianNB()
model.fit(x_train,y_train)

In [169]:
y_pred = model.predict(x_test)
y_pred

array([0, 1, 2, 1, 2, 1, 2, 0, 1, 1, 0, 0, 0, 1, 0, 1, 2, 2, 1, 1, 1, 1,
       1, 0, 0, 2, 1, 2, 2, 0, 1, 2, 2, 0, 2, 2, 1, 0])

In [170]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[11,  0,  0],
       [ 0, 14,  2],
       [ 0,  1, 10]], dtype=int64)

In [171]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.93      0.88      0.90        16
           2       0.83      0.91      0.87        11

    accuracy                           0.92        38
   macro avg       0.92      0.93      0.92        38
weighted avg       0.92      0.92      0.92        38



In [172]:
tp0 = cm[0][0]
tp1 = cm[1][1]
tp2 = cm[2][2]
fn0 = cm[0][1] + cm[0][2]
fn1 = cm[1][0] + cm[1][2]
fn2 = cm[2][0] + cm[2][1]
fp0 = cm[1][0] + cm[2][0]
fp1 = cm[0][1] + cm[2][1]
fp2 = cm[0][2] + cm[1][2]
tn0 = cm[1][1] + cm[1][2] + cm[2][1] + cm[2][2]
tn1 = cm[0][0] + cm[0][2] + cm[2][0] + cm[2][2]
tn2 = cm[0][0] + cm[0][1] + cm[1][0] + cm[1][1]



In [173]:
acc0 = ((tp0+tn0)/(tp0+tn0+fn0+fp0))
print(acc0)
acc1 = ((tp1+tn1)/(tp1+tn1+fn1+fp1))
print(acc1)
acc2 = ((tp2+tn2)/(tp2+tn2+fn2+fp2))
print(acc2)

1.0
0.9210526315789473
0.9210526315789473


In [174]:
accuracy_score(y_test,y_pred)

0.9210526315789473

In [175]:
err0 = 1-acc0
err1 = 1-acc1
err2 = 1-acc2
print(f"error rate 0: {err0}")
print(f"error rate 1 : {err1:.2f}")
print(f"error rate 2: {err2:.2f}")

error rate 0: 0.0
error rate 1 : 0.08
error rate 2: 0.08


In [176]:
pre0 = (tp0)/(tp0+fp0)
pre1 = (tp1)/(tp1+fp1)
pre2 = (tp2)/(tp2+fp2)
print(f"precision 0: {pre0:.4f}")
print(f"precision 1: {pre1:.4f}")
print(f"precision 2: {pre2:.4f}")


precision 0: 1.0000
precision 1: 0.9333
precision 2: 0.8333


In [177]:
recall0 = (tp0)/(tp0+fn0)
recall1 = (tp1)/(tp1+fn1)
recall2 = (tp2)/(tp2+fn2)
print(f"recall 0: {recall0:.4f}")
print(f"recall 1: {recall1:.4f}")
print(f"recall 2: {recall2:.4f}")

recall 0: 1.0000
recall 1: 0.8750
recall 2: 0.9091


In [178]:
f10 = 2*((pre0*recall0)/(pre0+recall0))
f11 = 2*((pre1*recall1)/(pre1+recall1))
f12 = 2*((pre2*recall2)/(pre2+recall2))
print(f"F1 score 0: {f10:.4f}")
print(f"F1 score 1: {f11:.4f}")
print(f"F1 score 2: {f12:.4f}")

F1 score 0: 1.0000
F1 score 1: 0.9032
F1 score 2: 0.8696


In [183]:
precision = precision_score(y_test,y_pred,average=None)
precision

array([1.        , 0.93333333, 0.83333333])

In [184]:
recall = recall_score(y_test,y_pred,average=None)
recall

array([1.        , 0.875     , 0.90909091])

In [185]:
TP = tp0+tp1+tp2
TN = tn0+tn1+tn2
FP = fp0+fp1+fp2
FN = fn0+fn1+fn2
print(TP,TN,FP,FN)

35 73 3 3


In [186]:
accuracy = (TP+TN)/(TP+FP+TN+FN)
print(accuracy)

0.9473684210526315


In [187]:
precision = TP/(TP+FP)
print("Precision: ", precision)


Precision:  0.9210526315789473


In [188]:
recall = TP /(TP+FN)
print("Recall: ",recall)

Recall:  0.9210526315789473


In [190]:
error_rate = 1-accuracy
print("Error rate = ", error_rate)

Error rate =  0.052631578947368474


In [194]:
f1_score = 2*((precision*recall)/(precision+recall))
f1_score

0.9210526315789473

In [195]:
True_negative_rate = TN/(TN+FP)
print("Specificity or true negative rate : ",True_negative_rate)

Specificity or true negative rate :  0.9605263157894737


In [196]:
False_positive_rate = 1 - True_negative_rate
False_positive_rate

0.03947368421052633