## Importing libraries and the data

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

In [2]:
iris = load_iris()
iris_df = pd.DataFrame(iris.data, 
                       columns = iris.feature_names)
iris_df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
iris_df.head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [3]:
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   sepal length (cm)  150 non-null    float64 
 1   sepal width (cm)   150 non-null    float64 
 2   petal length (cm)  150 non-null    float64 
 3   petal width (cm)   150 non-null    float64 
 4   species            150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB


In [4]:
iris_df.shape

(150, 5)

In [5]:
# splitting data to train and test
x = iris_df.iloc[:, :-1].values
y = iris_df.iloc[:, -1].values

In [6]:
x, y

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5

In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size = 0.3, 
                                                    random_state = 42)

In [8]:
x_train.shape, x_test.shape

((105, 4), (45, 4))

## Applying z-score

In [9]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [10]:
x_train, x_test

(array([[-0.4134164 , -1.46200287, -0.09951105, -0.32339776],
        [ 0.55122187, -0.50256349,  0.71770262,  0.35303182],
        [ 0.67180165,  0.21701605,  0.95119225,  0.75888956],
        [ 0.91296121, -0.02284379,  0.30909579,  0.2177459 ],
        [ 1.63643991,  1.41631528,  1.30142668,  1.70589097],
        [-0.17225683, -0.26270364,  0.19235097,  0.08245999],
        [ 2.11875905, -0.02284379,  1.59328871,  1.16474731],
        [-0.29283662, -0.02284379,  0.36746819,  0.35303182],
        [-0.89573553,  1.17645543, -1.44207638, -1.40568508],
        [ 2.23933883, -0.50256349,  1.65166111,  1.0294614 ],
        [-0.05167705, -0.74242333,  0.13397857, -0.32339776],
        [-0.77515575,  0.93659559, -1.44207638, -1.40568508],
        [-1.01631531,  1.17645543, -1.50044878, -1.27039917],
        [-0.89573553,  1.89603497, -1.15021435, -1.13511325],
        [-1.01631531, -2.42144225, -0.21625586, -0.32339776],
        [ 0.55122187, -0.74242333,  0.60095781,  0.75888956],
        

## Gaussian Naive Bayes

In [11]:
classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB()

In [12]:
x_train, y_train

(array([[-0.4134164 , -1.46200287, -0.09951105, -0.32339776],
        [ 0.55122187, -0.50256349,  0.71770262,  0.35303182],
        [ 0.67180165,  0.21701605,  0.95119225,  0.75888956],
        [ 0.91296121, -0.02284379,  0.30909579,  0.2177459 ],
        [ 1.63643991,  1.41631528,  1.30142668,  1.70589097],
        [-0.17225683, -0.26270364,  0.19235097,  0.08245999],
        [ 2.11875905, -0.02284379,  1.59328871,  1.16474731],
        [-0.29283662, -0.02284379,  0.36746819,  0.35303182],
        [-0.89573553,  1.17645543, -1.44207638, -1.40568508],
        [ 2.23933883, -0.50256349,  1.65166111,  1.0294614 ],
        [-0.05167705, -0.74242333,  0.13397857, -0.32339776],
        [-0.77515575,  0.93659559, -1.44207638, -1.40568508],
        [-1.01631531,  1.17645543, -1.50044878, -1.27039917],
        [-0.89573553,  1.89603497, -1.15021435, -1.13511325],
        [-1.01631531, -2.42144225, -0.21625586, -0.32339776],
        [ 0.55122187, -0.74242333,  0.60095781,  0.75888956],
        

In [13]:
sample_data = [[5.9, 3.0, 5.1, 1.8]]
scaled_data = sc.transform(sample_data)
prediction = classifier.predict(scaled_data)
prediction

array(['virginica'], dtype='<U10')

In [14]:
y_prediction = classifier.predict(x_test)
y_pred_column = y_prediction.reshape(len(y_prediction), 1)
y_test_column = y_test.reshape(len(y_test), 1)
combined = np.concatenate((y_pred_column, y_test_column), axis = 1)

df = pd.DataFrame(combined, 
                  columns = ['Predicted', 'Actual'])

In [15]:
df.head(10)

Unnamed: 0,Predicted,Actual
0,versicolor,versicolor
1,setosa,setosa
2,virginica,virginica
3,versicolor,versicolor
4,versicolor,versicolor
5,setosa,setosa
6,versicolor,versicolor
7,virginica,virginica
8,versicolor,versicolor
9,versicolor,versicolor


## Confusion matrix

In [16]:
cm = confusion_matrix(y_test, y_prediction)
cm

array([[19,  0,  0],
       [ 0, 12,  1],
       [ 0,  0, 13]])

In [17]:
cm_df = pd.DataFrame(cm, 
                     index = ['Actual setosa', 'Actual versicolor', 'Actual virginica'], 
                     columns = ['Predicted setosa', 'Predicted versicolor', 'Predicted virginica'])
cm_df

Unnamed: 0,Predicted setosa,Predicted versicolor,Predicted virginica
Actual setosa,19,0,0
Actual versicolor,0,12,1
Actual virginica,0,0,13


## Performance metrics

In [18]:
# Initialize the sum of True Negative Rates
sum_TNR = 0

# Calculate the TNR for each class
for i in range(len(cm)):
    TN = np.sum(cm) - np.sum(cm[i, :]) - np.sum(cm[:, i]) + cm[i, i]
    FP = np.sum(cm[:, i]) - cm[i, i]
    sum_TNR += TN / (TN + FP) if (TN + FP) != 0 else 0

# Calculate the macro-averaged TNR
macro_averaged_TNR = sum_TNR / len(cm)

macro_averaged_TNR

0.9895833333333334

In [19]:
accuracy = accuracy_score(y_test, y_prediction)
precision = precision_score(y_test, y_prediction, average='macro')
recall = recall_score(y_test, y_prediction, average='macro')  # Also known as TP Rate
f1 = f1_score(y_test, y_prediction, average='macro')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall (TP Rate): {recall}')
print(f'TN - Rate: {macro_averaged_TNR}')
print(f'F1-Score: {f1}')

Accuracy: 0.9777777777777777
Precision: 0.9761904761904763
Recall (TP Rate): 0.9743589743589745
TN - Rate: 0.9895833333333334
F1-Score: 0.974320987654321
