In [1]:
# importing the required libraries
import numpy as np
import pandas as pd
import random

In [2]:
# Read iris.csv into a list of float rows, skipping the first (header) line.
# The context manager guarantees the file is closed even when a value fails
# to parse, which the manual open()/close() pair did not.
with open('iris.csv') as f:
    next(f, None)  # skip the header; the default avoids StopIteration on an empty file
    iris_values = [
        [float(val) for val in line.split(',')]
        for line in f
        if line.strip()  # ignore blank lines such as a trailing newline at EOF
    ]

# Shuffle with a dedicated, seeded generator: the row order (and therefore the
# later fixed-random_state train/test split) is the same on every run, and the
# global `random` state is left untouched.
random.Random(0).shuffle(iris_values)

In [3]:
# Discretise the data by rounding every value to the nearest integer.
# np.round is used instead of the alias np.round_, which was deprecated in
# NumPy 1.25 and removed in NumPy 2.0.
# Every column of iris_values is rounded, including the last one that later
# cells use as the class label (presumably a no-op for integer-coded labels).
discrete_iris = np.round(iris_values)
print(discrete_iris)

[[5. 4. 2. 0. 0.]
 [5. 3. 2. 0. 0.]
 [6. 3. 4. 1. 1.]
 [7. 3. 5. 2. 2.]
 [6. 3. 4. 1. 1.]
 [7. 3. 5. 2. 2.]
 [5. 3. 4. 1. 1.]
 [5. 3. 2. 0. 0.]
 [5. 4. 2. 0. 0.]
 [7. 3. 6. 2. 2.]
 [6. 3. 4. 1. 1.]
 [5. 3. 1. 0. 0.]
 [6. 3. 5. 2. 2.]
 [6. 4. 1. 0. 0.]
 [6. 4. 2. 0. 0.]
 [5. 3. 1. 0. 0.]
 [5. 3. 2. 0. 0.]
 [5. 4. 2. 0. 0.]
 [6. 3. 5. 2. 2.]
 [8. 3. 7. 2. 2.]
 [6. 3. 5. 2. 2.]
 [7. 3. 5. 1. 1.]
 [4. 2. 1. 0. 0.]
 [6. 3. 6. 2. 2.]
 [6. 3. 5. 2. 2.]
 [6. 3. 6. 1. 2.]
 [6. 2. 4. 1. 1.]
 [5. 3. 2. 0. 0.]
 [6. 3. 5. 1. 1.]
 [5. 3. 2. 0. 0.]
 [6. 3. 5. 2. 1.]
 [5. 4. 2. 0. 0.]
 [5. 3. 2. 0. 0.]
 [6. 4. 1. 0. 0.]
 [8. 3. 7. 2. 2.]
 [6. 3. 6. 2. 2.]
 [6. 2. 4. 1. 1.]
 [6. 3. 4. 2. 1.]
 [5. 4. 1. 0. 0.]
 [7. 3. 6. 2. 2.]
 [6. 3. 5. 2. 2.]
 [5. 2. 3. 1. 1.]
 [6. 3. 5. 2. 2.]
 [5. 4. 2. 0. 0.]
 [7. 3. 5. 1. 1.]
 [5. 3. 2. 0. 0.]
 [5. 3. 2. 0. 0.]
 [6. 3. 4. 2. 1.]
 [6. 3. 5. 2. 2.]
 [7. 2. 6. 2. 2.]
 [6. 2. 4. 1. 1.]
 [7. 3. 5. 1. 1.]
 [7. 3. 6. 2. 2.]
 [7. 4. 6. 2. 2.]
 [7. 3. 5. 2. 2.]
 [5. 3. 1.

In [4]:
def calculate_prior(df, Y):
    """Return the empirical prior of every class found in column ``Y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the label column.
    Y : str
        Name of the label column.

    Returns
    -------
    list of float
        Fraction of rows belonging to each class, ordered by ascending
        class value.
    """
    n_rows = len(df)
    ordered_classes = sorted(df[Y].unique())
    return [len(df[df[Y] == cls]) / n_rows for cls in ordered_classes]

In [5]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label, min_std=0.001):
    """Gaussian class-conditional density p(feat_val | label) for one feature.

    The mean and (sample) standard deviation of ``feat_name`` are estimated
    from the rows of ``df`` whose ``Y`` column equals ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data.
    feat_name : str
        Column holding the feature.
    feat_val : float
        Feature value at which the density is evaluated.
    Y : str
        Name of the label column.
    label
        Class whose distribution is used.
    min_std : float, optional
        Lower bound applied to the estimated standard deviation so that a
        constant feature does not cause a division by zero.

    Returns
    -------
    float
        Value of the normal density at ``feat_val``.
    """
    class_rows = df[df[Y] == label]
    mean, std = class_rows[feat_name].mean(), class_rows[feat_name].std()

    # One floor for both the normaliser and the exponent: previously the
    # normaliser floored the std while the exponent floored the variance,
    # so the two terms described different Gaussians for small std.
    # A NaN std (class with a single row) fails the comparison and also
    # falls back to the floor.
    sigma = std if std > min_std else min_std

    exponent = -((feat_val - mean) ** 2) / (2 * sigma ** 2)
    return np.exp(exponent) / (np.sqrt(2 * np.pi) * sigma)

In [6]:
def naive_bayes_gaussian(df, X, Y, min_std=0.001):
    """Predict the class of every sample in ``X`` with Gaussian naive Bayes.

    Per-class feature means and standard deviations are estimated once from
    ``df`` and all samples are scored together, instead of re-filtering the
    frame for every (sample, class, feature) triple. Scores are accumulated
    as log-probabilities, so the product of many small densities cannot
    underflow to zero and make every class tie.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data. Every column except the last one is used as a
        feature, in column order.
    X : sequence of sequences of numbers
        Samples to classify. Values are matched to the feature columns by
        position; values beyond the number of features are ignored.
    Y : str
        Name of the label column in ``df``.
    min_std : float, optional
        Lower bound applied to each estimated standard deviation so that a
        constant feature does not cause a division by zero.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each sample (the label values found in
        ``df[Y]``, not their positions). Empty when ``X`` is empty.
    """
    features = list(df.columns)[:-1]
    labels = sorted(df[Y].unique())

    samples = np.asarray(X, dtype=float)
    if samples.size == 0:
        return np.array([])
    # Keep only as many leading values per sample as there are features.
    samples = np.atleast_2d(samples)[:, :len(features)]

    log_posterior = np.empty((samples.shape[0], len(labels)))
    for j, label in enumerate(labels):
        class_rows = df.loc[df[Y] == label, features]
        mean = class_rows.mean().to_numpy(dtype=float)
        std = class_rows.std().to_numpy(dtype=float)
        # NaN (single-row class) and tiny deviations both fall back to the floor.
        sigma = np.where(std > min_std, std, min_std)

        log_prior = np.log(len(class_rows) / len(df))
        z = (samples - mean) / sigma
        log_likelihood = -np.sum(
            np.log(np.sqrt(2 * np.pi) * sigma) + 0.5 * z ** 2, axis=1
        )
        log_posterior[:, j] = log_prior + log_likelihood

    # Map the winning column back to its class label; ties go to the
    # smallest label because argmax returns the first maximum.
    return np.asarray(labels)[np.argmax(log_posterior, axis=1)]

In [7]:
# Evaluation on the discretised data.
# Hold out 20% of the rows for testing (the original note gives 120 train /
# 30 test rows), then print the confusion matrix of the predictions and the
# macro-averaged F1 score as a quality metric.
from sklearn.model_selection import train_test_split
train, test = train_test_split(discrete_iris, test_size=.2, random_state=41)

# Columns 0-3 are the features, column 4 is the class label.
test_x = [[row[0], row[1], row[2], row[3]] for row in test]
test_y = [row[4] for row in test]

train_pd = pd.DataFrame(train, columns=['a', 'b', 'c', 'd', 'target'])

Y_pred = naive_bayes_gaussian(train_pd, X=test_x, Y="target")

from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(test_y, Y_pred))
print(f1_score(test_y, Y_pred, average='macro'))

[[9 0 0]
 [0 9 4]
 [0 0 8]]
0.8727272727272727


In [8]:
# Evaluation on the original (non-discretised) data: same 80/20 split,
# confusion matrix and macro-averaged F1 score.
from sklearn.model_selection import train_test_split
train, test = train_test_split(iris_values, test_size=.2, random_state=41)

# Separate each held-out row into its four feature values and its label.
test_x = []
test_y = []
for sample in test:
    test_x.append([sample[0], sample[1], sample[2], sample[3]])
    test_y.append(sample[4])

train_pd = pd.DataFrame(train, columns=['a', 'b', 'c', 'd', 'target'])

Y_pred = naive_bayes_gaussian(train_pd, X=test_x, Y="target")

from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(test_y, Y_pred))
print(f1_score(test_y, Y_pred, average='macro'))

[[ 9  0  0]
 [ 0 10  3]
 [ 0  0  8]]
0.9038901601830664


In [9]:
# Read the training data: the first line of pp_tra.dat is skipped (as before;
# presumably a header - confirm), every other line is parsed as integers.
# The context manager closes the file even if a value fails to parse.
with open('pp_tra.dat') as f:
    next(f, None)  # the default avoids StopIteration on an empty file
    train_set = [
        # split() with no argument tolerates repeated spaces, tabs and a
        # trailing space before the newline, which split(' ') did not
        [int(val) for val in line.split()]
        for line in f
        if line.strip()  # ignore blank lines
    ]

# Seeded, local generator: same row order on every run, and the global
# `random` state is left untouched.
random.Random(0).shuffle(train_set)

In [10]:
# Read the testing data: the first line of pp_tes.dat is skipped (as before;
# presumably a header - confirm), every other line is parsed as integers.
# The context manager closes the file even if a value fails to parse.
with open('pp_tes.dat') as f:
    next(f, None)  # the default avoids StopIteration on an empty file
    test_set = [
        # split() with no argument tolerates repeated spaces, tabs and a
        # trailing space before the newline, which split(' ') did not
        [int(val) for val in line.split()]
        for line in f
        if line.strip()  # ignore blank lines
    ]

# Seeded, local generator: same row order on every run, and the global
# `random` state is left untouched.
random.Random(0).shuffle(test_set)

In [11]:
# Run the same Gaussian naive Bayes classifier on the .dat data set.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt.

print(len(train_set))
print(len(test_set))

# The last value of each row is the class label; everything before it is a
# feature. (Previously the label was copied into test_x as well, and the row
# width was taken from test_set[1], which fails with fewer than two rows.)
test_x = [row[:-1] for row in test_set]
test_y = [row[-1] for row in test_set]

# Column names become '0', '1', ...; the final column is renamed to 'target'
# whatever its index is, instead of relying on it being exactly '192'.
train_pd = pd.DataFrame(train_set).rename(columns=str)
train_pd = train_pd.rename(columns={train_pd.columns[-1]: 'target'})

Y_pred = naive_bayes_gaussian(train_pd, X=test_x, Y="target")

from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(test_y, Y_pred))
print(f1_score(test_y, Y_pred, average='macro'))

6669
3332


KeyboardInterrupt: 