## Supplement 4: Classification

In [18]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB

### 4.1 Programming Task: Gaussian Naive-Bayes Classifier
The Iris dataset, containing measurements of the flower parts obtained from 3 different species of the Iris plant, is provided in the file __iris.csv__. The first four columns of the dataset contain the measurement values representing input features for the model and the last column contains class labels of the plant species: Iris-setosa, Iris-versicolor, and Iris-virginica.
The goal of this task is to implement a Gaussian Naive-Bayes classifier for the Iris dataset.

i\. What are the assumptions on the dataset required for the Gaussian Naive-Bayes model?  
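    - Within each class, every feature follows a Gaussian distribution; the features are conditionally independent given the class (the "naive" assumption); and the samples are i.i.d.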

ii\. Split the dataset into train and test sets using an 80:20 ratio.


In [4]:
# Fixed seed so the shuffle, and therefore the train/test split, is the same
# on every Restart & Run All instead of changing from run to run.
SEED = 42
# Fraction of the rows that goes into the train set (80:20 split)
TRAIN_FRACTION = 0.8

dataset = pd.read_csv('iris.csv')
# Class labels present in this dataset, in order of first appearance in the file
class_labels = list(dataset['Species'].unique())
# Every column except the last one is treated as an input feature
# (the last column is assumed to hold the class label)
input_features = list(dataset.columns[:-1])
# Shuffle the rows reproducibly and renumber them 0..n-1
dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)
# Split the dataset by position: leading rows for training, the rest for testing
trainset_size = int(len(dataset) * TRAIN_FRACTION)
trainset = dataset.iloc[:trainset_size]
testset = dataset.iloc[trainset_size:]
# Sanity check: the two parts together must cover the dataset exactly once
assert len(trainset) + len(testset) == len(dataset)
print('Train set size:', len(trainset))
print('Test set size:', len(testset))

Train set size: 120
Test set size: 30


iii\. Estimate the parameters of the Gaussian Naive-Bayes classifier using the train set.


In [6]:
def gaussian_probability_function(x, mean, std):
    """Evaluate the univariate Gaussian density with the given mean and std at x.

    Only arithmetic operators and NumPy ufuncs are applied to the arguments.

    Args:
        x: Point(s) at which the density is evaluated.
        mean: Mean of the Gaussian.
        std: Standard deviation of the Gaussian; it is not checked for zero.

    Returns:
        exp(-0.5 * ((x - mean) / std)**2) / (std * sqrt(2 * pi)).
    """
    standardized = (x - mean) / std
    exponent = -0.5 * standardized**2
    normalizer = 1 / (std * (np.sqrt(2*np.pi)))
    return normalizer * np.exp(exponent)

def get_posterior(test_sample, class_name):
    """Score test_sample against one class with Gaussian Naive Bayes.

    The returned value is the class prior multiplied by the per-feature
    Gaussian densities, i.e. the posterior up to the normalising constant
    p(x), which is not divided out. The prior, means and stds are
    re-estimated from the module-level `trainset` on every call.

    Args:
        test_sample: Object indexable by the four measurement column names
            (the notebook passes the whole test DataFrame).
        class_name: Value of the 'Species' column that selects the class.

    Returns:
        Product of the four per-feature densities and the class prior.
    """
    feature_names = ('SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm')
    # Read the measurements to be scored from the test sample
    observed = {name: test_sample[name] for name in feature_names}
    # Training rows that belong to the requested class
    class_rows = trainset[trainset['Species'] == class_name]
    # Prior p(class): fraction of training rows carrying this label
    prior = len(class_rows) / len(trainset)
    # Per-feature mean and std within the class
    # (pandas `.std()` defaults to the sample estimate, ddof=1)
    class_means = class_rows[input_features].mean()
    class_stds = class_rows[input_features].std()
    # Naive independence assumption: p(x | class) is the product of the
    # per-feature Gaussian densities p(feature | class)
    likelihood = None
    for name in feature_names:
        density = gaussian_probability_function(observed[name],
                                                class_means[name], class_stds[name])
        likelihood = density if likelihood is None else likelihood * density
    return likelihood * prior

iv\. Using the learned parameters, predict the classes for the samples in the test set.


In [16]:
# Score every test row against each class. Each column is keyed by the very
# label that was passed to get_posterior, so a column name can never end up
# attached to another class's scores, whatever order class_labels is in.
posterior = pd.DataFrame(
    {label: get_posterior(testset, class_name=label) for label in class_labels}
)
# For each test row, predict the class whose score is largest
predicted_labels = posterior.idxmax(axis=1)

What is the accuracy of the model on the test set?

In [21]:
# Compare the hand-written classifier's predictions with the true species
ground_truth = testset['Species']
is_correct = predicted_labels == ground_truth
correct_predictions = is_correct.sum()
print(f'{correct_predictions} out of {len(testset)} correct prediction')
accuracy = correct_predictions/len(testset)
print('Accuracy:', accuracy)
# Reference: scikit-learn's GaussianNB fitted and scored on the same split
X_train, y_train = trainset[input_features], trainset['Species']
X_test, y_test = testset[input_features], testset['Species']
model = GaussianNB()
model.fit(X_train, y_train)
print('Accuracy sklearn GaussianNB:', model.score(X_test, y_test))

29 out of 30 correct prediction
Accuracy: 0.9666666666666667
Accuracy sklearn GaussianNB: 0.9666666666666667
