# Naive Bayes classification

In [12]:
from math import pi, sqrt, exp
from random import randrange, seed
from csv import reader

## Create dataset

In [13]:
dataset = [[3.393533211, 2.331273381, 0],
           [3.110073483, 1.781539638, 0],
           [1.343808831, 3.368360954, 0],
           [3.582294042, 4.67917911, 0],
           [2.280362439, 2.866990263, 0],
           [7.423436942, 4.696522875, 1],
           [5.745051997, 3.533989803, 1],
           [9.172168622, 2.511101045, 1],
           [7.792783481, 3.424088941, 1],
           [7.939820817, 0.791637231, 1]]


## Find mean, variance, length of each attributes (columns), except for the last column (ground truth value)

In [14]:
def mean(X):
    return sum(X)/ len(X)

def stdev(X):
	mu = mean(X)
	return  sum((x - mu)**2 for x in X) / (len(X) - 1)

In [15]:
d = [[1, 3, 4],
     [1, 2, 3],
     [2, 3, 4],
     [1, 2, 3]]
print(*d)
for column in zip(*d):
    print(column)

[1, 3, 4] [1, 2, 3] [2, 3, 4] [1, 2, 3]
(1, 1, 2, 1)
(3, 2, 3, 2)
(4, 3, 4, 3)


In [16]:
def summarize_data(dataset):
	# except the last column is ground truth
	rm_last_col = [row[:-1] for row in dataset]
	return [(mean(col), stdev(col), len(col)) for col in zip(*rm_last_col)]
summarize_data(dataset)

[(5.178333386499999, 7.653989826170761, 10),
 (2.9984683241, 1.4848795625703213, 10)]

In [17]:
def separated_by_class(dataset):
	"""
	Separate the data set into classes
	Assume the final colulmn is ground truth of the class
	"""
	classes = dict()
	for row in dataset:
		if row[-1] not in classes:
			classes[row[-1]] = list()
		classes[row[-1]].append(row)
	return classes
separated_by_class(dataset)

{0: [[3.393533211, 2.331273381, 0],
  [3.110073483, 1.781539638, 0],
  [1.343808831, 3.368360954, 0],
  [3.582294042, 4.67917911, 0],
  [2.280362439, 2.866990263, 0]],
 1: [[7.423436942, 4.696522875, 1],
  [5.745051997, 3.533989803, 1],
  [9.172168622, 2.511101045, 1],
  [7.792783481, 3.424088941, 1],
  [7.939820817, 0.791637231, 1]]}

In [18]:
def summarize_by_class(dataset):
	separated = separated_by_class(dataset)
	sumaries = dict()
	for class_val, rows_class_val in separated.items(): # items() means (keys, values)
		sumaries[class_val] = summarize_data(rows_class_val)
	return sumaries
print(summarize_by_class(dataset))

{0: [(2.7420144012, 0.8585288681757653, 5), (3.0054686692, 1.2261788197598094, 5)], 1: [(7.6146523718, 1.5238227453753934, 5), (2.9914679790000003, 2.1146776839446155, 5)]}


## Gaussian Probability

In [19]:
def gauss(x, mean, stdev):
    assert stdev !=0
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return 1 / (sqrt(2 * pi) * stdev ) * exponent


## Class Probabilities

In [20]:
def MAP(dataset, input_row):
    """
    Maximum a Posteriori (MAP) 
    h = argmax_h P(h|D) = argmax_h P(D|h) * P(h)
    P(D|h) = P(D1|h) * P(D2|h) * ... * P(Dn|h)
    
    where, 
        D: real data
        h: a hypothesis, in classification, a class
        P(D|h) : Likelihood distribution of Data given a specific hypothesis, in classification,
            a specific probability distribution and its parameters
        P(h): Prior distribution
        P(h|D): Most likely hypothesis given data, in classification, Most likely class given data
    """
    total_rows = len(dataset)
    summaries = summarize_by_class(dataset)
    prob = dict()
    max_prob, retClass = 0.0, None
    for class_val, class_summaries in summaries.items():
        prob[class_val] = class_summaries[0][2] / float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            prob[class_val] *= gauss(input_row[i], mean, stdev)
        if max_prob < prob[class_val]:
            retClass = class_val
            max_prob = prob[class_val]
    return retClass, prob

# Naive Bayes Algorithm
def naive_bayes(train_set, test_set):
    preidictions = list()
    for row in test_set:
        preidictions.append(MAP(train_set,row)[0])
        # print(MAP(train_set,row))
    return preidictions

## Work with Real Data

In [21]:
# Load a CSV file
def load_csv(filename):
    file = open(filename, "rt")
    lines = reader(file)
    dataset = list(lines)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])  # concatenate lists of lists to a list
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            # test_set use to predict => no need to hold [class] data
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [22]:
# Make a prediction with Naive Bayes on Iris Dataset
filename = 'data/BankNote_Authentication.csv'
dataset = load_csv(filename)

# remove the string [attributes]
dataset.pop(0)
for i in range(len(dataset[0])-1): # except for the last column
	str_column_to_float(dataset, i)

# fit model
model = summarize_by_class(dataset)

# predict the label
n_folds = 5

scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

Scores: [59.48905109489051, 62.40875912408759, 63.503649635036496, 75.91240875912408, 60.58394160583942]
Mean Accuracy: 64.380%


## Conclusion

This model cannot work with huge attributes, i.e `data/sonar.csv` due to the multiplication increase when # of attributes increase. The probability of each class rapidly converges to zero and we cannot use it to the the Maximum a Posteriori (MAP).

I tried to print the probability of these classes:

2.9139230184726384e-13

1.0403767988304062e-181

0.0

0.0

...

## Refs
https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/