# Naive Bayes classification

In [1]:
from math import pi, sqrt, exp
from random import randrange, seed
from csv import reader

In [2]:
dataset = [[3.393533211,2.331273381,0],
	[3.110073483,1.781539638,0],
	[1.343808831,3.368360954,0],
	[3.582294042,4.67917911,0],
	[2.280362439,2.866990263,0],
	[7.423436942,4.696522875,1],
	[5.745051997,3.533989803,1],
	[9.172168622,2.511101045,1],
	[7.792783481,3.424088941,1],
	[7.939820817,0.791637231,1]]

In [3]:
def mean(X):
    return sum(X)/ len(X)

def stdev(X):
	mu = mean(X)
	return  sum((x - mu)**2 for x in X) / (len(X) - 1)

In [4]:
d = [[1, 3, 4],
     [1, 2, 3],
     [2, 3, 4],
     [1, 2, 3]]
d = [x[:-1]for x in d]
print(*d)
for column in zip(*d):
    print(column)


[1, 3] [1, 2] [2, 3] [1, 2]
(1, 1, 2, 1)
(3, 2, 3, 2)


In [5]:
def summarize_data(dataset):
	rm_last_col = [row[:-1] for row in dataset]
	return [(mean(col), stdev(col), len(col)) for col in zip(*rm_last_col)]
summarize_data(dataset)

[(5.178333386499999, 7.653989826170761, 10),
 (2.9984683241, 1.4848795625703213, 10)]

In [6]:
def separated_by_class(dataset):
	"""
	Separate the data set into classes
	Assume the final colulmn is ground truth of the class
	"""
	classes = dict()
	for row in dataset:
		if row[-1] not in classes:
			classes[row[-1]] = list()
		classes[row[-1]].append(row)
	return classes

In [7]:
for classes, rows in separated_by_class(dataset).items():
	print(classes, rows)
separated_by_class(dataset).items()

def summarize_by_class(dataset):
	separated = separated_by_class(dataset)
	sumaries = dict()
	for class_val, rows_class_val in separated.items():
		sumaries[class_val] = summarize_data(rows_class_val)
	return sumaries
print(summarize_by_class(dataset))

0 [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0], [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0], [2.280362439, 2.866990263, 0]]
1 [[7.423436942, 4.696522875, 1], [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1], [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]
{0: [(2.7420144012, 0.8585288681757653, 5), (3.0054686692, 1.2261788197598094, 5)], 1: [(7.6146523718, 1.5238227453753934, 5), (2.9914679790000003, 2.1146776839446155, 5)]}


## Gaussian Probability

In [8]:
def gauss(x, mean, stdev):
    if stdev == 0:
        return 0
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return 1 / (sqrt(2 * pi) * stdev ) * exponent


## Class Probabilities

In [12]:
# summaries = summarize_by_class(dataset[:][:-2])
# print([summaries[label][0][2] for label in summaries])
# for class_val, sumary_class_val in summaries.items():
#     print(class_val, sumary_class_val, '\n')


def MAP(dataset, input_row):
    """
    Maximum a Posteriori (MAP) 
    h = argmax_h P(h|D) = argmax_h P(D|h) * P(h)
    P(D|h) = P(D1|h) * P(D2|h) * ... * P(Dn|h)
    
    where, 
        D: real data
        h: a hypothesis, in classification, a class
        P(D|h) : Likelihood distribution of Data given a specific hypothesis, in classification,
            a specific probability distribution and its parameters
        P(h): Prior distribution
        P(h|D): Most likely hypothesis given data, in classification, Most likely class given data
    """
    total_rows = len(dataset)
    summaries = summarize_by_class(dataset)
    prob = dict()
    max_prob, retClass = 0.0, None
    for class_val, class_summaries in summaries.items():
        print(class_val)
        prob[class_val] = class_summaries[0][2] / float(total_rows)
        # except the last column is ground truth
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            prob[class_val] *= gauss(input_row[i], mean, stdev)
        if max_prob < prob[class_val]:
            retClass = class_val
            max_prob = prob[class_val]
    return retClass, prob


# MAP(dataset, [1, 2, 3])

# Naive Bayes Algorithm

def naive_bayes(train_set, test_set):
    preidictions = list()
    for row in test_set:
        preidictions.append(MAP(train_set,row))
    return preidictions

# naive_bayes(dataset[:7], dataset[7:])

## Work with Real Data

In [10]:
# Load a CSV file
def load_csv(filename):
    file = open(filename, "rt")
    lines = reader(file)
    dataset = list(lines)
    return dataset

# Convert string column to float
def str_column_to_float(dataset, column):
    for row in dataset:
        row[column] = float(row[column])

# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    for i in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for fold in folds:
        train_set = list(folds)
        train_set.remove(fold)
        train_set = sum(train_set, [])  # concatenate lists of lists to a list
        test_set = list()
        for row in fold:
            row_copy = list(row)
            test_set.append(row_copy)
            # test_set use to predict => no need to hold [class] data
            row_copy[-1] = None
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

In [11]:
# Make a prediction with Naive Bayes on Iris Dataset
filename = 'data/sonar.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
	str_column_to_float(dataset, i)
# convert class column to integers
# str_column_to_int(dataset, len(dataset[0])-1)
# fit model
model = summarize_by_class(dataset)
# define a new record
row = [5.7,2.9,4.2,1.3]
# predict the label
label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

NameError: name 'predict' is not defined