In [48]:
def separate_by_class(dataset):
    """Group the rows of ``dataset`` by class label.

    The class label of a row is its last element.

    Args:
        dataset: Sequence of rows (each row an indexable sequence).

    Returns:
        dict mapping each class label to the list of rows carrying that
        label, in their original order. The rows are the same objects as
        in ``dataset``, not copies.
    """
    groups = {}
    for record in dataset:
        # setdefault creates the bucket the first time a label is seen.
        groups.setdefault(record[-1], []).append(record)
    return groups

In [49]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [50]:
from math import sqrt
from math import pi
from math import exp

def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers) - 1``, so a single-element input raises
    ``ZeroDivisionError``.
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

In [51]:
def summarize_dataset(dataset):
    """Compute ``(mean, stdev, count)`` for each feature column.

    Statistics are computed for every column of ``dataset`` and the entry
    for the last column (the class label) is then discarded.

    Args:
        dataset: Sequence of equal-length rows whose last element is the
            class label.

    Returns:
        list of ``(mean, stdev, count)`` tuples, one per feature column.
    """
    stats = []
    for column in zip(*dataset):  # zip(*rows) transposes rows into columns
        stats.append((mean(column), stdev(column), len(column)))
    stats.pop()  # drop the summary of the label column
    return stats

In [52]:
def summarize_by_class(dataset):
    """Compute per-class column statistics for ``dataset``.

    The rows are first grouped by class label (last element of each row),
    then each group is summarized independently.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        dict mapping each class label to the list of
        ``(mean, stdev, count)`` tuples produced by ``summarize_dataset``
        for the rows of that class.
    """
    # Use the grouping computed from *this* dataset. The previous code
    # stored it in `separate` but iterated a notebook-level `separated`
    # variable, so it depended on another cell having been run first.
    separated = separate_by_class(dataset)
    return {
        class_value: summarize_dataset(rows)
        for class_value, rows in separated.items()
    }

In [53]:
# Toy dataset: each row is [feature_1, feature_2, class_label].
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

# Group the rows by their class label and show the resulting mapping.
separated = separate_by_class(dataset)
print(separated)
#for label in separated:
#    print(label)
#    for row in separated[label]:
#        print(row)

{0: [[3.393533211, 2.331273381, 0], [3.110073483, 1.781539638, 0], [1.343808831, 3.368360954, 0], [3.582294042, 4.67917911, 0], [2.280362439, 2.866990263, 0]], 1: [[7.423436942, 4.696522875, 1], [5.745051997, 3.533989803, 1], [9.172168622, 2.511101045, 1], [7.792783481, 3.424088941, 1], [7.939820817, 0.791637231, 1]]}


In [54]:
# Toy dataset: each row is [feature_1, feature_2, class_label].
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

# (mean, stdev, count) of each feature column over the whole dataset.
summary = summarize_dataset(dataset)
print(summary)

[(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]


In [55]:
# Toy dataset: each row is [feature_1, feature_2, class_label].
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

# Per-class (mean, stdev, count) of each feature column.
summary = summarize_by_class(dataset)
print(summary)
#for label in summary:
#    print(label)
#    for row in summary[label]:
#        print(row)

{0: [(2.7420144012, 0.9265683289298018, 5), (3.0054686692, 1.1073295894898725, 5)], 1: [(7.6146523718, 1.2344321550313704, 5), (2.9914679790000003, 1.4541931384601618, 5)]}


In [56]:
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; a value of zero
            raises ``ZeroDivisionError``.

    Returns:
        The density value as a float.
    """
    variance = stdev ** 2
    exponent = exp(-((mean - x) ** 2) / (2 * variance))
    coefficient = 1 / (sqrt(2 * pi) * stdev)
    return coefficient * exponent

In [57]:
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described in ``summaries``.

    For each class the score starts at the class prior (rows in the class
    divided by the total number of rows) and is multiplied by the Gaussian
    density of each feature value under that class's (mean, stdev). The
    scores are not normalized to sum to one.

    Args:
        summaries: dict mapping class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Sequence whose first ``len(summaries[label])`` elements are
            the feature values.

    Returns:
        dict mapping each class label to its score.
    """
    # The row count of a class is stored with every feature summary;
    # the first feature's entry is used.
    total_rows = sum(stats[0][2] for stats in summaries.values())
    scores = {}
    for label, stats in summaries.items():
        score = stats[0][2] / float(total_rows)
        for index, (mu, sigma, _count) in enumerate(stats):
            score *= calculate_probability(row[index], mu, sigma)
        scores[label] = score
    return scores

In [58]:
# Toy dataset: each row is [feature_1, feature_2, class_label].
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

# Fit the per-class statistics, then score the first row against each class.
summaries = summarize_by_class(dataset)
probabilities = calculate_class_probabilities(summaries, dataset[0])
print(probabilities)

{0: 0.05032427673372076, 1: 0.00011557718379945765}


In [59]:
pwd

'C:\\Users\\Vishwajeet Bhoir\\Desktop\\scratch'

# Iris Data Set

In [73]:
from csv import reader
from math import exp
from math import pi
from math import sqrt
from random import randrange
from random import seed

def load_csv(filename):
    """Read a CSV file into a list of rows, skipping blank lines.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        list of rows; each row is the list of raw string fields produced
        by ``csv.reader``.
    """
    # newline="" hands newline handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(filename, "r", newline="") as file:
        return [row for row in reader(file) if row]

def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Surrounding whitespace is stripped before conversion.

    Args:
        dataset: List of mutable rows.
        column: Index of the column to convert.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())

def str_column_to_int(dataset, column):
    """Replace the values of one column by integer codes, in place.

    Codes are assigned in sorted order of the distinct values, so the
    mapping is the same on every run. Iterating a bare ``set`` of strings
    is not, because string hashing is randomized per interpreter process.

    Args:
        dataset: List of mutable rows.
        column: Index of the column to encode.

    Returns:
        dict mapping each original value to its integer code.
    """
    class_values = [row[column] for row in dataset]
    lookup = {
        value: code for code, value in enumerate(sorted(set(class_values)))
    }
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` equally sized folds.

    Rows are drawn without replacement from a shallow copy, so ``dataset``
    itself is not modified. Each fold receives ``len(dataset) // n_folds``
    rows; when the size is not evenly divisible, the leftover rows are not
    assigned to any fold.

    Args:
        dataset: Sequence of rows.
        n_folds: Number of folds to create.

    Returns:
        list of ``n_folds`` lists of rows.
    """
    dataset_split = list()
    dataset_copy = list(dataset)
    # Integer fold size: a fractional size made early folds take one row
    # too many, emptying the pool and making randrange(0) raise ValueError.
    fold_size = len(dataset) // n_folds
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two sequences agree.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in parallel with
            ``actual``.

    Returns:
        Accuracy as a float between 0.0 and 100.0.

    Raises:
        ZeroDivisionError: if ``actual`` is empty.
    """
    hits = sum(1 for i in range(len(actual)) if actual[i] == predicted[i])
    return hits / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm, n_folds):
    """Evaluate ``algorithm`` with k-fold cross-validation.

    Each fold is held out once as the test set while the remaining folds
    are concatenated into the training set.

    Args:
        dataset: Sequence of rows whose last element is the class label.
        algorithm: Callable ``algorithm(train_set, test_set)`` returning
            one prediction per test row.
        n_folds: Number of cross-validation folds.

    Returns:
        list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for held_out, fold in enumerate(folds):
        # Select the training folds by position rather than list.remove(),
        # which compares by equality, and flatten them in a single pass.
        train_set = [
            row
            for position, other in enumerate(folds)
            if position != held_out
            for row in other
        ]
        test_set = list()
        for row in fold:
            row_copy = list(row)
            # Hide the true label from the algorithm. The previous
            # `row_copy = -1` only rebound the local name and left the
            # label in the test row.
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set)
        actual = [row[-1] for row in fold]
        scores.append(accuracy_metric(actual, predicted))
    return scores

def separate_by_class(dataset):
    """Split ``dataset`` into per-class lists of rows.

    The class of a row is its last element.

    Args:
        dataset: Sequence of rows (each row an indexable sequence).

    Returns:
        dict mapping class label to the list of rows with that label, in
        input order. Rows are shared with ``dataset``, not copied.
    """
    by_label = dict()
    for row in dataset:
        label = row[-1]
        if label in by_label:
            by_label[label].append(row)
        else:
            by_label[label] = [row]
    return by_label

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    Raises:
        ZeroDivisionError: if ``numbers`` is empty.
    """
    total = sum(numbers)
    return total / float(len(numbers))

def std_dev(numbers):
    """Return the population standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by
    ``len(numbers)`` (not ``len(numbers) - 1``).
    """
    center = mean(numbers)
    squared_deviations = [(value - center) ** 2 for value in numbers]
    return sqrt(sum(squared_deviations) / float(len(numbers)))

def summarize_dataset(dataset):
    """Compute ``(mean, std_dev, count)`` for each feature column.

    Every column of ``dataset`` is summarized, then the entry for the last
    column (the class label) is removed.

    Args:
        dataset: Sequence of equal-length rows whose last element is the
            class label.

    Returns:
        list of ``(mean, std_dev, count)`` tuples, one per feature column.
    """
    columns = zip(*dataset)  # transpose rows into columns
    stats = [(mean(values), std_dev(values), len(values)) for values in columns]
    stats.pop(-1)  # discard the label column's summary
    return stats

def summarize_by_class(dataset):
    """Compute per-class column statistics for ``dataset``.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        dict mapping each class label to the ``summarize_dataset`` result
        for the rows of that class.
    """
    grouped = separate_by_class(dataset)
    return {
        label: summarize_dataset(members)
        for label, members in grouped.items()
    }

def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; a value of zero
            raises ``ZeroDivisionError``.

    Returns:
        The density value as a float.
    """
    exponent = exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent


# The function was originally defined under a misspelled name while
# calculate_class_probabilities calls `calculate_probability`; keep the old
# spelling as an alias so any existing reference still resolves.
calculate_probabilty = calculate_probability

def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class described in ``summaries``.

    Each class score is the class prior (rows in the class divided by the
    total number of rows) multiplied by the Gaussian density of every
    feature value under that class's (mean, stdev). The scores are not
    normalized.

    Args:
        summaries: dict mapping class label to a list of
            ``(mean, stdev, count)`` tuples, one per feature.
        row: Sequence whose leading elements are the feature values.

    Returns:
        dict mapping each class label to its score.
    """
    # Every feature summary of a class carries that class's row count;
    # read it from the first feature.
    class_sizes = {label: stats[0][2] for label, stats in summaries.items()}
    total_rows = sum(class_sizes.values())
    scores = dict()
    for label, stats in summaries.items():
        score = class_sizes[label] / float(total_rows)
        for position in range(len(stats)):
            mu, sigma, _ = stats[position]
            score *= calculate_probability(row[position], mu, sigma)
        scores[label] = score
    return scores
    
def predict(summaries, row):
    """Return the class label with the highest score for ``row``.

    Args:
        summaries: Per-class statistics as accepted by
            ``calculate_class_probabilities``.
        row: Sequence of feature values to classify.

    Returns:
        The label whose score is largest; on ties the label encountered
        first wins. ``None`` if ``summaries`` contains no classes.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner, top_score = None, -1
    for label in scores:
        score = scores[label]
        # Skip candidates that do not strictly beat the current leader.
        if winner is not None and not (score > top_score):
            continue
        winner, top_score = label, score
    return winner

def naive_bayes(train, test):
    """Fit per-class statistics on ``train`` and classify every row of ``test``.

    Args:
        train: Training rows whose last element is the class label.
        test: Rows to classify.

    Returns:
        list of predicted labels, one per row of ``test``.
    """
    model = summarize_by_class(train)
    return [predict(model, sample) for sample in test]

# Seed the random number generator so the random fold assignment, and
# therefore the reported scores, are reproducible across runs.
seed(1)

# Load the data; every column except the last holds a numeric feature.
filename = 'iris.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset,i)

# The last column is the class name; replace it with integer codes.
str_column_to_int(dataset,len(dataset[0])-1)

# Evaluate Naive Bayes with k-fold cross-validation.
n_folds = 5
scores = evaluate_algorithm(dataset,naive_bayes,n_folds)
print('Scores = %s'%scores)
print('Mean Accuracy = %.3f'%(sum(scores)/float(len(scores))))
    
        
        

Scores = [86.66666666666667, 93.33333333333333, 100.0, 96.66666666666667, 96.66666666666667]
Mean Accuracy = 94.667


In [44]:
pwd


'C:\\Users\\Vishwajeet Bhoir\\Desktop\\scratch'