In [17]:
import numpy as np
import pandas as pd
from csv import reader
from random import seed
from random import randrange
import PriorUtils as pu
import CorrectnessMetricUtils as cmu
import ErrorMetricsUtils as emu

# Get CSV file
def get_csv(filename):
    """Read a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty record. Each entry is the list
        of string fields produced by ``csv.reader``; blank lines are skipped.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires. Without it, universal-newline translation
    # rewrites "\r\n" sequences that appear inside quoted fields.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

# String to float columnwise
def str_to_float_col(dataset, col):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: Iterable of mutable rows; each ``row[col]`` must provide
            ``strip()`` and hold text that ``float`` can parse.
        col: Index of the column to convert.

    Returns:
        None. The rows of ``dataset`` are modified directly.
    """
    for record in dataset:
        text = record[col]
        record[col] = float(text.strip())
        
# Split dataset into n folds
def crossval_split(dataset, n_folds):
    """Randomly partition the rows of a dataset into equally sized folds.

    Rows are drawn without replacement via ``random.randrange``, so the
    result depends on the state of the global ``random`` generator.

    Args:
        dataset: Sequence of rows.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists. Each holds
        ``int(len(dataset) / n_folds)`` rows, so any remainder rows are
        left out of every fold.
    """
    pool = list(dataset)  # shallow copy: the caller's list keeps its rows
    rows_per_fold = int(len(dataset) / n_folds)
    folds = list()
    for _ in range(n_folds):
        # One randrange call per drawn row, each against the shrinking pool.
        folds.append(
            [pool.pop(randrange(len(pool))) for _ in range(rows_per_fold)]
        )
    return folds

# Divide dataset by class
def class_divider(dataset):
    """Group the rows of a dataset by class label.

    The label is taken from the last element of each row.

    Args:
        dataset: Sequence of rows.

    Returns:
        A dict mapping each label to the list of rows carrying it. The rows
        are the original objects, kept in the order they were encountered.
    """
    groups = dict()
    for row in dataset:
        groups.setdefault(row[-1], list()).append(row)
    return groups

# Mean, standard deviation and count, column-wise
def dataset_info(dataset):
    """Summarise every feature column of a dataset.

    The last element of each row is treated as the class label and is
    excluded from the summary.

    Args:
        dataset: Sequence of rows of numeric features followed by a label.

    Returns:
        A list with one ``(mean, std, count)`` tuple per feature column,
        where ``std`` is ``np.std`` with its default arguments. An empty
        dataset yields an empty list.
    """
    # Drop the label column before computing statistics, so labels need not
    # be numeric. Slicing also lets an empty dataset return [] instead of
    # raising.
    feature_columns = list(zip(*dataset))[:-1]
    return [(np.mean(col), np.std(col), len(col)) for col in feature_columns]

# Classwise column stats
def class_info(dataset):
    """Compute per-class column statistics.

    Args:
        dataset: Sequence of rows whose last element is the class label.

    Returns:
        A dict mapping each class label to the result of ``dataset_info``
        applied to the rows of that class.
    """
    return {
        label: dataset_info(members)
        for label, members in class_divider(dataset).items()
    }

# Calculate probabilities of predicting each class for given row
def calc_class_probs(info, row, prior):
    """Score a row against every class.

    Each class starts from its share of the total row count and is then
    multiplied by ``prior(value, mean, std)`` for every summarised column.

    Args:
        info: Dict mapping class label to a list of ``(mean, std, count)``
            tuples, one per feature column.
        row: Sequence of feature values, indexed in step with those tuples.
        prior: Callable invoked as ``prior(value, mean, std)``.

    Returns:
        A dict mapping each class label to its (unnormalised) score.
    """
    # The count stored with a class's first column is used as its row count.
    class_sizes = {label: stats[0][2] for label, stats in info.items()}
    total_rows = sum(class_sizes.values())
    probs = dict()
    for label, stats in info.items():
        score = class_sizes[label] / float(total_rows)
        for i, (mean, std, _) in enumerate(stats):
            score *= prior(row[i], mean, std)
        probs[label] = score
    return probs

# Predict class type for given row
def predict(info, row, prior):
    """Pick the class with the highest score for a row.

    Args:
        info: Per-class column statistics, as passed to ``calc_class_probs``.
        row: Sequence of feature values.
        prior: Callable forwarded to ``calc_class_probs``.

    Returns:
        The label whose score is largest; on a tie the label seen first
        wins. Returns None when there are no classes.
    """
    scores = calc_class_probs(info, row, prior)
    winner = None
    winning_score = -1
    for label in scores:
        score = scores[label]
        # Skip candidates that do not strictly beat the current winner.
        if winner is not None and not score > winning_score:
            continue
        winner, winning_score = label, score
    return winner

# Algo evaluation by cross validation split
def eval_algo(dataset, algo, n_folds, obs_label, *args):
    """Cross-validate a classifier and collect per-fold accuracies.

    Args:
        dataset: Sequence of rows whose last element is the class label.
        algo: Callable invoked as ``algo(train_rows, rows_to_predict, *args)``
            that returns one prediction per row to predict.
        n_folds: Number of folds passed to ``crossval_split``.
        obs_label: Accepted but not used by this function.
        *args: Extra positional arguments forwarded to ``algo``.

    Returns:
        A ``(TestScores, TrainScores)`` tuple of lists with one entry per
        fold, each produced by ``cmu.accuracy_calc(actual, predicted)``.
    """
    folds = crossval_split(dataset, n_folds)
    TestScores, TrainScores = list(), list()
    for held_out in folds:
        # Training data: every fold except the held-out one, flattened.
        remaining = list(folds)
        remaining.remove(held_out)
        train_set = [row for part in remaining for row in part]

        # Test rows are copies with the label blanked out.
        test_set = list()
        for row in held_out:
            blind = list(row)
            blind[-1] = None
            test_set.append(blind)

        test_pred = algo(train_set, test_set, *args)
        train_pred = algo(train_set, train_set, *args)

        test_actual = [row[-1] for row in held_out]
        train_actual = [row[-1] for row in train_set]
        test_accuracy = cmu.accuracy_calc(test_actual, test_pred)
        train_accuracy = cmu.accuracy_calc(train_actual, train_pred)
        TestScores.append(test_accuracy)
        TrainScores.append(train_accuracy)
    return TestScores, TrainScores

# Naive Bayes Algorithm simple gaussian
def naive_bayes_Gaussian(train, test):
    """Fit on ``train`` and predict a class for each row of ``test``.

    Per-class column statistics come from ``class_info``; each row is then
    scored by ``predict`` using ``pu.Gaussian``.

    Args:
        train: Rows used to build the per-class statistics.
        test: Rows to classify.

    Returns:
        A list with one predicted label per row of ``test``.
    """
    model = class_info(train)
    return [predict(model, row, pu.Gaussian) for row in test]

# Judgement: load the data set and convert every column to float
seed(1)  # fix the random generator so the fold assignment is repeatable
filename = 'bc.csv'
dataset = get_csv(filename)
del dataset[0]  # drop the first row (presumably the CSV header - verify)
for i in range(len(dataset[0])):
    str_to_float_col(dataset, i)

In [18]:
# evaluate naive bayes (gaussian) algorithm
n_folds = 5
TestScores, TrainScores = eval_algo(dataset, naive_bayes_Gaussian, n_folds, 1)
# Report held-out and training accuracy side by side; comparing the two
# shows how far the model's fit to the training folds carries over.
print('Test Accuracies: %s' % TestScores)
print('Mean Test Accuracy: %s' % (sum(TestScores)/float(len(TestScores))))
print('Train Accuracies: %s' % TrainScores)
print('Mean Train Accuracy: %s' % (sum(TrainScores)/float(len(TrainScores))))

Ascores: [0.8928571428571429, 0.8071428571428572, 0.8714285714285714, 0.8571428571428571, 0.85]
Mean Accuracy: 0.8557142857142856
