In [1]:
import numpy as np
import pandas as pd
from csv import reader
from random import seed
from random import randrange
import PriorUtils as pu
import CorrectnessMetricUtils as cmu
import ErrorMetricsUtils as emu

# Get CSV file
def get_csv(filename):
    """Read a CSV file into a list of rows, skipping empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of string fields per non-empty row, in file
        order. Fields are returned as parsed by ``csv.reader``; no numeric
        conversion or whitespace stripping is done here.
    """
    # newline='' hands line-ending handling to the csv module, so line
    # breaks embedded inside quoted fields come back unchanged instead of
    # being rewritten by universal-newline translation.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

# String to float columnwise
def str_to_float_col(dataset, col):
    """Convert one column of every row from text to float, in place.

    Args:
        dataset: Iterable of mutable rows whose ``col`` entry is a string.
        col: Index of the column to convert.

    Surrounding whitespace is stripped before parsing. ``float`` raises
    ``ValueError`` for text that is not a number. Nothing is returned.
    """
    for record in dataset:
        text = record[col]
        record[col] = float(text.strip())
        
# Split dataset into n folds
def crossval_split(dataset, n_folds):
    """Randomly partition the rows into ``n_folds`` equally sized folds.

    Args:
        dataset: Sequence of rows; the sequence itself is not modified.
        n_folds: Number of folds to build.

    Returns:
        A list of ``n_folds`` lists, each holding
        ``int(len(dataset) / n_folds)`` rows drawn without replacement via
        ``randrange``. Rows left over when the size does not divide evenly
        are not placed in any fold.
    """
    remaining = list(dataset)  # draw from a copy so the caller's list survives
    per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Pop per_fold rows from random positions of the shrinking pool.
        picked = [
            remaining.pop(randrange(len(remaining)))
            for _ in range(per_fold)
        ]
        folds.append(picked)
    return folds

# Divide dataset by class
def class_divider(dataset):
    """Group rows by class label, where the label is each row's last element.

    Args:
        dataset: Sequence of rows.

    Returns:
        A dict mapping each label to the list of rows carrying it. The rows
        are the same objects as in ``dataset`` and keep their original
        relative order; labels appear in order of first occurrence.
    """
    by_label = {}
    for record in dataset:
        by_label.setdefault(record[-1], []).append(record)
    return by_label

# Mean, standard deviation and count columnwise
def dataset_info(dataset):
    """Compute per-feature summary statistics, excluding the label column.

    Args:
        dataset: Rows of equal length whose last element is the class label
            and whose other elements are numeric.

    Returns:
        One ``(mean, std, count)`` tuple per feature column, in column
        order. ``std`` is ``np.std`` with its default settings (population
        standard deviation). An empty dataset yields an empty list.
    """
    # Slice the label column off before computing anything: statistics of
    # the labels are never used, and computing them fails for non-numeric
    # labels. Slicing also keeps an empty dataset from raising.
    feature_cols = list(zip(*dataset))[:-1]
    return [(np.mean(col), np.std(col), len(col)) for col in feature_cols]

# Classwise column stats
def class_info(dataset):
    """Compute feature statistics separately for every class in the dataset.

    Args:
        dataset: Rows whose last element is the class label.

    Returns:
        A dict mapping each label (as grouped by ``class_divider``) to the
        list returned by ``dataset_info`` for the rows of that class.
    """
    return {
        label: dataset_info(members)
        for label, members in class_divider(dataset).items()
    }

# Calculate probabilities of predicting each class for given row
def calc_class_probs(info, row):
    """Score a row against every class.

    Args:
        info: Dict mapping label -> list of ``(mean, std, count)`` tuples,
            one per feature (the structure built by ``class_info``).
        row: Sequence of feature values, indexed like the per-class stats.

    Returns:
        A dict mapping each label to its score: the class's share of all
        rows multiplied by ``pu.Gaussian(value, mean, std)`` for every
        feature. The scores are not normalised to sum to one.
    """
    # The count stored with the first feature doubles as the class size.
    total_rows = sum(stats[0][2] for stats in info.values())
    probs = {}
    for label, stats in info.items():
        score = stats[0][2] / float(total_rows)
        for idx, (mean, std, _) in enumerate(stats):
            # presumably the Gaussian density of the value -- see PriorUtils
            score *= pu.Gaussian(row[idx], mean, std)
        probs[label] = score
    return probs

# Predict class type for given row
def predict(info, row):
    """Return the label with the highest score for the given row.

    Args:
        info: Per-class feature statistics, as accepted by
            ``calc_class_probs``.
        row: Feature values of the sample to classify.

    Returns:
        The label whose score from ``calc_class_probs`` is largest; on ties
        the label seen first wins. ``None`` when there are no classes.
    """
    winner, top_score = None, -1
    for label, score in calc_class_probs(info, row).items():
        # Keep the current winner unless none is set yet or this is better.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

# Algo evaluation by cross validation split
def eval_algo(dataset, algo, n_folds, obs_label, *args):
    """Evaluate a classifier with n-fold cross validation.

    Args:
        dataset: Rows whose last element is the class label.
        algo: Callable ``algo(train_set, test_set, *args)`` returning one
            predicted label per test row.
        n_folds: Number of folds requested from ``crossval_split``.
        obs_label: Label forwarded to the precision, recall, F1 and
            specificity helpers of ``CorrectnessMetricUtils`` (presumably
            the class treated as "positive" -- confirm in that module).
        *args: Extra positional arguments forwarded to ``algo``.

    Returns:
        Five lists ``(accuracy, precision, recall, f1, specificity)``, each
        holding one score per fold in fold order.
    """
    folds = crossval_split(dataset, n_folds)
    Ascores, Pscores, Rscores, Fscores, Sscores = [], [], [], [], []
    for held_out, fold in enumerate(folds):
        # Pick the training folds by position rather than by value equality
        # (list.remove compares whole folds row by row), and flatten them in
        # one linear pass instead of the quadratic sum(list_of_lists, []).
        train_set = [
            row
            for idx, other in enumerate(folds)
            if idx != held_out
            for row in other
        ]
        # The algorithm gets copies of the test rows with the label blanked
        # out, so it cannot read the answer and the fold stays intact.
        test_set = []
        for row in fold:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)
        predicted = algo(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        Ascores.append(cmu.accuracy_calc(actual, predicted))
        Pscores.append(cmu.precision_calc(obs_label, actual, predicted))
        Rscores.append(cmu.recall_calc(obs_label, actual, predicted))
        Fscores.append(cmu.f1_calc(obs_label, actual, predicted))
        Sscores.append(cmu.specificity_calc(obs_label, actual, predicted))
    return Ascores, Pscores, Rscores, Fscores, Sscores

# Naive Bayes Algorithm
def naive_bayes(train, test):
    """Fit per-class statistics on ``train`` and classify every row of ``test``.

    Args:
        train: Labelled rows (label last) passed to ``class_info``.
        test: Rows to classify; each one is passed to ``predict``.

    Returns:
        A list with one predicted label per test row, in order.
    """
    model = class_info(train)
    return [predict(model, sample) for sample in test]

# Test Naive Bayes
seed(1)  # fixed seed so the random fold assignment is repeatable
filename = 'bc.csv'
dataset = get_csv(filename)
del dataset[0]  # drop the first row (presumably the CSV header)
n_columns = len(dataset[0])
for i in range(n_columns):
    str_to_float_col(dataset, i)

# Open question from the original author: should the class column hold ints
# or floats? The loop above converts every column, the last one included,
# to float, while the observed label passed below is the int 1.

# Evaluate the algorithm
n_folds = 5
Ascores, Pscores, Rscores, Fscores, Sscores = eval_algo(
    dataset, naive_bayes, n_folds, 1
)

# Per-fold scores first, then the mean of each metric over the folds.
per_fold_lines = [
    ('Ascores: %s', Ascores),
    ('Pscores: %s', Pscores),
    ('Rscores: %s', Rscores),
    ('F1scores: %s', Fscores),
    ('Sscores: %s', Sscores),
]
for template, scores in per_fold_lines:
    print(template % scores)

mean_lines = [
    ('Mean Accuracy: %s', Ascores),
    ('Mean Precision: %s', Pscores),
    ('Mean Recall: %s', Rscores),
    ('Mean F1 score: %s', Fscores),
    ('Mean specificity: %s', Sscores),
]
for template, scores in mean_lines:
    print(template % (sum(scores) / float(len(scores))))

Ascores: [0.8928571428571429, 0.8071428571428572, 0.8714285714285714, 0.8571428571428571, 0.85]
Pscores: [0.9285714285714286, 0.7547169811320755, 0.7719298245614035, 0.8253968253968254, 0.8448275862068966]
Rscores: [0.8666666666666667, 0.7407407407407407, 0.8979591836734694, 0.8524590163934426, 0.8032786885245902]
F1scores: [0.896551724137931, 0.7476635514018692, 0.8301886792452831, 0.8387096774193549, 0.823529411764706]
Sscores: [0.9230769230769231, 0.8488372093023255, 0.8571428571428571, 0.8607594936708861, 0.8860759493670886]
Mean Accuracy: 0.8557142857142856
Mean Precision: 0.8250885291737259
Mean Recall: 0.832220859199782
Mean F1 score: 0.8273286087938289
Mean specificity: 0.8751784865120161
