In [7]:
from math import sqrt, pi, exp, floor
from statistics import mean, stdev
from files import readFile
import random

def separate_by_class(dataset):
	"""Group the rows of *dataset* by their last element.

	Returns a dict mapping each distinct last-element value (the class
	label) to the list of rows carrying it, in their original order.
	The rows themselves are not copied.
	"""
	groups = {}
	for row in dataset:
		label = row[-1]
		if label not in groups:
			groups[label] = []
		groups[label].append(row)
	return groups

def summarize_dataset(dataset):
	"""Compute per-column statistics for every column except the last.

	Returns a list with one ``(mean, stdev, count)`` tuple per column,
	in column order. The last column is skipped (it is the value that
	``separate_by_class`` groups on). ``stdev`` is the sample standard
	deviation from the ``statistics`` module, so any error it raises
	propagates to the caller.
	"""
	# Transpose rows into columns, then drop the trailing column.
	columns = list(zip(*dataset))
	feature_columns = columns[:-1]

	stats = []
	for values in feature_columns:
		stats.append((mean(values), stdev(values), len(values)))
	return stats

def summarize_by_class(dataset):
	"""Build per-class column statistics for *dataset*.

	The rows are first grouped with ``separate_by_class``; each group is
	then passed to ``summarize_dataset``. Returns a dict mapping each
	class label to that group's list of column summaries.
	"""
	per_class = {}
	for label, rows in separate_by_class(dataset).items():
		per_class[label] = summarize_dataset(rows)
	return per_class

def calculate_probability(x, mean, stdev):
	"""Evaluate the Gaussian probability density at *x*.

	*mean* and *stdev* are the distribution's mean and standard
	deviation. Inside this function they shadow the ``statistics``
	helpers of the same name. A *stdev* of zero raises
	``ZeroDivisionError``.
	"""
	deviation = x - mean
	spread = 2 * stdev**2
	exponent = exp(-(deviation**2 / spread))
	normaliser = sqrt(2 * pi) * stdev
	return (1 / normaliser) * exponent
 
def calculate_class_probabilities(summaries, test_row):
	"""Score *test_row* against every class in *summaries*.

	*summaries* maps a class label to a list of per-column entries whose
	items 0, 1 and 2 are read as mean, standard deviation and row count.
	For each class the score starts at the class's share of all rows
	(taken from the count stored with its first column) and is then
	multiplied by the Gaussian density of each corresponding value of
	*test_row*. The scores are not normalised to sum to one.

	Returns a dict mapping each class label to its score.
	"""
	# Every column of a class stores the same row count, so the first
	# column's count stands in for the class size.
	total_rows = sum([column_stats[0][2] for column_stats in summaries.values()])

	probabilities = {}
	for label, column_stats in summaries.items():
		score = column_stats[0][2] / total_rows
		for index, stats in enumerate(column_stats):
			score *= calculate_probability(test_row[index], stats[0], stats[1])
		probabilities[label] = score
	return probabilities

# Load the data set. NOTE(review): presumably readFile returns a mutable list
# of numeric rows with the class label in the last position -- confirm against
# the `files` module.
dataset = readFile("Breast_cancer_data.csv")

# Repeated hold-out evaluation: five rounds, each reshuffling the data in
# place, training on the first 80% of rows and testing on the remainder.
errors = []
for i in range(5):
	random.shuffle(dataset)
	x = floor(len(dataset) * 0.8)
	train = dataset[:x]
	test = dataset[x:]
	# The per-class statistics depend only on the training split, so they
	# are computed once per round rather than once per test row.
	summaries = summarize_by_class(train)
	error = 0
	for t in test:
		probabilities = calculate_class_probabilities(summaries, t)
		print("Probabilities:", probabilities)
		# Predict the class with the highest score.
		result = max(probabilities, key=probabilities.get)
		print("Result:", result)
		if result != t[-1]:
			error += 1
	# Fraction of test rows misclassified in this round.
	errors.append(error / len(test))
print("Errors average:", mean(errors))

0.1087719298245614
