In [None]:
# Breast Cancer classification using Naive Bayes

In [None]:
from csv import reader


# Load a CSV file of numeric values
def load_csv(filename):
  """Read a CSV file and return its rows as lists of floats.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A list with one list of floats per non-blank row of the file.

  Raises:
    ValueError: If a field cannot be converted to float.
  """
  dataset = []
  # newline="" lets the csv module handle line endings itself, as its
  # documentation recommends for file objects passed to reader().
  with open(filename, "r", newline="") as file:
    for row in reader(file):
      # csv.reader yields a blank line as an empty list; appending it would
      # put a row with no values at all into the dataset.
      if not row:
        continue
      dataset.append([float(value) for value in row])
  return dataset


In [None]:
from math import sqrt, exp, pi


# Arithmetic mean of a sequence of numbers
def mean(numbers):
  """Return the arithmetic mean of `numbers` as a float.

  `numbers` must support both iteration and len(). An empty sequence
  raises ZeroDivisionError.
  """
  total = sum(numbers)
  count = float(len(numbers))
  return total / count


# Sample standard deviation of a sequence of numbers
def stdev(numbers):
  """Return the sample standard deviation of `numbers`.

  The summed squared deviations from the mean are divided by
  len(numbers) - 1, so a single-element input raises ZeroDivisionError.
  """
  center = mean(numbers)
  squared_deviations = [(value - center) ** 2 for value in numbers]
  degrees_of_freedom = float(len(numbers) - 1)
  return sqrt(sum(squared_deviations) / degrees_of_freedom)


# Gaussian probability density of x for a given mean and stdev
def calculate_probability(x, mean, stdev):
  """Evaluate the normal density with the given parameters at `x`.

  Note that inside this function the parameters `mean` and `stdev` are
  plain numbers; they shadow the module-level functions of the same name.
  A `stdev` of zero raises ZeroDivisionError.
  """
  squared_distance = (x - mean) ** 2
  spread = 2 * stdev**2
  normalizer = 1 / (sqrt(2 * pi) * stdev)
  return normalizer * exp(-(squared_distance / spread))


In [None]:
from collections import defaultdict


# Group the rows of a dataset by their class label (the last column).
def separate_by_class(dataset):
  """Return a defaultdict mapping class label -> rows of that class.

  The label is the last element of each row and is stripped from the
  stored rows, so every stored row holds the feature values only.
  """
  rows_by_label = defaultdict(list)
  for record in dataset:
    features, label = record[:-1], record[-1]
    rows_by_label[label].append(features)
  return rows_by_label


In [None]:
# Per-column mean, stdev and row count for a list of rows.
def summarize_dataset(dataset):
  """Summarize every column of `dataset`.

  The rows are transposed with zip(*dataset), so each summary describes
  one column. Returns a list of dicts with the keys "mean", "stdev" and
  "len", in column order.
  """
  summaries = []
  for column in zip(*dataset):
    summaries.append({
        "mean": mean(column),
        "stdev": stdev(column),
        "len": len(column),
    })
  return summaries


# Split dataset by class then calculate statistics for each column.
def summarize_by_class(dataset):
  """Build per-class column statistics for a labelled dataset.

  Args:
    dataset: Rows whose last element is the class label (see
      separate_by_class).

  Returns:
    A plain dict mapping every class label found in `dataset` to the list
    of column summaries that summarize_dataset produces for the rows of
    that class.
  """
  separated = separate_by_class(dataset)
  # Iterate over the labels that are actually present rather than assuming
  # they are 0..n-1: indexing the defaultdict with a label that does not
  # exist would silently create an empty class and leave the real classes
  # unsummarized.
  return {
      class_value: summarize_dataset(rows)
      for class_value, rows in separated.items()
  }


In [None]:
# Data modified from
# https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
dataset = load_csv("data.csv")

# The first 400 rows are used for fitting; every remaining row is held
# out for evaluation. The split is positional, with no shuffling.
train_data, test_data = dataset[:400], dataset[400:]


In [None]:
# Fit the model: compute the per-class column summaries of the training
# rows. This is the object later passed to predict() as `summaries`.
model = summarize_by_class(train_data)


In [None]:
# Score every class for one row of feature values.
def calculate_class_probabilities(summaries, row):
  """Return a dict mapping each class to its score for `row`.

  A class score starts at the class prior (rows of that class divided by
  all rows) and is multiplied by the Gaussian density of every attribute
  of `row`. The scores are not normalized to sum to one.
  """
  # The class size is read from the first column summary of each class.
  class_sizes = {
      label: columns[0]["len"] for label, columns in summaries.items()
  }
  total_rows = sum(class_sizes.values())

  probabilities = {}
  for class_value, class_summaries in summaries.items():
    score = class_sizes[class_value] / total_rows
    for index in range(len(class_summaries)):
      stats = class_summaries[index]
      score *= calculate_probability(
          row[index], stats["mean"], stats["stdev"])
    probabilities[class_value] = score
  return probabilities


# Pick the most likely class for one row of feature values.
def predict(summaries, row):
  """Return the class label with the highest score for `row`.

  Scores come from calculate_class_probabilities. When several classes
  tie, the one that appears first in `summaries` is returned.
  """
  scores = calculate_class_probabilities(summaries, row)
  best_label, _ = max(scores.items(), key=lambda item: item[1])
  return best_label


In [None]:
# Evaluate on the held-out rows: the last element of each row is the true
# label and everything before it is the feature vector given to predict().
correct = 0

for data in test_data:
  if predict(model, data[:-1]) == data[-1]:
    correct += 1

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = correct / len(test_data)

print(f"Accuracy = {accuracy}")
