In [1]:
from csv import reader
from math import sqrt, exp, pi
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Load a CSV file
def load_csv(filename):
    """Read a CSV file into a list of rows, dropping the header and first column.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows. Each row is a list of strings containing every field
        of the corresponding CSV record except the first one. Records that
        parse to an empty list (blank lines) are skipped. A file with no
        content, or with only a header, yields an empty list.
    """
    dataset = list()
    # newline='' lets the csv module do its own line-ending handling, which is
    # what its documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as file:
        csv_reader = reader(file)
        # Skip the header row; the default stops an empty file from raising
        # StopIteration here.
        next(csv_reader, None)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row[1:])  # Exclude the first column (Id)
    return dataset


In [3]:
def str_column_to_float(dataset, column):
    """Convert one column of string values to floats, modifying the rows in place.

    Args:
        dataset: Iterable of mutable rows (lists).
        column: Index of the column to convert in every row.

    Each value is stripped of surrounding whitespace and passed to ``float``.
    The values must be strings: ``.strip()`` is called on them, so running
    this twice on the same column fails on the second pass.
    """
    for record in dataset:
        text = record[column]
        record[column] = float(text.strip())


In [4]:
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    The distinct values are sorted before codes are assigned, so the same
    data always yields the same mapping. Enumerating a bare ``set`` (as this
    function used to) gives an order that can change between interpreter
    runs for string values, which made the class codes irreproducible.

    Args:
        dataset: Iterable of mutable rows (lists); iterated more than once.
        column: Index of the column to encode in every row.

    Returns:
        A dict mapping each original value to the integer code (0, 1, 2, ...)
        that replaced it. The values must be mutually orderable and hashable.
    """
    unique = sorted(set(row[column] for row in dataset))
    lookup = {value: code for code, value in enumerate(unique)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

In [5]:
# Split the dataset by class values, returns a dictionary
def separate_by_class(dataset):
    """Group rows by the value stored in their last position.

    Args:
        dataset: Sequence of rows; the last element of each row is treated
            as its class value.

    Returns:
        A dict mapping each class value to the list of rows carrying it.
        The rows are the original objects (not copies), kept in dataset
        order; keys appear in order of first occurrence.
    """
    groups = {}
    for record in dataset:
        groups.setdefault(record[-1], []).append(record)
    return groups

# Calculate the mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float.

    ``numbers`` must support both ``sum`` and ``len``. An empty sequence
    raises ZeroDivisionError.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count


In [6]:
# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The sum of squared deviations from the mean is divided by ``n - 1``, so
    at least two values are required; a single value raises
    ZeroDivisionError.
    """
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

# Calculate the mean and stdev for each feature column in a dataset
def summarize_dataset(dataset):
    """Return a (mean, stdev) pair for every column except the last.

    The last column holds the class value and is dropped *before* any
    statistics are computed, so no work is spent on it and it does not need
    to be numeric.

    Args:
        dataset: Iterable of equally long rows whose last element is the
            class value.

    Returns:
        A list with one ``(mean, stdev)`` tuple per feature column, in
        column order. An empty dataset, or rows that contain only the class
        value, give an empty list.
    """
    # zip(*dataset) transposes rows into columns; slicing off the final
    # column removes the class values.
    feature_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column)) for column in feature_columns]

In [7]:
def summarize_by_class(dataset):
    """Compute per-column summaries separately for each class.

    The rows are grouped with ``separate_by_class`` and every group is
    passed to ``summarize_dataset``.

    Returns:
        A dict mapping each class value to the result of
        ``summarize_dataset`` for the rows of that class.
    """
    return {
        label: summarize_dataset(members)
        for label, members in separate_by_class(dataset).items()
    }

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; must be non-zero,
            otherwise ZeroDivisionError is raised.

    Returns:
        ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Note: inside this function the parameter names ``mean`` and ``stdev``
    refer to the numeric arguments, not to the module-level helpers of the
    same name.
    """
    squared_distance = (x - mean) ** 2
    twice_variance = 2 * stdev ** 2
    exponent = exp(-(squared_distance / twice_variance))
    normaliser = 1 / (sqrt(2 * pi) * stdev)
    return normaliser * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score ``row`` against every class.

    For each class the score starts at 1 and is multiplied by the Gaussian
    density of each feature value under that class's (mean, stdev) for the
    same column. No class prior is applied, so the scores are products of
    densities only.

    Args:
        summaries: Dict mapping class value -> list of (mean, stdev) pairs,
            one per feature column.
        row: Sequence of feature values, indexed in the same order as the
            summaries; it must be at least as long as each summary list.

    Returns:
        A dict mapping each class value to its score.
    """
    scores = dict()
    for label, column_stats in summaries.items():
        likelihood = 1
        for index, (mu, sigma) in enumerate(column_stats):
            likelihood *= calculate_probability(row[index], mu, sigma)
        scores[label] = likelihood
    return scores

In [8]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def predict(summaries, row):
    """Return the class with the highest score for ``row``.

    Scores come from ``calculate_class_probabilities``. When several classes
    tie, the one encountered first wins. If there are no classes at all the
    result is None.
    """
    scores = calculate_class_probabilities(summaries, row)
    winner = None
    top_score = -1
    for candidate, score in scores.items():
        # Keep the current winner unless none is set yet or this score
        # strictly beats it.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = candidate, score
    return winner

# Evaluate the model
def evaluate_model(actual, predicted):
    """Print classification metrics comparing ``predicted`` with ``actual``.

    Accuracy, weighted recall, weighted precision and weighted F1 are all
    computed first and then printed, followed by the confusion matrix and
    the classification report. Nothing is returned.

    Args:
        actual: True class labels.
        predicted: Predicted class labels, aligned with ``actual``.
    """
    # All four scores are computed before anything is printed.
    headline_scores = (
        ("Accuracy:", accuracy_score(actual, predicted)),
        ("Recall:", recall_score(actual, predicted, average='weighted')),
        ("Precision:", precision_score(actual, predicted, average='weighted')),
        ("F1-score:", f1_score(actual, predicted, average='weighted')),
    )
    for caption, value in headline_scores:
        print(caption, value)

    print("\nConfusion Matrix:")
    print(confusion_matrix(actual, predicted))
    print("\nClassification Report:")
    print(classification_report(actual, predicted))



In [9]:
# Make a prediction with Naive Bayes on Iris Dataset
filename = 'iris.csv'
dataset = load_csv(filename)
# Every column except the last is a feature; the last one is the class label.
n_features = len(dataset[0]) - 1
for i in range(n_features):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, n_features)
# fit model
model = summarize_by_class(dataset)
# define a new record
row = [5.7, 2.9, 4.2, 1.3]

# Check if the length of the provided row matches the number of features in the dataset
if len(row) != n_features:
    print("Error: The length of the provided row does not match the number of features in the dataset.")
    print("Length of row:", len(row))
    print("Number of features in the dataset:", n_features)
else:
    # predict the label
    label = predict(model, row)
    print('Data=%s, Predicted: %s' % (row, label))

# The evaluation below does not use the sample record, so it runs whether or
# not the length check above passed.
# NOTE: the model is scored on the same rows it was fitted on, so these are
# training-set metrics rather than an estimate on held-out data.
actual = [record[-1] for record in dataset]
predicted = [predict(model, record[:-1]) for record in dataset]
# Evaluate the model
evaluate_model(actual, predicted)


Data=[5.7, 2.9, 4.2, 1.3], Predicted: 1
Accuracy: 0.96
Recall: 0.96
Precision: 0.96
F1-score: 0.96

Confusion Matrix:
[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.94      0.94      0.94        50
           2       0.94      0.94      0.94        50

    accuracy                           0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150

