# In [1]:
import math
from collections import defaultdict

# ----------------------------
# Naive Bayes Classifier (for categorical data)
# ----------------------------

class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical features with Laplace smoothing.

    Training examples are dicts mapping feature name -> categorical value.
    The model stores raw counts; probabilities are derived on demand in log
    space so that products of many small probabilities do not underflow.

    Prediction never modifies the learned counts, and ties between classes
    are resolved in favour of the class that was seen first during training.
    """

    def __init__(self):
        # Count of each class label: used for P(class)
        self.class_counts = defaultdict(int)
        # Count of feature value given class: used for P(feature=value | class)
        # Structure: feature_counts[feature][value][class]
        self.feature_counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
        # All values observed for each feature (the "k" in Laplace smoothing)
        self.feature_values = defaultdict(set)
        # Set of all class labels
        self.classes = set()
        # List of feature names (taken from the first training example)
        self.features = []
        # Total number of training instances; defined here so the attribute
        # exists even before fit() has been called.
        self.total_samples = 0

    def fit(self, X, y):
        """
        Train the classifier.

        X: list of dicts (feature_name -> value); the keys of X[0] define the
           feature set, and every example must contain those keys.
        y: list of class labels, one per example in X.

        Counts are accumulated, so calling fit() again adds the new examples
        to the ones already seen.

        Raises ValueError if X is empty or X and y differ in length.
        """
        y = list(y)
        if len(X) == 0:
            raise ValueError("fit() requires at least one training example")
        if len(X) != len(y):
            # zip() would silently drop the surplus items otherwise.
            raise ValueError(
                f"X and y must have the same length (got {len(X)} and {len(y)})"
            )

        self.features = list(X[0].keys())

        # Count classes and feature values
        for features_dict, label in zip(X, y):
            self.class_counts[label] += 1
            self.classes.add(label)
            for f in self.features:
                value = features_dict[f]
                self.feature_counts[f][value][label] += 1
                self.feature_values[f].add(value)

        # Total number of training instances
        self.total_samples = sum(self.class_counts.values())

    def _log_p_class(self, c):
        """Return log P(class=c)."""
        # .get() instead of indexing: indexing a defaultdict would insert a
        # zero-count entry for a label that was never trained on.
        return math.log(self.class_counts.get(c, 0) / self.total_samples)

    def _log_p_feature_given_class(self, feature, value, c):
        """
        Return log P(feature=value | class=c) with Laplace smoothing.

        A value never seen with class c (or never seen at all) has count 0
        and therefore probability 1 / (count_c + k).
        """
        # Number of times feature=value occurs with this class. The chain of
        # .get() calls reads the nested defaultdicts without creating entries,
        # so predicting on unseen values leaves the trained counts untouched.
        by_value = self.feature_counts.get(feature, {})
        by_class = by_value.get(value, {})
        count_fvc = by_class.get(c, 0)
        # Total examples of this class
        count_c = self.class_counts.get(c, 0)
        # Number of distinct values of this feature seen in training
        k = len(self.feature_values.get(feature, ()))

        # Laplace smoothing:
        # (count_fvc + 1) / (count_c + k)
        prob = (count_fvc + 1) / (count_c + k)
        return math.log(prob)

    def predict_one(self, x):
        """
        Predict class label for a single example x (dict of feature_name -> value).

        Returns None if the classifier has not been trained. Raises KeyError
        if x lacks one of the training features.
        """
        best_class = None
        best_log_prob = -float("inf")

        # class_counts preserves insertion order, so iterating it (rather than
        # the unordered `classes` set) makes tie-breaking reproducible: with
        # the strict ">" below, the first-seen class wins a tie.
        ordered_classes = [c for c in self.class_counts if c in self.classes]

        for c in ordered_classes:
            # Start with log P(class=c)
            log_prob = self._log_p_class(c)

            # Add log P(feature=value | class=c) for each feature
            for f in self.features:
                log_prob += self._log_p_feature_given_class(f, x[f], c)

            # Keep the class with maximum posterior probability
            if log_prob > best_log_prob:
                best_log_prob = log_prob
                best_class = c

        return best_class

    def predict(self, X):
        """Predict class labels for a list of examples."""
        return [self.predict_one(x) for x in X]

    def accuracy(self, X, y_true):
        """
        Compute classification accuracy in percentage (0.0 - 100.0).

        Raises ValueError if there are no examples or if X and y_true differ
        in length.
        """
        y_true = list(y_true)
        y_pred = self.predict(X)
        if len(y_pred) != len(y_true):
            raise ValueError(
                "X and y_true must have the same length "
                f"(got {len(y_pred)} and {len(y_true)})"
            )
        if not y_true:
            raise ValueError("accuracy() requires at least one example")

        correct = sum(1 for yp, yt in zip(y_pred, y_true) if yp == yt)
        return (correct / len(y_true)) * 100.0


# -----------------------------------------
# Sample dataset: Play Tennis (categorical)
# -----------------------------------------

# Features: Outlook, Temperature, Humidity, Wind
# Class label: Play (Yes/No)

# Column order shared by every example row below.
_FEATURE_NAMES = ("Outlook", "Temperature", "Humidity", "Wind")

# Training data (10 samples), one tuple per example in _FEATURE_NAMES order.
train_X = [
    dict(zip(_FEATURE_NAMES, row))
    for row in (
        ("Sunny",    "Hot",  "High",   "Weak"),    # 1
        ("Sunny",    "Hot",  "High",   "Strong"),  # 2
        ("Overcast", "Hot",  "High",   "Weak"),    # 3
        ("Rain",     "Mild", "High",   "Weak"),    # 4
        ("Rain",     "Cool", "Normal", "Weak"),    # 5
        ("Rain",     "Cool", "Normal", "Strong"),  # 6
        ("Overcast", "Cool", "Normal", "Strong"),  # 7
        ("Sunny",    "Mild", "High",   "Weak"),    # 8
        ("Sunny",    "Cool", "Normal", "Weak"),    # 9
        ("Rain",     "Mild", "Normal", "Weak"),    # 10
    )
]

# Training labels, aligned with train_X (samples 1-5, then 6-10).
train_y = [
    "No", "No", "Yes", "Yes", "Yes",
    "No", "Yes", "No", "Yes", "Yes",
]

# Test data (4 samples), same column layout as the training rows.
test_X = [
    dict(zip(_FEATURE_NAMES, row))
    for row in (
        ("Sunny",    "Mild", "Normal", "Strong"),  # 11
        ("Overcast", "Mild", "High",   "Strong"),  # 12
        ("Overcast", "Hot",  "Normal", "Weak"),    # 13
        ("Rain",     "Mild", "High",   "Strong"),  # 14
    )
]

# Test labels, aligned with test_X (samples 11-14).
test_y = ["Yes", "Yes", "Yes", "No"]

# ----------------------------
# Train and evaluate
# ----------------------------

# Build the model and learn the counts from the training split.
nb = NaiveBayesClassifier()
nb.fit(train_X, train_y)

# Label every held-out example.
predictions = nb.predict(test_X)

# Report each test example next to its true and predicted label.
print("Test samples and predictions:")
report_rows = zip(test_X, test_y, predictions)
for i, (x, actual, pred) in enumerate(report_rows, start=1):
    line = f"Test {i}: {x}, Actual = {actual}, Predicted = {pred}"
    print(line)

# Overall percentage of test examples classified correctly.
acc = nb.accuracy(test_X, test_y)
print(f"\nAccuracy of Naive Bayes classifier on test data = {acc:.2f}%")


# Output:
# Test samples and predictions:
# Test 1: {'Outlook': 'Sunny', 'Temperature': 'Mild', 'Humidity': 'Normal', 'Wind': 'Strong'}, Actual = Yes, Predicted = No
# Test 2: {'Outlook': 'Overcast', 'Temperature': 'Mild', 'Humidity': 'High', 'Wind': 'Strong'}, Actual = Yes, Predicted = Yes
# Test 3: {'Outlook': 'Overcast', 'Temperature': 'Hot', 'Humidity': 'Normal', 'Wind': 'Weak'}, Actual = Yes, Predicted = Yes
# Test 4: {'Outlook': 'Rain', 'Temperature': 'Mild', 'Humidity': 'High', 'Wind': 'Strong'}, Actual = No, Predicted = No
#
# Accuracy of Naive Bayes classifier on test data = 75.00%
