# Naive Bayes Classifier

In [210]:
from math import log, sqrt, pi
import numpy as np
import pandas as pd

class NaiveBayesClassifier:
    def __init__(self, alpha=1e-6):
        self.alpha = alpha
        self.label_probs = {}
        self.cond_probs = {}
        self.mean_store = {}
        self.std_store = {}
        self.is_continuous_map = {}
        self.labels = []
        self.attributes = []

    def _is_continuous(self, X: pd.Series) -> bool:
        return np.issubdtype(X.dtype, np.number) and (len(X.unique()) / len(X) > 0.01)

    def _log_gaussian(self, x, mean, std):
        if std < 1e-9:
            return 0.0 if abs(x - mean) < 1e-9 else -np.inf
        coeff = -0.5 * log(2 * pi) - log(std)
        exponent = -((x - mean) ** 2) / (2 * std ** 2)
        return coeff + exponent

    def fit(self, X_train: pd.DataFrame, y_train: pd.DataFrame):
        self.attributes = list(X_train.columns)
        self.labels = y_train.iloc[:, 0].unique()
        total_rows = len(y_train)

        # Identify continuous/discrete attributes
        for attr in self.attributes:
            self.is_continuous_map[attr] = self._is_continuous(X_train[attr])

        # Compute priors and conditional distributions
        for label in self.labels:
            label_mask = y_train.iloc[:, 0] == label
            label_X = X_train[label_mask]
            count = label_mask.sum()
            self.label_probs[label] = log((count + self.alpha) / (total_rows + self.alpha * len(self.labels)))

            for attr in self.attributes:
                if self.is_continuous_map[attr]:
                    mean = label_X[attr].mean()
                    std = max(label_X[attr].std(ddof=0), 1e-3)
                    self.mean_store[(attr, label)] = mean
                    self.std_store[(attr, label)] = std
                else:
                    full_domain = X_train[attr].unique()
                    value_counts = label_X[attr].value_counts()
                    total = len(label_X)
                    for val in full_domain:
                        count = value_counts.get(val, 0)
                        prob = (count + self.alpha) / (total + self.alpha * len(full_domain))
                        self.cond_probs[(attr, val, label)] = log(prob)

    def predict_single(self, row: pd.Series):
        class_log_probs = {}
        for label in self.labels:
            log_prob = self.label_probs[label]
            for attr in self.attributes:
                val = row[attr]
                if self.is_continuous_map[attr]:
                    mean = self.mean_store.get((attr, label), 0.0)
                    std = self.std_store.get((attr, label), 1e-3)
                    log_prob += self._log_gaussian(val, mean, std)
                else:
                    log_prob += self.cond_probs.get((attr, val, label), log(self.alpha))
            class_log_probs[label] = log_prob
        return str(max(class_log_probs, key=class_log_probs.get))

    def predict(self, X_test: pd.DataFrame):
        return [self.predict_single(row) for _, row in X_test.iterrows()]


# Decision Tree

In [None]:

def _is_continous(X: pd.Series) -> bool:
    return np.issubdtype(X.dtype, np.number) and (len(X.unique()) / len(X) > 0.01)

        

class DiscreteAttributeSelectionCriteria:
    
    def __init__(self, value):
        self.value = value 
        
    def condition_satisfied(self, val):
        if self.value == None:
            raise ValueError(f"value not set")
        
        if self.value == val:
            return True
        else:
            return False
        
        
        
class ContinuousAtrributeSelectionCriteria:
    
    def __init__(self, start_point, end_point):
        self.start_point = start_point
        self.end_point = end_point
    
    def condition_satisfied(self, val):
        if self.start_point == None or self.end_point==None:
            raise ValueError(f"value not set")
        
        if self.start_point <= val <= self.end_point:
            return True
        else:
            return False



In [None]:
from math import log2


def info(D):
    label_counts = D.iloc[:, -1].value_counts().to_dict()
    info_val = 0
    for label in label_counts:
        pi = label_counts[label] / len(D)
        info_val += - pi * log2(pi)
    return info_val

def info_A(D, attr):
    info_A = 0
    attr_values = D[attr].unique()
    for attr_val in attr_values:
        Dj = D[D[attr] == attr_val]
        info_A += (len(Dj) / len(D)) * info(Dj)
    return info_A

def Gain(D, A):
    return info(D) - info_A(D, A)


In [None]:

from platform import node


class Node:
    def __init__(self):
        self.children = {}
        self.isLeaf = False
        self.split_attribute = None  
        self.returning_class = None
        self.attribute_selection_criteria = None


class DecisionTreeClassifier:
    def __init__(self, multiple_splits_allowed=True):
        self.root = None
        self.multiple_splits_allowed = multiple_splits_allowed

    def attribute_selection(self, D, attribute_list):
        best_gain = -1
        best_attr = None
        best_criterion = None

        for attr in attribute_list:
            
            if len(D[attr].unique()) <= 1:
                continue

            if _is_continous(D[attr]):
                sorted_vals = np.sort(D[attr].dropna().unique())
                split_points = [(sorted_vals[i] + sorted_vals[i+1]) / 2 for i in range(len(sorted_vals)-1)]

                for split in split_points:
                    D_left = D[D[attr] <= split]
                    D_right = D[D[attr] > split]
                    if len(D_left) == 0 or len(D_right) == 0:
                        continue
                    weighted_info = (len(D_left)/len(D)) * info(D_left) + (len(D_right)/len(D)) * info(D_right)
                    gain = info(D) - weighted_info
                    if gain > best_gain:
                        best_gain = gain
                        best_attr = attr
                        best_criterion = [split]  # Store best split point
            else:
                gain = Gain(D, attr)
                if gain > best_gain:
                    best_gain = gain
                    best_attr = attr
                    best_criterion = D[attr].unique()

        return best_criterion, best_attr

    def build_tree(self, X_train, y_train):
        # Combine features and labels into one DataFrame
        D = pd.concat([X_train, y_train], axis=1)
        attribute_list = set(X_train.columns)

        def generate_decision_tree(D, attribute_list):
            node = Node()

            # Stopping condition 1: All samples have the same label
            if len(D.iloc[:, -1].unique()) == 1:
                node.isLeaf = True
                node.returning_class = D.iloc[:, -1].iloc[0]
                return node

            # Stopping condition 2: No attributes left to split
            if len(attribute_list) == 0:
                node.isLeaf = True
                majority_class = D.iloc[:, -1].value_counts().idxmax()
                node.returning_class = majority_class
                return node

            best_criterion, best_attr = self.attribute_selection(D, attribute_list)
            
            node.split_attribute = best_attr  # Store the attribute name
            
            if not self.multiple_splits_allowed:
                attribute_list = attribute_list - {best_attr}

            # If no attribute gives positive gain, make leaf node with majority class
            if best_attr is None or len(best_criterion) == 0:
                node.isLeaf = True
                node.returning_class = D.iloc[:, -1].value_counts().idxmax()
                return node

            
            if _is_continous(D[best_attr]):
                split = best_criterion[0]
                
                node.attribute_selection_criteria = ContinuousAtrributeSelectionCriteria(start_point=-float('inf'), end_point=split)
                
                D_left = D[D[best_attr] <= split]
                D_right = D[D[best_attr] > split]
                
                if len(D_left) == 0 or len(D_right) == 0:
                    node.isLeaf = True
                    node.returning_class = D.iloc[:, -1].value_counts().idxmax()
                    return node


                node.children['left'] = generate_decision_tree(D_left, attribute_list.copy())
                node.children['right'] = generate_decision_tree(D_right, attribute_list.copy())
            else:
                node.attribute_selection_criteria = DiscreteAttributeSelectionCriteria(value=best_criterion[0])
            # If multiple splits are NOT allowed, remove the chosen attribute
                

                # Split dataset by each attribute value and recurse
                for attr_val in best_criterion:
                    D_j = D[D[best_attr] == attr_val]

                    # If no samples in this subset, create leaf with majority class of parent
                    if len(D_j) == 0:
                        leaf_node = Node()
                        leaf_node.isLeaf = True
                        leaf_node.returning_class = D.iloc[:, -1].value_counts().idxmax()
                        node.children[attr_val] = leaf_node
                    else:
                        node.children[attr_val] = generate_decision_tree(D_j, attribute_list.copy())

            return node

        self.root = generate_decision_tree(D, attribute_list)

    def _majority_class(self, node):
        from collections import Counter

        def collect_leaf_classes(n):
            if n.isLeaf:
                return [n.returning_class]
            labels = []
            for child in n.children.values():
                labels.extend(collect_leaf_classes(child))
            return labels

        leaf_classes = collect_leaf_classes(node)
        if not leaf_classes:
            return None
        return Counter(leaf_classes).most_common(1)[0][0]

    def predict_single(self, x):
        node = self.root
        while not node.isLeaf:
            attr = node.split_attribute  
            val = x[attr]                

            if isinstance(node.attribute_selection_criteria, ContinuousAtrributeSelectionCriteria):
                if node.attribute_selection_criteria.condition_satisfied(val):
                    node = node.children['left']
                else:
                    node = node.children['right']
            elif isinstance(node.attribute_selection_criteria, DiscreteAttributeSelectionCriteria):
                if node.attribute_selection_criteria.condition_satisfied(val):
                    node = node.children[val]
                else:
                    return self._majority_class(node)

        return node.returning_class


    def predict(self, X_test):
        predictions = [self.predict_single(row) for _, row in X_test.iterrows()]
        return pd.Series(predictions, index=X_test.index)

# Comparisons

In [None]:
from ucimlrepo import fetch_ucirepo 


def __train_test_split(X, y, test_size = 0.2, shuffle_and_stratify = True):
    
    if test_size < 0 or test_size > 1:
        raise ValueError("test_size must be between 0 and 1")
   
    if len(X) != len(y):
        raise ValueError("Features and targets must have the same length.")

    
    if shuffle_and_stratify == False:
    
        train_size = 1 - test_size
        train_index = int(len(X) * train_size)
        
        X_train = X[0: train_index]
        X_test = X[train_index:]
        
        y_train = y[0: train_index]
        y_test = y[train_index:]
        
        return X_train, X_test, y_train, y_test
    else:
        labels = y.iloc[:,0].unique()
        X_train = pd.DataFrame(columns=X.columns)
        y_train = pd.DataFrame(columns=y.columns)
        X_test = pd.DataFrame(columns=X.columns)
        y_test = pd.DataFrame(columns=y.columns)
        
        train_size = 1 - test_size
        

        for label in labels :
            y_rows = y[y.iloc[:,0] == label]            
            X_rows = X.loc[y_rows.index]
            
            train_index = int(len(X_rows) * train_size)
            
            X_train = pd.concat([X_train, X_rows.iloc[:train_index]], ignore_index=False)
            y_train = pd.concat([y_train, y_rows.iloc[:train_index]] , ignore_index=False)
            
            X_test = pd.concat([X_test, X_rows[train_index:]], ignore_index=False)
            y_test = pd.concat([y_test, y_rows[train_index:]], ignore_index=False)

        return X_train, X_test, y_train, y_test
    

In [215]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np

from types import SimpleNamespace

ucirepo_ids = {
    "iris": 53,
    "heart_disease": 45,
    "molecular_biology": 69,
    "breast_cancer": 17,
    "adult": 2,
    "bank_marketing": 222,
    "wine": 109,
    "mushroom": 73,
    "solar_flare": 89,
    "tic-tac-toe":101
}


DATASET_NAME = "tic-tac-toe"  # Example dataset name
TEST_SIZE = 0.2


def fetch_dataframe(dataframe_name):
    
    if dataframe_name in ucirepo_ids:
        # fetch dataset 
        df = fetch_ucirepo(id=ucirepo_ids[dataframe_name],) 

        # # data (as pandas dataframes) 
        X = df.data.features 
        y = df.data.targets 
        
        # metadata 
        print(df.metadata) 
        
        # variable information 
        print(df.variables) 
        
        return df
    else:
        raise ValueError(f"Dataset '{dataframe_name}' not found in UCI repository.")




df = fetch_dataframe(DATASET_NAME)

X = df.data.features
y = df.data.targets

X_train, X_test, y_train, y_test = __train_test_split(X, y , test_size=TEST_SIZE, shuffle_and_stratify=True)

naive_bayes_classifier = NaiveBayesClassifier(alpha=1)
decision_tree_classifier = DecisionTreeClassifier(multiple_splits_allowed=True)

naive_bayes_classifier.fit(X_train, y_train)
decision_tree_classifier.build_tree(X_train, y_train)

y_pred_nb = naive_bayes_classifier.predict(X_test)
y_pred_dt = decision_tree_classifier.predict(X_test)



{'uci_id': 101, 'name': 'Tic-Tac-Toe Endgame', 'repository_url': 'https://archive.ics.uci.edu/dataset/101/tic+tac+toe+endgame', 'data_url': 'https://archive.ics.uci.edu/static/public/101/data.csv', 'abstract': 'Binary classification task on possible configurations of tic-tac-toe game', 'area': 'Games', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 958, 'num_features': 9, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1991, 'last_updated': 'Mon Aug 19 1991', 'dataset_doi': '10.24432/C5688J', 'creators': ['David Aha'], 'intro_paper': None, 'additional_info': {'summary': 'This database encodes the complete set of possible board configurations at the end of tic-tac-toe games, where "x" is assumed to have played first.  The target concept is "win for x" (i.e., true when "x" has one of 8 possible ways to create a "three

In [None]:
from prettytable import PrettyTable
import pandas as pd
from sklearn.metrics import accuracy_score

def manual_metrics(y_true, y_pred, label):
    P = N = TP = FP = TN = FN = 0

    for i in range(len(y_pred)):
        true_label = y_true.iloc[i] if hasattr(y_true, 'iloc') else y_true[i]
        pred_label = y_pred.iloc[i] if hasattr(y_pred, 'iloc') else y_pred[i]

        if true_label == label:
            P += 1
        else:
            N += 1

        if true_label == label and pred_label == label:
            TP += 1
        elif true_label == label and pred_label != label:
            FN += 1
        elif true_label != label and pred_label != label:
            TN += 1
        else:
            FP += 1
            


    accuracy = (TP + TN) / (P + N) if (P + N) > 0 else 0.0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall    = TP / P if P > 0 else 0.0
    specificity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
    f1_score  = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    auc       = (recall + specificity) / 2
    support   = P

    return accuracy, precision, recall, f1_score, specificity, auc, support



def compare_classifiers_manual(y_true, y_pred_nb, y_pred_dt, dataset_name="dataset"):
    if isinstance(y_true, pd.DataFrame):
        y_true = y_true.iloc[:, 0]

    if isinstance(y_pred_nb, list):
        y_pred_nb = pd.Series(y_pred_nb)
    if isinstance(y_pred_dt, list):
        y_pred_dt = pd.Series(y_pred_dt)

    labels = sorted(pd.Series(y_true).unique())

    table = PrettyTable()
    table.field_names = [
        "Label",
        "NB Acc", "NB Prec", "NB Rec", "NB F1", "NB Spec", "NB AUC",
        "DT Acc", "DT Prec", "DT Rec", "DT F1", "DT Spec", "DT AUC",
        "Support"
    ]

    for label in labels:
        nb_acc, nb_prec, nb_rec, nb_f1, nb_spec, nb_auc, support = manual_metrics(y_true, y_pred_nb, label)
        dt_acc, dt_prec, dt_rec, dt_f1, dt_spec, dt_auc, _       = manual_metrics(y_true, y_pred_dt, label)
        
        
        table.add_row([
            label,
            nb_acc, nb_prec, nb_rec, nb_f1, nb_spec, nb_auc,
            dt_acc, dt_prec, dt_rec, dt_f1, dt_spec, dt_auc,
            support
        ])


    # Add overall accuracy
    nb_overall_acc = accuracy_score(y_true, y_pred_nb)
    dt_overall_acc = accuracy_score(y_true, y_pred_dt)
    table.add_row(["-" * 6] * len(table.field_names))  # separator row
    table.add_row([
        "OVERALL",
        round(nb_overall_acc, 3), "", "", "", "", "",
        round(dt_overall_acc, 3), "", "", "", "", "",
        len(y_true)
    ])

    print(table)

    # Save to .txt file
    save_path = f"{dataset_name}_comparison_report.txt"
    with open(save_path, "w") as f:
        f.write(str(table))
    print(f"✅ Comparison report saved to {save_path}")

    return table


report = compare_classifiers_manual(y_test, y_pred_nb, y_pred_dt, dataset_name=DATASET_NAME)
# print(report)

print(f"Naive Bayes Accuracy: {accuracy_score(y_test, y_pred_nb):.3f}")
print(f"Decision Tree Accuracy: {accuracy_score(y_test, y_pred_dt):.3f}")

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+---------------------+---------------------+--------------------+---------+
|  Label   |       NB Acc       |      NB Prec       |       NB Rec       |       NB F1        |      NB Spec       |       NB AUC       |       DT Acc       |       DT Prec       |       DT Rec       |        DT F1        |       DT Spec       |       DT AUC       | Support |
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+--------------------+---------------------+---------------------+--------------------+---------+
| negative | 0.7046632124352331 | 0.5892857142857143 | 0.4925373134328358 | 0.5365853658536586 | 0.8174603174603174 | 0.6549988154465767 | 0.7196152333399539 | 0.3509