In [6]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Load the Iris dataset bundled with scikit-learn; X holds `iris.data` and
# y holds `iris.target`, both consumed by the cross-validation code below.
iris = load_iris()
X, y = iris.data, iris.target



## Bayes' theorem: P(C | x) = P(x | C) * P(C) / P(x)


class GaussianNaiveBayes:
    """Gaussian naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class that maximises
    ``log P(C) + sum_j log P(x_j | C)``.

    Attributes set by ``fit``:
        classes:    sorted unique labels seen in ``y``.
        n_classes:  number of distinct labels.
        n_features: number of columns in ``X``.
        priors:     ``(n_classes,)`` class frequencies, P(C).
        means:      ``(n_classes, n_features)`` per-class feature means.
        vars:       ``(n_classes, n_features)`` per-class feature variances.
    """

    # Variance floor: keeps the density finite when a feature is constant
    # within a class (variance exactly 0).
    _EPS = 1e-10

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.n_classes = len(self.classes)
        self.n_features = X.shape[1]

        self.priors = np.zeros(self.n_classes)
        self.means = np.zeros((self.n_classes, self.n_features))
        self.vars = np.zeros((self.n_classes, self.n_features))

        for i, c in enumerate(self.classes):
            in_class = y == c
            X_c = X[in_class]
            # Prior P(C) is the fraction of training samples with label c.
            self.priors[i] = np.mean(in_class)
            self.means[i, :] = X_c.mean(axis=0)
            self.vars[i, :] = X_c.var(axis=0)

    def _log_likelihood(self, x, mean, var):
        """Return the per-feature Gaussian log-density log P(x | C).

        Working in log space avoids the underflow of ``log(exp(...))``
        for points far from the class mean.
        """
        var = var + self._EPS
        return -0.5 * (np.log(2.0 * np.pi * var) + ((x - mean) ** 2) / var)

    def _calculate_likelihood(self, x, mean, var):
        """Return the per-feature Gaussian density P(x | C).

        The same variance floor is applied to both the normalising
        coefficient and the exponent. May underflow to 0 for distant
        points; ``predict`` uses ``_log_likelihood`` instead.
        """
        return np.exp(self._log_likelihood(x, mean, var))

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Float array of shape (n_samples,) holding the predicted labels.
        """
        X = np.asarray(X, dtype=float)

        # log_joint[i, c] = log P(C_c) + sum_j log P(x_ij | C_c)
        log_joint = np.empty((X.shape[0], self.n_classes))
        for c in range(self.n_classes):
            per_feature = self._log_likelihood(X, self.means[c, :], self.vars[c, :])
            log_joint[:, c] = np.log(self.priors[c]) + per_feature.sum(axis=1)

        # Select the class with the highest posterior; ties go to the
        # lowest class index (np.argmax returns the first maximum).
        y_pred = np.zeros(X.shape[0])
        y_pred[:] = self.classes[np.argmax(log_joint, axis=1)]
        return y_pred

def k_fold_split(X, y, k=10, random_state=None):
    """Randomly partition ``X`` and ``y`` into ``k`` folds of near-equal size.

    Samples are shuffled, then split so that fold sizes differ by at most
    one (the first ``len(X) % k`` folds receive the extra sample).

    Args:
        X: array-like of samples, indexed along the first axis.
        y: array-like of labels with the same length as ``X``.
        k: number of folds; must satisfy ``1 <= k <= len(X)``.
        random_state: optional seed for a reproducible shuffle. When None,
            NumPy's global random state is used.

    Returns:
        ``(X_folds, y_folds)``: two lists of ``k`` arrays each, where
        ``X_folds[i]`` and ``y_folds[i]`` hold the same samples.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length or ``k`` is out of
            range (which would otherwise yield empty folds).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and {n_samples}, got {k}")

    if random_state is None:
        indices = np.random.permutation(n_samples)
    else:
        indices = np.random.RandomState(random_state).permutation(n_samples)

    X_folds = []
    y_folds = []
    # array_split spreads the remainder over the leading folds instead of
    # dumping it all into the last one.
    for fold_indices in np.array_split(indices, k):
        X_folds.append(X[fold_indices])
        y_folds.append(y[fold_indices])

    return X_folds, y_folds

def calculate_metrics(y_true, y_pred, n_classes=3):
    """Compute multi-class evaluation metrics for one set of predictions.

    Labels are truncated to integers and used directly as confusion-matrix
    indices, so they must lie in ``0 .. n_classes - 1``.

    Args:
        y_true: array-like of true labels.
        y_pred: array-like of predicted labels, same length as ``y_true``.
        n_classes: number of classes (size of the confusion matrix).

    Returns:
        dict with keys:
            'accuracy', 'correct', 'incorrect': overall hit statistics.
            'rmse', 'rae': root-mean-squared and relative absolute error,
                computed on the numeric label values.
            'tpr', 'fpr': one-vs-rest true/false positive rates averaged
                over classes (a rate with a zero denominator counts as 0).
            'kappa': Cohen's kappa; 0.0 when expected agreement is 1, which
                would otherwise be a division by zero.
            'confusion_matrix': ``(n_classes, n_classes)`` float array,
                rows = true label, columns = predicted label.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    conf_matrix = np.zeros((n_classes, n_classes))
    # np.add.at accumulates correctly when an index pair repeats.
    np.add.at(conf_matrix, (y_true.astype(int), y_pred.astype(int)), 1)

    correct = np.sum(y_true == y_pred)
    incorrect = len(y_true) - correct
    accuracy = correct / len(y_true)

    total = np.sum(conf_matrix)
    row_sums = conf_matrix.sum(axis=1)
    col_sums = conf_matrix.sum(axis=0)

    # One-vs-rest counts for every class at once.
    tp = np.diag(conf_matrix)
    fn = row_sums - tp
    fp = col_sums - tp
    tn = total - tp - fp - fn

    tpr = np.divide(tp, tp + fn, out=np.zeros(n_classes), where=(tp + fn) > 0)
    fpr = np.divide(fp, fp + tn, out=np.zeros(n_classes), where=(fp + tn) > 0)

    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    mae = np.mean(np.abs(y_true - y_pred))
    baseline_mae = np.mean(np.abs(y_true - np.mean(y_true)))
    rae = mae / baseline_mae if baseline_mae != 0 else 0

    # Chance agreement: sum over classes of P(true = i) * P(pred = i).
    expected_accuracy = np.sum(row_sums * col_sums) / total ** 2

    # If every sample falls in one diagonal cell, expected agreement is 1
    # and kappa is undefined; report 0.0 rather than nan.
    kappa_denominator = 1 - expected_accuracy
    if kappa_denominator > 0:
        kappa = (accuracy - expected_accuracy) / kappa_denominator
    else:
        kappa = 0.0

    return {
        'accuracy': accuracy,
        'correct': correct,
        'incorrect': incorrect,
        'rmse': rmse,
        'rae': rae,
        'tpr': np.mean(tpr),
        'fpr': np.mean(fpr),
        'kappa': kappa,
        'confusion_matrix': conf_matrix
    }


# Evaluate GaussianNaiveBayes with k-fold cross-validation: each fold serves
# once as the test set while the remaining folds are merged for training.
k = 10
X_folds, y_folds = k_fold_split(X, y, k)
metrics_per_fold = []

for fold, (X_test, y_test) in enumerate(zip(X_folds, y_folds)):
    # Training data = every fold except the current one.
    X_train = np.concatenate(X_folds[:fold] + X_folds[fold + 1:])
    y_train = np.concatenate(y_folds[:fold] + y_folds[fold + 1:])

    nb = GaussianNaiveBayes()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    fold_metrics = calculate_metrics(y_test, y_pred)
    metrics_per_fold.append(fold_metrics)

    # Per-fold report.
    print(f"\nFold {fold + 1} Results:")
    print(f"Correctly Classified Instances: {fold_metrics['correct']}")
    print(f"Incorrectly Classified Instances: {fold_metrics['incorrect']}")
    print(f"Accuracy: {fold_metrics['accuracy']:.4f}")
    print(f"Root Mean Squared Error: {fold_metrics['rmse']:.4f}")
    print(f"Relative Absolute Error: {fold_metrics['rae']:.4f}")
    print(f"True Positive Rate: {fold_metrics['tpr']:.4f}")
    print(f"False Positive Rate: {fold_metrics['fpr']:.4f}")
    print(f"Kappa Score: {fold_metrics['kappa']:.4f}")
    print("\nConfusion Matrix:")
    print(fold_metrics['confusion_matrix'])

# Unweighted mean of each scalar metric over the folds.
print("\nAverage Metrics Across All Folds:")
avg_metrics = {
    name: np.mean([m[name] for m in metrics_per_fold])
    for name in ('accuracy', 'rmse', 'rae', 'tpr', 'fpr', 'kappa')
}

print(f"Average Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Average RMSE: {avg_metrics['rmse']:.4f}")
print(f"Average RAE: {avg_metrics['rae']:.4f}")
print(f"Average TPR: {avg_metrics['tpr']:.4f}")
print(f"Average FPR: {avg_metrics['fpr']:.4f}")
print(f"Average Kappa Score: {avg_metrics['kappa']:.4f}")


Fold 1 Results:
Correctly Classified Instances: 15
Incorrectly Classified Instances: 0
Accuracy: 1.0000
Root Mean Squared Error: 0.0000
Relative Absolute Error: 0.0000
True Positive Rate: 1.0000
False Positive Rate: 0.0000
Kappa Score: 1.0000

Confusion Matrix:
[[9. 0. 0.]
 [0. 4. 0.]
 [0. 0. 2.]]

Fold 2 Results:
Correctly Classified Instances: 15
Incorrectly Classified Instances: 0
Accuracy: 1.0000
Root Mean Squared Error: 0.0000
Relative Absolute Error: 0.0000
True Positive Rate: 1.0000
False Positive Rate: 0.0000
Kappa Score: 1.0000

Confusion Matrix:
[[5. 0. 0.]
 [0. 4. 0.]
 [0. 0. 6.]]

Fold 3 Results:
Correctly Classified Instances: 12
Incorrectly Classified Instances: 3
Accuracy: 0.8000
Root Mean Squared Error: 0.4472
Relative Absolute Error: 0.4018
True Positive Rate: 0.8056
False Positive Rate: 0.1032
Kappa Score: 0.6809

Confusion Matrix:
[[4. 0. 0.]
 [0. 6. 2.]
 [0. 1. 2.]]

Fold 4 Results:
Correctly Classified Instances: 14
Incorrectly Classified Instances: 1
Accuracy: 0.

In [7]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 95 from the UCI ML repository (presumably SPECT Heart,
# going by the variable name -- confirm) and convert it to NumPy arrays.
spect_heart = fetch_ucirepo(id=95)
dataset = spect_heart.data
X = dataset.features.to_numpy()
# ravel() flattens the targets into a 1-D label vector.
y = dataset.targets.to_numpy().ravel()

class BernoulliNaiveBayes:
    """Bernoulli naive Bayes classifier for binary features.

    A feature counts as "on" when its value equals 1; any other value is
    treated as "off". Per-class feature probabilities use additive
    (Laplace) smoothing.

    Attributes set by ``fit``:
        classes:       sorted unique labels seen in ``y``.
        n_classes:     number of distinct labels.
        n_features:    number of columns in ``X``.
        class_priors:  dict label -> P(C).
        feature_probs: dict label -> ``(n_features,)`` array of P(x_j = 1 | C).
    """

    def __init__(self, alpha=1.0):
        """Create the classifier.

        Args:
            alpha: additive smoothing strength (1.0 = Laplace smoothing).
                With ``alpha=0`` a feature never or always on within a
                class gets probability exactly 0 or 1.

        Raises:
            ValueError: if ``alpha`` is negative.
        """
        if alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {alpha}")
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature probabilities.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.n_classes = len(self.classes)
        self.n_features = X.shape[1]

        self.class_priors = {}
        self.feature_probs = {}
        for c in self.classes:
            in_class = y == c
            X_c = X[in_class]

            self.class_priors[c] = np.sum(in_class) / float(len(y))

            # Smoothed estimate: (count of ones + alpha) / (n_c + 2 * alpha);
            # the factor 2 covers the two outcomes of a binary feature.
            feature_counts = np.sum(X_c == 1, axis=0) + self.alpha
            total_samples = X_c.shape[0] + 2 * self.alpha
            self.feature_probs[c] = feature_counts / total_samples

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).

        Returns:
            Float array of shape (n_samples,) holding the predicted labels.
        """
        X = np.asarray(X)
        is_on = X == 1

        # log_joint[i, k] = log P(C_k) + sum_j log P(x_ij | C_k)
        log_joint = np.empty((X.shape[0], self.n_classes))
        for idx, c in enumerate(self.classes):
            log_on = np.log(self.feature_probs[c])
            log_off = np.log(1 - self.feature_probs[c])
            # np.where selects the matching term per feature, so an "off"
            # feature never multiplies a -inf "on" term (and vice versa).
            per_feature = np.where(is_on, log_on, log_off)
            log_joint[:, idx] = np.log(self.class_priors[c]) + per_feature.sum(axis=1)

        # Ties go to the first class in sorted order (np.argmax semantics).
        y_pred = np.zeros(X.shape[0])
        y_pred[:] = self.classes[np.argmax(log_joint, axis=1)]
        return y_pred



def calculate_metrics(y_true, y_pred):
    """Compute binary-classification metrics for labels coded as 0 and 1.

    Args:
        y_true: sequence of true labels.
        y_pred: sequence of predicted labels, same length as ``y_true``.

    Returns:
        dict with 'accuracy', 'precision', 'recall', 'f1_score',
        'confusion_matrix' (``[[TN, FP], [FN, TP]]``), 'kappa', 'correct',
        'incorrect', 'tpr' and 'fpr'. Any ratio whose denominator is zero
        is reported as 0.0.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    total = len(y_true)

    # Overall hit count, kept as a plain Python int.
    correct = int(np.count_nonzero(y_true == y_pred))
    accuracy = correct / total

    # Boolean masks for each side of the confusion matrix.
    actual_pos = y_true == 1
    actual_neg = y_true == 0
    predicted_pos = y_pred == 1
    predicted_neg = y_pred == 0

    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    TN = int(np.count_nonzero(actual_neg & predicted_neg))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    FN = int(np.count_nonzero(actual_pos & predicted_neg))

    # Precision: share of predicted positives that are truly positive.
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0

    # Recall (true positive rate): share of actual positives recovered.
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    # F1: harmonic mean of precision and recall.
    if (precision + recall) > 0:
        f1_score = 2 * precision * recall / (precision + recall)
    else:
        f1_score = 0.0

    # False positive rate: FP / (FP + TN).
    fpr = FP / (FP + TN) if (FP + TN) > 0 else 0.0

    confusion_matrix = np.array([[TN, FP], [FN, TP]])

    # Cohen's kappa = (observed - expected agreement) / (1 - expected).
    total_samples = total
    expected_accuracy = (((TN + FP) * (TN + FN)) + ((FN + TP) * (FP + TP))) / (total_samples * total_samples)
    kappa = 0.0
    if (1 - expected_accuracy) > 0:
        kappa = (accuracy - expected_accuracy) / (1 - expected_accuracy)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'confusion_matrix': confusion_matrix,
        'kappa': kappa,
        'correct': correct,
        'incorrect': total - correct,
        'tpr': recall,
        'fpr': fpr
    }


# Build k stratified folds: each class is shuffled and sliced separately so
# every fold receives a share of every class.
k = 10
unique_classes = np.unique(y)
class_indices = {c: np.where(y == c)[0] for c in unique_classes}

# One (initially empty) bucket per fold, created up front.
X_folds = [[] for _ in range(k)]
y_folds = [[] for _ in range(k)]

for c in unique_classes:
    indices = class_indices[c]
    np.random.shuffle(indices)  # in-place shuffle of this class's indices
    fold_size = len(indices) // k
    for i in range(k):
        start_idx = i * fold_size
        # The last fold absorbs whatever is left after integer division.
        end_idx = len(indices) if i == k - 1 else start_idx + fold_size
        fold_indices = indices[start_idx:end_idx]
        X_folds[i].extend(X[fold_indices])
        y_folds[i].extend(y[fold_indices])

# Freeze the per-fold row lists into arrays.
X_folds = [np.array(fold) for fold in X_folds]
y_folds = [np.array(fold) for fold in y_folds]

# Evaluate BernoulliNaiveBayes on the prepared folds: each fold serves once
# as the test set while the remaining folds are stacked for training.
metrics_per_fold = []

for fold, (X_test, y_test) in enumerate(zip(X_folds, y_folds)):
    # Training data = every fold except the current one.
    X_train = np.vstack(X_folds[:fold] + X_folds[fold + 1:])
    y_train = np.concatenate(y_folds[:fold] + y_folds[fold + 1:])

    nb = BernoulliNaiveBayes()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    fold_metrics = calculate_metrics(y_test, y_pred)
    metrics_per_fold.append(fold_metrics)

    # Per-fold report.
    print(f"\nFold {fold + 1} Results:")
    print(f"Correctly Classified Instances: {fold_metrics['correct']}")
    print(f"Incorrectly Classified Instances: {fold_metrics['incorrect']}")
    print(f"Accuracy: {fold_metrics['accuracy']:.4f}")
    print(f"Precision: {fold_metrics['precision']:.4f}")
    print(f"Recall: {fold_metrics['recall']:.4f}")
    print(f"F1 Score: {fold_metrics['f1_score']:.4f}")
    print(f"True Positive Rate: {fold_metrics['tpr']:.4f}")
    print(f"False Positive Rate: {fold_metrics['fpr']:.4f}")
    print(f"Kappa Score: {fold_metrics['kappa']:.4f}")
    print("\nConfusion Matrix:")
    print(fold_metrics['confusion_matrix'])

# Unweighted mean of each scalar metric over the folds.
avg_metrics = {
    name: np.mean([m[name] for m in metrics_per_fold])
    for name in ('accuracy', 'precision', 'recall', 'f1_score', 'tpr', 'fpr', 'kappa')
}

print("\nAverage Metrics Across All Folds:")
print(f"Average Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Average Precision: {avg_metrics['precision']:.4f}")
print(f"Average Recall: {avg_metrics['recall']:.4f}")
print(f"Average F1 Score: {avg_metrics['f1_score']:.4f}")
print(f"Average TPR: {avg_metrics['tpr']:.4f}")
print(f"Average FPR: {avg_metrics['fpr']:.4f}")
print(f"Average Kappa Score: {avg_metrics['kappa']:.4f}")



Fold 1 Results:
Correctly Classified Instances: 24
Incorrectly Classified Instances: 2
Accuracy: 0.9231
Precision: 1.0000
Recall: 0.9048
F1 Score: 0.9500
True Positive Rate: 0.9048
False Positive Rate: 0.0000
Kappa Score: 0.7851

Confusion Matrix:
[[ 5  0]
 [ 2 19]]

Fold 2 Results:
Correctly Classified Instances: 21
Incorrectly Classified Instances: 5
Accuracy: 0.8077
Precision: 1.0000
Recall: 0.7619
F1 Score: 0.8649
True Positive Rate: 0.7619
False Positive Rate: 0.0000
Kappa Score: 0.5517

Confusion Matrix:
[[ 5  0]
 [ 5 16]]

Fold 3 Results:
Correctly Classified Instances: 21
Incorrectly Classified Instances: 5
Accuracy: 0.8077
Precision: 0.9000
Recall: 0.8571
F1 Score: 0.8780
True Positive Rate: 0.8571
False Positive Rate: 0.4000
Kappa Score: 0.4248

Confusion Matrix:
[[ 3  2]
 [ 3 18]]

Fold 4 Results:
Correctly Classified Instances: 23
Incorrectly Classified Instances: 3
Accuracy: 0.8846
Precision: 0.9500
Recall: 0.9048
F1 Score: 0.9268
True Positive Rate: 0.9048
False Positive