In [27]:
import pandas as pd

df = pd.read_csv("../../datasets/pima_indian_diabetes_dataset/cleaned_dataset.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Outcome
0,2,108,62,32,56,25.2,0.128,21,0
1,0,137,68,14,148,24.8,0.143,21,0
2,1,89,66,23,94,28.1,0.167,21,0
3,0,139,62,17,210,22.1,0.207,21,0
4,4,99,76,15,51,23.2,0.223,21,0


In [28]:
from sklearn.model_selection import train_test_split

X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from scipy.stats import chi2_contingency
import numpy as np

thresholds = {}

for col in X_test.columns:
    best_p = 1.0
    best_thresh = None
    values = X_test[col].values
    for thresh in np.unique(values):
        binarized = (values >= thresh).astype(int)
        contingency = pd.crosstab(binarized, y_test)
        if contingency.shape == (2, 2):  # Only test if both classes present
            _, p, _, _ = chi2_contingency(contingency)
            if p < best_p:
                best_p = p
                best_thresh = thresh
    thresholds[col] = best_thresh

for col, thresh in thresholds.items():
    print(f"{col}: threshold = {thresh}")

Pregnancies: threshold = 8
Glucose: threshold = 173
Blood Pressure: threshold = 84
Skin Thickness: threshold = 25
Insulin: threshold = 110
BMI: threshold = 27.1
Diabetes Pedigree Function: threshold = 1.034
Age: threshold = 29


In [30]:
for col in X_test.columns:
    thresh = thresholds[col]
    binarized = (X_test[col].values >= thresh).astype(int)
    print(f"{col}: shape = {binarized.shape}, counts = {np.bincount(binarized)}")

Pregnancies: shape = (79,), counts = [73  6]
Glucose: shape = (79,), counts = [69 10]
Blood Pressure: shape = (79,), counts = [65 14]
Skin Thickness: shape = (79,), counts = [28 51]
Insulin: shape = (79,), counts = [33 46]
BMI: shape = (79,), counts = [20 59]
Diabetes Pedigree Function: shape = (79,), counts = [72  7]
Age: shape = (79,), counts = [53 26]


In [31]:
class_priors = y_train.value_counts(normalize=True)
print(class_priors)

Outcome
0    0.667732
1    0.332268
Name: proportion, dtype: float64


In [32]:
# Compute likelihoods P(feature=1|class) and P(feature=0|class) for all features in X_test
likelihoods = {}

for col in X_test.columns:
    thresh = thresholds[col]
    binarized = (X_train[col].values >= thresh).astype(int)
    feature_likelihood = {}
    for cls in [0, 1]:
        cls_mask = (y_train == cls)
        total = cls_mask.sum()
        # Laplace smoothing to avoid zero probabilities
        p1 = (binarized[cls_mask].sum() + 1) / (total + 2)
        p0 = (total - binarized[cls_mask].sum() + 1) / (total + 2)
        feature_likelihood[cls] = {'P(feature=1|class)': p1, 'P(feature=0|class)': p0}
    likelihoods[col] = feature_likelihood

for col, probs in likelihoods.items():
    print(f"{col}: {probs}")

Pregnancies: {0: {'P(feature=1|class)': 0.07582938388625593, 'P(feature=0|class)': 0.9241706161137441}, 1: {'P(feature=1|class)': 0.25471698113207547, 'P(feature=0|class)': 0.7452830188679245}}
Glucose: {0: {'P(feature=1|class)': 0.02843601895734597, 'P(feature=0|class)': 0.9715639810426541}, 1: {'P(feature=1|class)': 0.2169811320754717, 'P(feature=0|class)': 0.7830188679245284}}
Blood Pressure: {0: {'P(feature=1|class)': 0.11848341232227488, 'P(feature=0|class)': 0.8815165876777251}, 1: {'P(feature=1|class)': 0.22641509433962265, 'P(feature=0|class)': 0.7735849056603774}}
Skin Thickness: {0: {'P(feature=1|class)': 0.5971563981042654, 'P(feature=0|class)': 0.4028436018957346}, 1: {'P(feature=1|class)': 0.7735849056603774, 'P(feature=0|class)': 0.22641509433962265}}
Insulin: {0: {'P(feature=1|class)': 0.45023696682464454, 'P(feature=0|class)': 0.5497630331753555}, 1: {'P(feature=1|class)': 0.839622641509434, 'P(feature=0|class)': 0.16037735849056603}}
BMI: {0: {'P(feature=1|class)': 0.7

In [33]:
sample = X_test.sample(n=1, random_state=42)
print(sample)

     Pregnancies  Glucose  Blood Pressure  Skin Thickness  Insulin   BMI  \
155            3      113              50              10       85  29.5   

     Diabetes Pedigree Function  Age  
155                       0.626   25  


In [34]:
# Print binarization thresholds
print("Binarization thresholds:")
for col, thresh in thresholds.items():
    print(f"{col}: {thresh}")

# Binarize the selected sample using the thresholds
sample_binarized = sample.copy()
for col in sample.columns:
    sample_binarized[col] = (sample[col] >= thresholds[col]).astype(int)
print("\nBinarized sample:")
print(sample_binarized)

Binarization thresholds:
Pregnancies: 8
Glucose: 173
Blood Pressure: 84
Skin Thickness: 25
Insulin: 110
BMI: 27.1
Diabetes Pedigree Function: 1.034
Age: 29

Binarized sample:
     Pregnancies  Glucose  Blood Pressure  Skin Thickness  Insulin  BMI  \
155            0        0               0               0        0    1   

     Diabetes Pedigree Function  Age  
155                           0    0  


In [35]:
import pandas as pd
from IPython.display import display

# Calculate posterior probabilities for the binarized sample, feature by feature
posteriors_table = []

for col in sample_binarized.columns:
    feature_val = sample_binarized.iloc[0][col]
    probs = likelihoods[col]
    row = {
        'Feature': col,
        'Value': feature_val,
        'P(feature=val|class=0)': probs[0][f'P(feature={feature_val}|class)'],
        'P(feature=val|class=1)': probs[1][f'P(feature={feature_val}|class)']
    }
    posteriors_table.append(row)

posteriors_df = pd.DataFrame(posteriors_table)
display(posteriors_df)

Unnamed: 0,Feature,Value,P(feature=val|class=0),P(feature=val|class=1)
0,Pregnancies,0,0.924171,0.745283
1,Glucose,0,0.971564,0.783019
2,Blood Pressure,0,0.881517,0.773585
3,Skin Thickness,0,0.402844,0.226415
4,Insulin,0,0.549763,0.160377
5,BMI,1,0.748815,0.915094
6,Diabetes Pedigree Function,0,0.943128,0.896226
7,Age,0,0.672986,0.292453


In [36]:
# Calculate posterior probabilities for the binarized sample using Naive Bayes rule
# and classify based on the higher posterior

# Start with class priors
posteriors = {}
calculation_steps = []

for cls in [0, 1]:
    posterior = class_priors[cls]
    step_info = {"Class": cls, "Prior": f"{posterior:.4f}"}
    
    for col in sample_binarized.columns:
        feature_val = sample_binarized.iloc[0][col]
        prob = likelihoods[col][cls][f'P(feature={feature_val}|class)']
        step_info[col] = f"{prob:.4f}"
        posterior *= prob
    
    step_info["Product"] = f"{posterior:.6e}"
    posteriors[cls] = posterior
    calculation_steps.append(step_info)

# Normalize to get probabilities
total_post = sum(posteriors.values())
for cls in posteriors:
    posteriors[cls] /= total_post

# Add normalized probabilities to the table
for i, cls in enumerate([0, 1]):
    calculation_steps[i]["Normalized"] = f"{posteriors[cls]:.4f}"

# Classification
predicted_class = max(posteriors, key=posteriors.get)

# Get the actual outcome for comparison
actual_class = y_test.loc[sample.index[0]]

print("Posterior probabilities:", posteriors)
print("Predicted class:", predicted_class)
print("Actual class:", actual_class)
print("Prediction correct:", predicted_class == actual_class)

# Display calculation summary
print("\nNaive Bayes Calculation Summary:")
calc_df = pd.DataFrame(calculation_steps)
display(calc_df)

Posterior probabilities: {0: 0.9770555102034504, 1: 0.0229444897965496}
Predicted class: 0
Actual class: 0
Prediction correct: True

Naive Bayes Calculation Summary:


Unnamed: 0,Class,Prior,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Product,Normalized
0,0,0.6677,0.9242,0.9716,0.8815,0.4028,0.5498,0.7488,0.9431,0.673,0.05563129,0.9771
1,1,0.3323,0.7453,0.783,0.7736,0.2264,0.1604,0.9151,0.8962,0.2925,0.001306406,0.0229


In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Predict for all samples in X_test
predictions = []
for idx, row in X_test.iterrows():
    binarized_row = (row >= pd.Series(thresholds)).astype(int)
    posteriors = {}
    for cls in [0, 1]:
        posterior = class_priors[cls]
        for col in X_test.columns:
            prob = likelihoods[col][cls][f'P(feature={binarized_row[col]}|class)']
            posterior *= prob
        posteriors[cls] = posterior
    # Normalize
    total_post = sum(posteriors.values())
    for cls in posteriors:
        posteriors[cls] /= total_post
    predicted = max(posteriors, key=posteriors.get)
    predictions.append(predicted)

# Convert predictions to Series with same index as y_test
predictions = pd.Series(predictions, index=y_test.index, name='Predicted')

# Confusion matrix and metrics
cm = confusion_matrix(y_test, predictions)
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print("Confusion Matrix:\n", cm)
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")

Confusion Matrix:
 [[50  3]
 [ 6 20]]
Accuracy: 0.8861
Precision: 0.8696
Recall: 0.7692
F1 Score: 0.8163


In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import BernoulliNB

# Fit BernoulliNB on binarized training data
bnb = BernoulliNB()
bnb.fit((X_train >= pd.Series(thresholds)).astype(int), y_train)

# Predict on binarized test data
sklearn_predictions = bnb.predict(X_test_binarized)
sklearn_acc = (sklearn_predictions == y_test).mean()

# Compute metrics for scikit-learn BernoulliNB predictions
cm_sklearn = confusion_matrix(y_test, sklearn_predictions)
acc_sklearn = accuracy_score(y_test, sklearn_predictions)
prec_sklearn = precision_score(y_test, sklearn_predictions)
rec_sklearn = recall_score(y_test, sklearn_predictions)
f1_sklearn = f1_score(y_test, sklearn_predictions)

print("Confusion Matrix (scikit-learn):\n", cm_sklearn)
print(f"Accuracy: {acc_sklearn:.4f}")
print(f"Precision: {prec_sklearn:.4f}")
print(f"Recall: {rec_sklearn:.4f}")
print(f"F1 Score: {f1_sklearn:.4f}")

Confusion Matrix (scikit-learn):
 [[50  3]
 [ 6 20]]
Accuracy: 0.8861
Precision: 0.8696
Recall: 0.7692
F1 Score: 0.8163


In [39]:
from sklearn.naive_bayes import GaussianNB

# Fit GaussianNB on the original (non-binarized) training data
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict on the original test data
gnb_predictions = gnb.predict(X_test)

# Compute metrics for GaussianNB predictions
cm_gnb = confusion_matrix(y_test, gnb_predictions)
acc_gnb = accuracy_score(y_test, gnb_predictions)
prec_gnb = precision_score(y_test, gnb_predictions)
rec_gnb = recall_score(y_test, gnb_predictions)
f1_gnb = f1_score(y_test, gnb_predictions)

print("Confusion Matrix (GaussianNB):\n", cm_gnb)
print(f"Accuracy: {acc_gnb:.4f}")
print(f"Precision: {prec_gnb:.4f}")
print(f"Recall: {rec_gnb:.4f}")
print(f"F1 Score: {f1_gnb:.4f}")

Confusion Matrix (GaussianNB):
 [[46  7]
 [ 9 17]]
Accuracy: 0.7975
Precision: 0.7083
Recall: 0.6538
F1 Score: 0.6800
