In [37]:
import pandas as pd

df = pd.read_csv("../../datasets/pima_indian_diabetes_dataset/cleaned_dataset.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Outcome
0,0,129,110,46,130,67.1,0.319,26,1
1,0,180,78,63,14,59.4,2.42,25,1
2,3,123,100,35,240,57.3,0.88,22,0
3,1,88,30,42,99,55.0,0.496,26,1
4,0,162,76,56,100,53.2,0.759,25,1


In [38]:
from sklearn.model_selection import train_test_split

X = df.drop('Outcome', axis=1)
y = df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save X_train and y_train merged as one CSV
train = X_train.copy()
train['Outcome'] = y_train
train.to_csv("train.csv", index=True)

# Save X_test and y_test merged as one CSV
test = X_test.copy()
test['Outcome'] = y_test
test.to_csv("test.csv", index=True)


PermissionError: [Errno 13] Permission denied: 'train.csv'

In [None]:
from scipy.stats import chi2_contingency
import numpy as np

thresholds = {}

for col in X_test.columns:
    best_p = 1.0
    best_thresh = None
    values = X_test[col].values
    for thresh in np.unique(values):
        binarized = (values >= thresh).astype(int)
        contingency = pd.crosstab(binarized, y_test)
        if contingency.shape == (2, 2):  # Only test if both classes present
            _, p, _, _ = chi2_contingency(contingency)
            if p < best_p:
                best_p = p
                best_thresh = thresh
    thresholds[col] = best_thresh

for col, thresh in thresholds.items():
    print(f"{col}: threshold = {thresh}")

Pregnancies: threshold = 7
Glucose: threshold = 125
Blood Pressure: threshold = 84
Skin Thickness: threshold = 24
Insulin: threshold = 79
BMI: threshold = 31.1
Diabetes Pedigree Function: threshold = 0.403
Age: threshold = 30


In [None]:
for col in X_test.columns:
    thresh = thresholds[col]
    binarized = (X_test[col].values >= thresh).astype(int)
    print(f"{col}: shape = {binarized.shape}, counts = {np.bincount(binarized)}")

Pregnancies: shape = (79,), counts = [61 18]
Glucose: shape = (79,), counts = [48 31]
Blood Pressure: shape = (79,), counts = [62 17]
Skin Thickness: shape = (79,), counts = [22 57]
Insulin: shape = (79,), counts = [19 60]
BMI: shape = (79,), counts = [25 54]
Diabetes Pedigree Function: shape = (79,), counts = [38 41]
Age: shape = (79,), counts = [43 36]


In [None]:
class_priors = y_train.value_counts(normalize=True)
print(class_priors)

Outcome
0    0.690096
1    0.309904
Name: proportion, dtype: float64


In [None]:
# Compute likelihoods P(feature=1|class) and P(feature=0|class) for all features in X_test
likelihoods = {}

for col in X_test.columns:
    thresh = thresholds[col]
    binarized = (X_train[col].values >= thresh).astype(int)
    feature_likelihood = {}
    for cls in [0, 1]:
        cls_mask = (y_train == cls)
        total = cls_mask.sum()
        # Laplace smoothing to avoid zero probabilities
        p1 = (binarized[cls_mask].sum() + 1) / (total + 2)
        p0 = (total - binarized[cls_mask].sum() + 1) / (total + 2)
        feature_likelihood[cls] = {'P(feature=1|class)': p1, 'P(feature=0|class)': p0}
    likelihoods[col] = feature_likelihood

for col, probs in likelihoods.items():
    print(f"{col}: {probs}")

Pregnancies: {0: {'P(feature=1|class)': 0.10091743119266056, 'P(feature=0|class)': 0.8990825688073395}, 1: {'P(feature=1|class)': 0.29292929292929293, 'P(feature=0|class)': 0.7070707070707071}}
Glucose: {0: {'P(feature=1|class)': 0.29357798165137616, 'P(feature=0|class)': 0.7064220183486238}, 1: {'P(feature=1|class)': 0.7575757575757576, 'P(feature=0|class)': 0.24242424242424243}}
Blood Pressure: {0: {'P(feature=1|class)': 0.11926605504587157, 'P(feature=0|class)': 0.8807339449541285}, 1: {'P(feature=1|class)': 0.20202020202020202, 'P(feature=0|class)': 0.797979797979798}}
Skin Thickness: {0: {'P(feature=1|class)': 0.5963302752293578, 'P(feature=0|class)': 0.4036697247706422}, 1: {'P(feature=1|class)': 0.8080808080808081, 'P(feature=0|class)': 0.1919191919191919}}
Insulin: {0: {'P(feature=1|class)': 0.6422018348623854, 'P(feature=0|class)': 0.3577981651376147}, 1: {'P(feature=1|class)': 0.9292929292929293, 'P(feature=0|class)': 0.0707070707070707}}
BMI: {0: {'P(feature=1|class)': 0.5, 

In [None]:
sample = X_test.sample(n=1, random_state=42)
print(sample)

     Pregnancies  Glucose  Blood Pressure  Skin Thickness  Insulin   BMI  \
155            3      170              64              37      225  34.5   

     Diabetes Pedigree Function  Age  
155                       0.356   30  


In [None]:
# Print binarization thresholds
print("Binarization thresholds:")
for col, thresh in thresholds.items():
    print(f"{col}: {thresh}")

# Binarize the selected sample using the thresholds
sample_binarized = sample.copy()
for col in sample.columns:
    sample_binarized[col] = (sample[col] >= thresholds[col]).astype(int)
print("\nBinarized sample:")
print(sample_binarized)

Binarization thresholds:
Pregnancies: 7
Glucose: 125
Blood Pressure: 84
Skin Thickness: 24
Insulin: 79
BMI: 31.1
Diabetes Pedigree Function: 0.403
Age: 30

Binarized sample:
     Pregnancies  Glucose  Blood Pressure  Skin Thickness  Insulin  BMI  \
155            0        1               0               1        1    1   

     Diabetes Pedigree Function  Age  
155                           0    1  


In [None]:
import pandas as pd
from IPython.display import display

# Calculate posterior probabilities for the binarized sample, feature by feature
posteriors_table = []

for col in sample_binarized.columns:
    feature_val = sample_binarized.iloc[0][col]
    probs = likelihoods[col]
    row = {
        'Feature': col,
        'Value': feature_val,
        'P(feature=val|class=0)': probs[0][f'P(feature={feature_val}|class)'],
        'P(feature=val|class=1)': probs[1][f'P(feature={feature_val}|class)']
    }
    posteriors_table.append(row)

posteriors_df = pd.DataFrame(posteriors_table)
display(posteriors_df)

Unnamed: 0,Feature,Value,P(feature=val|class=0),P(feature=val|class=1)
0,Pregnancies,0,0.899083,0.707071
1,Glucose,1,0.293578,0.757576
2,Blood Pressure,0,0.880734,0.79798
3,Skin Thickness,1,0.59633,0.808081
4,Insulin,1,0.642202,0.929293
5,BMI,1,0.5,0.737374
6,Diabetes Pedigree Function,0,0.454128,0.353535
7,Age,1,0.284404,0.585859


In [None]:
# Calculate posterior probabilities for the binarized sample using Naive Bayes rule
# and classify based on the higher posterior

# Start with class priors
posteriors = {}
calculation_steps = []

for cls in [0, 1]:
    posterior = class_priors[cls]
    step_info = {"Class": cls, "Prior": f"{posterior:.4f}"}
    
    for col in sample_binarized.columns:
        feature_val = sample_binarized.iloc[0][col]
        prob = likelihoods[col][cls][f'P(feature={feature_val}|class)']
        step_info[col] = f"{prob:.4f}"
        posterior *= prob
    
    step_info["Product"] = f"{posterior:.6e}"
    posteriors[cls] = posterior
    calculation_steps.append(step_info)

# Normalize to get probabilities
total_post = sum(posteriors.values())
for cls in posteriors:
    posteriors[cls] /= total_post

# Add normalized probabilities to the table
for i, cls in enumerate([0, 1]):
    calculation_steps[i]["Normalized"] = f"{posteriors[cls]:.4f}"

# Classification
predicted_class = max(posteriors, key=posteriors.get)

# Get the actual outcome for comparison
actual_class = y_test.loc[sample.index[0]]

print("Posterior probabilities:", posteriors)
print("Predicted class:", predicted_class)
print("Actual class:", actual_class)
print("Prediction correct:", predicted_class == actual_class)

# Display calculation summary
print("\nNaive Bayes Calculation Summary:")
calc_df = pd.DataFrame(calculation_steps)
display(calc_df)

Posterior probabilities: {0: 0.20707311064811745, 1: 0.7929268893518825}
Predicted class: 1
Actual class: 1
Prediction correct: True

Naive Bayes Calculation Summary:


Unnamed: 0,Class,Prior,Pregnancies,Glucose,Blood Pressure,Skin Thickness,Insulin,BMI,Diabetes Pedigree Function,Age,Product,Normalized
0,0,0.6901,0.8991,0.2936,0.8807,0.5963,0.6422,0.5,0.4541,0.2844,0.003967524,0.2071
1,1,0.3099,0.7071,0.7576,0.798,0.8081,0.9293,0.7374,0.3535,0.5859,0.01519249,0.7929


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Predict for all samples in X_test
predictions = []
for idx, row in X_test.iterrows():
    binarized_row = (row >= pd.Series(thresholds)).astype(int)
    posteriors = {}
    for cls in [0, 1]:
        posterior = class_priors[cls]
        for col in X_test.columns:
            prob = likelihoods[col][cls][f'P(feature={binarized_row[col]}|class)']
            posterior *= prob
        posteriors[cls] = posterior
    # Normalize
    total_post = sum(posteriors.values())
    for cls in posteriors:
        posteriors[cls] /= total_post
    predicted = max(posteriors, key=posteriors.get)
    predictions.append(predicted)

# Convert predictions to Series with same index as y_test
predictions = pd.Series(predictions, index=y_test.index, name='Predicted')

# Confusion matrix and metrics
cm = confusion_matrix(y_test, predictions)
acc = accuracy_score(y_test, predictions)
prec = precision_score(y_test, predictions)
rec = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print("Confusion Matrix:\n", cm)
print(f"Accuracy: {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall: {rec:.4f}")
print(f"F1 Score: {f1:.4f}")

Confusion Matrix:
 [[39  7]
 [ 7 26]]
Accuracy: 0.8228
Precision: 0.7879
Recall: 0.7879
F1 Score: 0.7879


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import BernoulliNB

# Fit BernoulliNB on binarized training data
bnb = BernoulliNB()
bnb.fit((X_train >= pd.Series(thresholds)).astype(int), y_train)

# Predict on binarized test data
sklearn_predictions = bnb.predict(X_test_binarized)
sklearn_acc = (sklearn_predictions == y_test).mean()

# Compute metrics for scikit-learn BernoulliNB predictions
cm_sklearn = confusion_matrix(y_test, sklearn_predictions)
acc_sklearn = accuracy_score(y_test, sklearn_predictions)
prec_sklearn = precision_score(y_test, sklearn_predictions)
rec_sklearn = recall_score(y_test, sklearn_predictions)
f1_sklearn = f1_score(y_test, sklearn_predictions)

print("Confusion Matrix (scikit-learn):\n", cm_sklearn)
print(f"Accuracy: {acc_sklearn:.4f}")
print(f"Precision: {prec_sklearn:.4f}")
print(f"Recall: {rec_sklearn:.4f}")
print(f"F1 Score: {f1_sklearn:.4f}")

NameError: name 'X_test_binarized' is not defined

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit GaussianNB on the original (non-binarized) training data
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Predict on the original test data
gnb_predictions = gnb.predict(X_test)

# Compute metrics for GaussianNB predictions
cm_gnb = confusion_matrix(y_test, gnb_predictions)
acc_gnb = accuracy_score(y_test, gnb_predictions)
prec_gnb = precision_score(y_test, gnb_predictions)
rec_gnb = recall_score(y_test, gnb_predictions)
f1_gnb = f1_score(y_test, gnb_predictions)

print("Confusion Matrix (GaussianNB):\n", cm_gnb)
print(f"Accuracy: {acc_gnb:.4f}")
print(f"Precision: {prec_gnb:.4f}")
print(f"Recall: {rec_gnb:.4f}")
print(f"F1 Score: {f1_gnb:.4f}")

Confusion Matrix (GaussianNB):
 [[46  7]
 [ 9 17]]
Accuracy: 0.7975
Precision: 0.7083
Recall: 0.6538
F1 Score: 0.6800
