In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

import numpy as np
np.random.seed(13)
class MultinomialNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.class_probs = {}
        self.feature_probs = {}

        # Calculate class probabilities
        for cls in self.classes:
            # Calculate prior probability of each class
            self.class_probs[cls] = np.sum(y == cls) / len(y)

            # Extract data for the current class
            X_cls = X[y == cls]
            
            # Calculate feature probabilities for each class
            self.feature_probs[cls] = {}
            for feature in range(X.shape[1]):
                feature_values = X_cls[:, feature]
                total_count = np.sum(feature_values)
                self.feature_probs[cls][feature] = {
                    'prob': (feature_values.sum() + 1) / (total_count + len(np.unique(X[:, feature]))),
                    'count': total_count
                }

    def predict(self, X):
        predictions = []
        for instance in X:
            max_prob = -1
            prediction = None
            for cls in self.classes:
                prob = np.log(self.class_probs[cls])
                for feature, value in enumerate(instance):
                    if value > 0:  # Considering only non-zero features
                        prob += np.log(self.feature_probs[cls][feature]['prob'])

                if prob > max_prob or max_prob == -1:
                    max_prob = prob
                    prediction = cls

            predictions.append(prediction)
        return predictions


# Load the dataset
data = pd.read_csv('./diabetes_prediction_dataset.csv')
# np.random.shuffle(data)
data = pd.get_dummies(data, columns=['gender','smoking_history'])

# Separate features and target variable
X = data.drop('diabetes', axis=1)  # Features
y = data['diabetes']  # Target variable

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize Gaussian Naïve Bayes classifier
gnb = MultinomialNB()

# Train the classifier using the training data
gnb.fit(X_train, y_train)

# Make predictions on the test data
y_pred = gnb.predict(X_test)

# Calculate accuracy
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.9006666666666666


In [3]:
data.columns

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [2]:
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

import numpy as np

def nb_class(train_feat, train_lab, val_feat):
    n_train_samples, n_feat = train_feat.shape
    n_classes = len(np.unique(train_lab))
    class_pr = {}
    naive_probs = {}

    # Calculate class probabilities
    for cl_lab in range(n_classes):
        samples_cl = train_feat[train_lab == cl_lab]
        class_pr[cl_lab] = len(samples_cl) / n_train_samples

        naive_probs[cl_lab] = {}
        for feat_idx in range(n_feat):
            feat_values = samples_cl[:, feat_idx]
            unique_vals, counts = np.unique(feat_values, return_counts=True)
            total_vals = len(feat_values)
            probs = (counts + 1) / (total_vals + len(unique_vals))
            naive_probs[cl_lab][feat_idx] = dict(zip(unique_vals, probs))

    predictions = []
    # Make predictions for each sample in the validation set
    for sample in val_feat:
        class_scores = []
        for cl_lab in range(n_classes):
            log_likelihood = np.sum([np.log(naive_probs[cl_lab][feat].get(value, 1e-10))
                                     for feat, value in enumerate(sample)])
            class_scores.append(np.log(class_pr[cl_lab]) + log_likelihood)

        # Choose the class with the highest score as the predicted class
        predictions.append(np.argmax(class_scores))

    return np.array(predictions)


def evaluate(true_lab, pred_lab):
    def comp_acc(actual_lab, pred_lab):
        num_correct = sum(1 for i in range(len(actual_lab)) if actual_lab[i] == pred_lab[i])
        return num_correct / float(len(actual_lab)) * 100.0

    acc = comp_acc(true_lab, pred_lab)
    conf_matrix = confusion_matrix(true_lab, pred_lab)
    
    return acc, conf_matrix


data = np.genfromtxt('./diabetes_prediction_dataset.csv', delimiter=',')
np.random.seed(13)
np.random.shuffle(data)

n_samples = len(data)
train_size = int(np.ceil(2/3 * n_samples))

train_data = data[:train_size]
val_data = data[train_size:]

X_train = train_data[:, :-2]
Y_train = train_data[:, -1]
X_valid = val_data[:, :-2]
Y_valid = val_data[:, -1]

means = np.nanmean(X_train, axis=0)
X_train_bin = (X_train > means).astype(int)
X_valid_bin = (X_valid > means).astype(int)

Y_valid_pred = nb_class(X_train_bin, Y_train.astype(int), X_valid_bin)

accuracy, confusion = evaluate(Y_valid, Y_valid_pred)

print(f'Validation Accuracy: {accuracy:.4f}')
print('Confusion Matrix:')
print(confusion)


  means = np.nanmean(X_train, axis=0)
  return x.astype(dtype, copy=copy, casting=casting)


ValueError: Input y_true contains NaN.

In [20]:
from sklearn.metrics import classification_report
print(classification_report(Y_valid, Y_valid_pred,zero_division=1))


              precision    recall  f1-score   support

 -2147483648       1.00      0.00      0.00         1
           0       0.94      0.97      0.95     30455
           1       0.45      0.29      0.36      2877

    accuracy                           0.91     33333
   macro avg       0.80      0.42      0.44     33333
weighted avg       0.89      0.91      0.90     33333



In [32]:
print(confusion_matrix(Y_valid, Y_valid_pred))


[[    0     1     0]
 [    0 29443  1012]
 [    0  2037   840]]


In [39]:
data = pd.read_csv('./diabetes_prediction_dataset.csv')
data['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

In [40]:
pd.DataFrame(Y_valid).value_counts()

 0             30455
 1              2877
-2147483648        1
Name: count, dtype: int64