# Bayes Classifier for Breast Cancer Detection

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

- <b> Gaussian Naive Bayes Classifier From Scratch </b>

In [2]:
class GaussianNaiveBayesClassifier:
    """Gaussian naive Bayes classifier implemented with NumPy.

    Every feature is modelled as an independent univariate Gaussian per
    class. Classes are selected with ``y_train == i`` for
    ``i in range(n_classes)``, so labels must be the integers
    ``0 .. n_classes - 1``.
    """

    # Floor for per-feature standard deviations: a feature that is constant
    # within a class would otherwise produce a division by zero (NaN density).
    _MIN_STD = 1e-9

    def __init__(self, df):
        """Store the data frame and derive the number of classes.

        Args:
            df: pandas DataFrame with a ``'diagnosis'`` column; the number of
                distinct values in that column becomes ``n_classes``.
        """
        self.df = df
        self.n_classes = len(np.unique(df['diagnosis']))

    def fit(self, X_train, y_train):
        """Estimate per-class Gaussian parameters and class priors.

        Args:
            X_train: 2-D NumPy array, one row per sample.
            y_train: 1-D NumPy array of integer labels aligned with X_train.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.set_gaussian_parameters()
        self.set_prior()

    def set_gaussian_parameters(self):
        """Compute the per-class, per-feature mean and standard deviation."""
        self.mean = []
        self.std = []
        for i in range(self.n_classes):
            X = self.X_train[self.y_train == i]
            self.mean.append(np.mean(X, axis=0))
            # Clamp so that zero-variance features stay usable.
            self.std.append(np.maximum(np.std(X, axis=0), self._MIN_STD))

    def set_prior(self):
        """Compute each class prior as its fraction of the training rows."""
        n_samples = len(self.X_train)
        self.prior = [
            np.count_nonzero(self.y_train == i) / n_samples
            for i in range(self.n_classes)
        ]

    def get_log_likelihood(self, x, i):
        """Return the per-feature Gaussian log-density of x under class i."""
        z = (x - self.mean[i]) / self.std[i]
        return -0.5 * z ** 2 - np.log(self.std[i]) - 0.5 * np.log(2 * np.pi)

    def get_likelihood(self, x, i):
        """Return the per-feature Gaussian density of x under class i."""
        return np.exp(self.get_log_likelihood(x, i))

    def classify_sample(self, x):
        """Return the index of the class with the highest posterior for x.

        Scores are accumulated in log space: multiplying many small densities
        underflows to 0.0 for every class, which would make the argmax
        meaningless. Ties resolve to the lowest class index.
        """
        # log(0) for an empty class is a legitimate -inf score, not an error.
        with np.errstate(divide='ignore'):
            log_posterior = [
                np.sum(self.get_log_likelihood(x, i)) + np.log(self.prior[i])
                for i in range(self.n_classes)
            ]
        return int(np.argmax(log_posterior))

    def predict(self, X_test):
        """Return a list with the predicted class index for each row."""
        return [self.classify_sample(x) for x in X_test]

In [3]:
# Load the dataset and separate the 'diagnosis' target from the features.
df = pd.read_csv('./Data/Breast_cancer_data.csv')
y = df['diagnosis'].to_numpy()
X = df.drop(columns=['diagnosis']).to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3106,
)

In [4]:
# Train the from-scratch Gaussian naive Bayes model on the training split and
# predict a class index for every row of the test split.
GNB = GaussianNaiveBayesClassifier(df)
GNB.fit(X_train, y_train)
y_predicted = GNB.predict(X_test)

In [5]:
# Confusion matrix for the from-scratch naive Bayes predictions
# (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_test, y_predicted))

[[33 10]
 [ 1 70]]


In [6]:
# Per-class precision, recall, F1 and support for the from-scratch naive
# Bayes predictions.
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.97      0.77      0.86        43
           1       0.88      0.99      0.93        71

    accuracy                           0.90       114
   macro avg       0.92      0.88      0.89       114
weighted avg       0.91      0.90      0.90       114



- <b> Gaussian Naive Bayes Classifier Using `scikit-learn` Library </b>

In [7]:
# Reference model: scikit-learn's GaussianNB fitted on the same training
# split (fit returns the estimator, so construction and fitting are chained).
GNB = GaussianNB().fit(X_train, y_train)
y_predicted = GNB.predict(X_test)

In [8]:
# Confusion matrix for the scikit-learn GaussianNB predictions
# (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_test, y_predicted))

[[32 11]
 [ 1 70]]


In [9]:
# Per-class precision, recall, F1 and support for the scikit-learn GaussianNB
# predictions.
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.97      0.74      0.84        43
           1       0.86      0.99      0.92        71

    accuracy                           0.89       114
   macro avg       0.92      0.87      0.88       114
weighted avg       0.90      0.89      0.89       114



- <b> Gaussian Optimal Bayes Classifier From Scratch </b>

In [10]:
class GaussianOptimalBayesClassifier:
    """Bayes classifier with one full-covariance Gaussian per class.

    Unlike the naive variant, features are not assumed independent: each
    class is modelled by a multivariate normal with its own mean vector and
    covariance matrix. Classes are selected with ``y_train == i`` for
    ``i in range(n_classes)``, so labels must be the integers
    ``0 .. n_classes - 1``.
    """

    def __init__(self, df):
        """Store the data frame and derive the number of classes.

        Args:
            df: pandas DataFrame with a ``'diagnosis'`` column; the number of
                distinct values in that column becomes ``n_classes``.
        """
        self.df = df
        self.n_classes = len(np.unique(df['diagnosis']))

    def fit(self, X_train, y_train):
        """Estimate per-class mean, covariance and prior.

        Args:
            X_train: 2-D NumPy array, one row per sample.
            y_train: 1-D NumPy array of integer labels aligned with X_train.
        """
        self.X_train = X_train
        self.y_train = y_train
        self.set_gaussian_parameters()
        self.set_prior()

    def set_gaussian_parameters(self):
        """Compute the mean vector and covariance matrix of every class."""
        self.mean = []
        self.cov = []
        for i in range(self.n_classes):
            X = self.X_train[self.y_train == i]
            self.mean.append(np.mean(X, axis=0))
            # np.cov expects variables in rows, hence the transpose.
            self.cov.append(np.cov(X.T))

    def set_prior(self):
        """Compute each class prior as its fraction of the training rows."""
        n_samples = len(self.X_train)
        self.prior = [
            np.count_nonzero(self.y_train == i) / n_samples
            for i in range(self.n_classes)
        ]

    def get_log_likelihood(self, x, i):
        """Return the multivariate normal log-density of x under class i.

        Raises:
            numpy.linalg.LinAlgError: if the class covariance is singular.
        """
        diff = x - self.mean[i]
        cov = self.cov[i]
        # slogdet/solve avoid forming det(cov) and inv(cov) explicitly, which
        # over/underflow or lose precision for badly scaled covariances.
        _, log_det = np.linalg.slogdet(cov)
        mahalanobis = diff.dot(np.linalg.solve(cov, diff))
        n_features = len(self.mean[i])
        return -0.5 * (n_features * np.log(2 * np.pi) + log_det + mahalanobis)

    def get_likelihood(self, x, i):
        """Return the multivariate normal density of x under class i.

        The normalising constant is ``(2*pi) ** (d / 2) * sqrt(det(cov))``
        where ``d`` is the number of features.
        """
        return np.exp(self.get_log_likelihood(x, i))

    def classify_sample(self, x):
        """Return the index of the class with the highest posterior for x.

        Scores are compared in log space so that very small densities do not
        underflow to zero. Ties resolve to the lowest class index.
        """
        # log(0) for an empty class is a legitimate -inf score, not an error.
        with np.errstate(divide='ignore'):
            log_posterior = [
                self.get_log_likelihood(x, i) + np.log(self.prior[i])
                for i in range(self.n_classes)
            ]
        return int(np.argmax(log_posterior))

    def predict(self, X_test):
        """Return a list with the predicted class index for each row."""
        return [self.classify_sample(x) for x in X_test]

In [11]:
# Train the from-scratch full-covariance (optimal) Bayes model and predict on
# the test split. NOTE: the variable name GNB is reused from the naive Bayes
# cells even though this model is not naive Bayes.
GNB = GaussianOptimalBayesClassifier(df)
GNB.fit(X_train, y_train)
y_predicted = GNB.predict(X_test)

In [12]:
# Confusion matrix for the from-scratch optimal Bayes predictions
# (rows: true class, columns: predicted class).
print(metrics.confusion_matrix(y_test, y_predicted))

[[35  8]
 [ 2 69]]


In [13]:
# Per-class precision, recall, F1 and support for the from-scratch optimal
# Bayes predictions.
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.95      0.81      0.88        43
           1       0.90      0.97      0.93        71

    accuracy                           0.91       114
   macro avg       0.92      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114

