In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [12]:
# Load the breast-cancer measurements from disk and preview the first ten rows.
data = pd.read_csv(filepath_or_buffer="Breast_cancer_data.csv")
data.head(n=10)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0
5,12.45,15.7,82.57,477.1,0.1278,0
6,18.25,19.98,119.6,1040.0,0.09463,0
7,13.71,20.83,90.2,577.9,0.1189,0
8,13.0,21.82,87.5,519.8,0.1273,0
9,12.46,24.04,83.97,475.9,0.1186,0


In [13]:
# Keep three features plus the target. "diagnosis" stays last because the
# evaluation cell slices the final column off as the label vector.
data = data.loc[:, ["mean_radius", "mean_texture", "mean_smoothness", "diagnosis"]]
data.head(n=10)

Unnamed: 0,mean_radius,mean_texture,mean_smoothness,diagnosis
0,17.99,10.38,0.1184,0
1,20.57,17.77,0.08474,0
2,19.69,21.25,0.1096,0
3,11.42,20.38,0.1425,0
4,20.29,14.34,0.1003,0
5,12.45,15.7,0.1278,0
6,18.25,19.98,0.09463,0
7,13.71,20.83,0.1189,0
8,13.0,21.82,0.1273,0
9,12.46,24.04,0.1186,0


In [14]:
def calculate_prior(df, Y):
    """Return the empirical class priors P(Y = c), one per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the target column.
    Y : str
        Name of the target column.

    Returns
    -------
    list of float
        Fraction of rows belonging to each class, ordered by ascending
        class value (the order produced by ``sorted`` on the unique labels).
    """
    n_rows = len(df)
    target = df[Y]
    # Counting the True entries of the equality mask is the class size.
    return [int((target == cls).sum()) / n_rows for cls in sorted(target.unique())]

In [15]:
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label, min_std=1e-9):
    """Evaluate the class-conditional Gaussian density P(feature = feat_val | Y = label).

    The Gaussian's mean and standard deviation are estimated from the rows of
    ``df`` whose target equals ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data holding both the feature and the target column.
    feat_name : str
        Name of the feature column.
    feat_val : float or numpy.ndarray
        Value(s) at which to evaluate the density; arrays are evaluated
        element-wise.
    Y : str
        Name of the target column.
    label
        Class value to condition on.
    min_std : float, optional
        Positive lower bound applied to the estimated standard deviation.

    Returns
    -------
    float or numpy.ndarray
        Density value(s), with the same shape as ``feat_val``.
    """
    class_values = df.loc[df[Y] == label, feat_name]
    mean, std = class_values.mean(), class_values.std()
    # A constant feature gives std == 0 and a single-row class gives std == NaN
    # (sample std, ddof=1). Either would turn the density into NaN, and NaN wins
    # every argmax downstream, so clamp the spread to a tiny positive value.
    if not np.isfinite(std) or std < min_std:
        std = min_std
    normaliser = 1 / (np.sqrt(2 * np.pi) * std)
    exponent = -((feat_val - mean) ** 2 / (2 * std ** 2))
    return normaliser * np.exp(exponent)

In [16]:
def naive_bayes_gaussian(df, X, Y):
    """Predict a class label for every row of X with Gaussian Naive Bayes.

    For each class the (unnormalised) posterior is the class prior times the
    product of per-feature Gaussian likelihoods; the class with the largest
    posterior is predicted.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data: feature columns plus the target column ``Y``.
    X : array-like of shape (n_samples, n_features)
        Samples to classify. Column ``i`` must correspond to the ``i``-th
        non-target column of ``df``.
    Y : str
        Name of the target column in ``df``.

    Returns
    -------
    numpy.ndarray
        Predicted class labels (actual values from ``df[Y]``, not positions),
        one per row of ``X``. Empty when ``X`` has no rows.
    """
    # Pick features by name so the target does not have to be the last column.
    features = [col for col in df.columns if col != Y]

    # Sorted so labels[j] lines up with prior[j], which calculate_prior
    # returns in ascending class order.
    labels = sorted(df[Y].unique())
    prior = calculate_prior(df, Y)

    samples = np.asarray(X)
    n_samples = samples.shape[0]
    if n_samples == 0:
        return np.array([])

    post_prob = []
    for j, label in enumerate(labels):
        # One helper call per (class, feature) on the whole column, instead of
        # re-estimating the class mean/std for every single sample.
        likelihood = np.ones(n_samples)
        for i, feat_name in enumerate(features):
            likelihood = likelihood * calculate_likelihood_gaussian(
                df, feat_name, samples[:, i], Y, label
            )
        post_prob.append(likelihood * prior[j])

    # argmax yields a position in `labels`; map it back to the label itself so
    # the result is correct for any label set, not only 0..k-1.
    best = np.argmax(np.vstack(post_prob), axis=0)
    return np.array(labels)[best]

In [17]:
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, test_size=.2, random_state=41)

# The last column is the target; every column before it is a feature.
X_test, Y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()
Y_pred = naive_bayes_gaussian(train, X=X_test, Y="diagnosis")

# Report the confusion matrix and F1 score on the held-out rows.
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

[[36  4]
 [ 0 74]]
0.9736842105263158


In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score

sns.set_style("darkgrid")


# Toy dataset: twenty two-feature points with labels in {-1, +1}.
X = np.array([
    [2, 3], [1, 1], [2, 2], [4, 4], [6, 6],
    [7, 8], [1, 3], [3, 3], [5, 5], [8, 8],
    [3, 7], [9, 3], [6, 2], [4, 5], [5, 7],
    [2, 6], [3, 6], [7, 9], [8, 7], [5, 6],
])

y = np.array([
    1, -1, 1, 1, -1,
    1, -1, 1, -1, 1,
    1, 1, -1, 1, -1,
    -1, 1, 1, 1, -1,
])

# Features and target share one frame, with the target appended as the last column.
data = pd.DataFrame(X, columns=["feature_1", "feature_2"]).assign(diagnosis=y)



def calculate_prior(df, Y):
    """Compute the empirical prior probability of each class in column Y.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the target column.
    Y : str
        Name of the target column.

    Returns
    -------
    list of float
        Share of rows per class, in ascending order of the class values.
    """
    target = df[Y]
    n_total = len(df)

    def class_share(cls):
        # Rows of this class divided by all rows.
        return len(df[target == cls]) / n_total

    return [class_share(cls) for cls in sorted(target.unique())]

def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label, min_std=1e-9):
    """Evaluate the Gaussian density of a feature value given a class.

    Mean and standard deviation are estimated from the rows of ``df`` whose
    target column ``Y`` equals ``label``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data holding both the feature and the target column.
    feat_name : str
        Name of the feature column.
    feat_val : float or numpy.ndarray
        Value(s) at which to evaluate the density (element-wise for arrays).
    Y : str
        Name of the target column.
    label
        Class value to condition on.
    min_std : float, optional
        Positive lower bound applied to the estimated standard deviation.

    Returns
    -------
    float or numpy.ndarray
        Density value(s), with the same shape as ``feat_val``.
    """
    subset = df[df[Y] == label]
    mean, std = subset[feat_name].mean(), subset[feat_name].std()
    # std is 0 for a constant feature and NaN for a one-row class (ddof=1).
    # Both make the density NaN, which then dominates every argmax, so the
    # spread is clamped to a tiny positive value instead.
    if not np.isfinite(std) or std < min_std:
        std = min_std
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((feat_val - mean) ** 2 / (2 * std ** 2)))
    return p_x_given_y

def naive_bayes_gaussian(df, X, Y):
    """Classify each row of X with a Gaussian Naive Bayes model estimated from df.

    The score of a class is its prior multiplied by the per-feature Gaussian
    likelihoods of the sample; the highest-scoring class is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data: feature columns plus the target column ``Y``.
    X : array-like of shape (n_samples, n_features)
        Samples to classify. Column ``i`` must correspond to the ``i``-th
        non-target column of ``df``.
    Y : str
        Name of the target column in ``df``.

    Returns
    -------
    numpy.ndarray
        Predicted labels taken from the values present in ``df[Y]`` (for the
        {-1, +1} labels used in this cell the output is -1 or 1). Empty when
        ``X`` has no rows.
    """
    # Select features by name rather than assuming the target is the last column.
    features = [col for col in df.columns if col != Y]

    # Ascending order matches the order of the priors from calculate_prior.
    labels = sorted(df[Y].unique())
    prior = calculate_prior(df, Y)

    samples = np.asarray(X)
    n_samples = samples.shape[0]
    if n_samples == 0:
        return np.array([])

    post_prob = []
    for j, label in enumerate(labels):
        # Evaluate each feature column in one call so the class statistics are
        # estimated once per (class, feature) rather than once per sample.
        likelihood = np.ones(n_samples)
        for i, feat_name in enumerate(features):
            likelihood = likelihood * calculate_likelihood_gaussian(
                df, feat_name, samples[:, i], Y, label
            )
        post_prob.append(likelihood * prior[j])

    # Translate the winning position back into the real label instead of
    # hard-coding a {-1, +1} mapping, so any label set is handled correctly.
    best = np.argmax(np.vstack(post_prob), axis=0)
    return np.array(labels)[best]


# Hold out 20% of the toy rows for evaluation; the fixed seed keeps the split repeatable.
train, test = train_test_split(data, test_size=0.2, random_state=41)

# The last column is the target; every column before it is a feature.
X_test, Y_test = test.iloc[:, :-1].to_numpy(), test.iloc[:, -1].to_numpy()

Y_pred = naive_bayes_gaussian(train, X=X_test, Y="diagnosis")

# Report the confusion matrix and the binary F1 score on the held-out rows.
print("Confusion Matrix:")
print(confusion_matrix(Y_test, Y_pred))

print("\nF1 Score:", f1_score(Y_test, Y_pred, average='binary'))


Confusion Matrix:
[[1 1]
 [0 2]]

F1 Score: 0.8
