In [21]:
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [None]:
train_df = pd.read_csv('Datasets/WDBC_Train.csv')
valid_df = pd.read_csv('Datasets/WDBC_Validation.csv')
test_df = pd.read_csv('Datasets/WDBC_Test.csv')

train_df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,0,11.36,17.57,72.49,399.8,0.08858,0.05313,0.02783,0.021,0.1601,...,13.05,36.32,85.07,521.3,0.1453,0.1622,0.1811,0.08698,0.2973,0.07745
1,1,21.61,22.28,144.4,1407.0,0.1167,0.2087,0.281,0.1562,0.2162,...,26.23,28.74,172.0,2081.0,0.1502,0.5717,0.7053,0.2422,0.3828,0.1007
2,0,13.15,15.34,85.31,538.9,0.09384,0.08498,0.09293,0.03483,0.1822,...,14.77,20.5,97.67,677.3,0.1478,0.2256,0.3009,0.09722,0.3849,0.08633
3,0,9.683,19.34,61.05,285.7,0.08491,0.0503,0.02337,0.009615,0.158,...,10.93,25.59,69.1,364.2,0.1199,0.09546,0.0935,0.03846,0.2552,0.0792
4,1,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115


In [23]:
X_train, y_train = train_df.iloc[:, 1:], train_df.iloc[:, 0]
X_valid, y_valid = valid_df.iloc[:, 1:], valid_df.iloc[:, 0]
X_test, y_test = test_df.iloc[:, 1:], test_df.iloc[:, 0]

print('Train shape: ', X_train.shape)
print('Validation shape: ', X_valid.shape)
print('Test shape: ', X_test.shape)

Train shape:  (341, 30)
Validation shape:  (114, 30)
Test shape:  (114, 30)


### Bayes Classifier

In [27]:
class_stats = {}
for c in np.unique(y_train):
    X_c = X_train[y_train == c]
    class_stats[c] = {
        'mean':np.mean(X_c,axis=0),
        'cov':np.cov(X_c,rowvar=False),
        'prior':len(X_c)/len(X_train)
    }

def bayes_predict(X):
    predictions = []
    for x in X:
        posteriors = {}
        likelihood = {}
        totalProb = 0
        for c,stats in class_stats.items():
            likelihood[c] = multivariate_normal(mean=stats['mean'], cov=stats['cov'], allow_singular=True).pdf(x)
            totalProb += likelihood[c]*stats['prior']
        for c,stats in class_stats.items():
            posteriors[c] = (likelihood[c]*stats['prior'])/totalProb
        predictions.append(max(posteriors,key=posteriors.get))
    return np.array(predictions)

y_valid_pred = bayes_predict(X_valid.values)
y_test_pred = bayes_predict(X_test.values)


print("---------Validation-----------")

cm = confusion_matrix(y_valid, y_valid_pred)
acc = accuracy_score(y_valid, y_valid_pred)
precision = precision_score(y_valid, y_valid_pred)
recall = recall_score(y_valid, y_valid_pred)
f1 = f1_score(y_valid, y_valid_pred)

print(f"Validation Set Accuracy: {acc:.4f}")
print("Confusion Matrix:")
print(cm)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}\n")


print("---------Test-----------")

cm1 = confusion_matrix(y_test_pred, y_test)
acc1 = accuracy_score(y_test_pred, y_test)
precision1 = precision_score(y_test_pred, y_test)
recall1 = recall_score(y_test_pred, y_test)
f11 = f1_score(y_test_pred, y_test)

print(f"Validation Set Accuracy: {acc1:.4f}")
print("Confusion Matrix:")
print(cm1)
print(f"Precision: {precision1:.4f}, Recall: {recall1:.4f}, F1-score: {f11:.4f}\n")

---------Validation-----------
Validation Set Accuracy: 0.9123
Confusion Matrix:
[[65  6]
 [ 4 39]]
Precision: 0.8667, Recall: 0.9070, F1-score: 0.8864

---------Test-----------
Validation Set Accuracy: 0.9035
Confusion Matrix:
[[64  3]
 [ 8 39]]
Precision: 0.9286, Recall: 0.8298, F1-score: 0.8764

