In [1]:
import pandas as pd
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the train / validation / test splits; the relative file names are
# resolved against the current working directory.
train, valid, test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'WDBC_PCA10_Train.csv',
        'WDBC_PCA10_Validation.csv',
        'WDBC_PCA10_Test.csv',
    )
)

In [3]:
# In every split, column 0 is taken as the target and all remaining columns
# as the feature matrix.
y_train = train.iloc[:, 0]
X_train = train.iloc[:, 1:]

y_valid = valid.iloc[:, 0]
X_valid = valid.iloc[:, 1:]

y_test = test.iloc[:, 0]
X_test = test.iloc[:, 1:]

In [4]:
# Partition the training features by label: X[k] holds the rows of X_train
# whose target value equals k.
X = {label: X_train[y_train == label] for label in (0, 1)}

In [5]:
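# NOTE: earlier hand-unrolled draft of the per-class estimates, kept for reference only.
# It is superseded by the loop over `classes` in the next cell; X_0 / X_1 are not
# defined anywhere visible and presumably correspond to X[0] / X[1].
# X_mean[0] = np.mean(X_0,axis=0)
# X_mean[1] = np.mean(X_1, axis=0)

# X_cov[0] = np.cov(X_0.T)
# X_cov[1] = np.cov(X_1.T)

# X_prior_0 = X_0.shape[0]/X_train.shape[0]
# X_prior_1 = X_1.shape[0]/X_train.shape[0]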

In [6]:
# Per-class Gaussian parameters estimated from the training split.
classes = [0, 1]
X_mean, X_cov, X_prior = {}, {}, {}
for c in classes:
    # Prior: fraction of the training rows that belong to class c.
    X_prior[c] = len(X[c]) / len(X_train)
    # Feature-wise mean of the class-c rows.
    X_mean[c] = np.mean(X[c], axis=0)
    # np.cov treats each row as a variable, hence the transpose.
    X_cov[c] = np.cov(X[c].T)

In [7]:
def posterior_func(X):
    likelihood = {}
    post = []
    posterior =[]
    for c in classes:
        likelihood = multivariate_normal(mean=X_mean[c], cov=X_cov[c], allow_singular=True).pdf(X)
        # print(f"Likelihood for class {c}:\n {likelihood}")
        post.append(likelihood * X_prior[c])
    post = np.array(post)
    # print (post)
    posterior = post/np.sum(post, axis=0)
    # print(posterior)
    return np.argmax(posterior, axis=0)

# Classify the test split and show the raw predicted labels.
y_test_pred = posterior_func(X_test)
test_message = f"y test predications:\n{y_test_pred}\n\n"
print(test_message)

# Same for the validation split.
y_valid_pred = posterior_func(X_valid)
valid_message = f"y valid predications:\n{y_valid_pred}\n\n"
print(valid_message)

y test predications:
[0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0
 1 1 1 0 1 1 0 0 1 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 1 0 0
 0 0 0]


y valid predications:
[0 0 1 0 1 1 0 1 1 1 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0
 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 1
 0 0 1]




In [8]:
def performance(model_name, y_true, y_pred):
    """Print a short classification report for one set of predictions.

    Writes, in order, a header containing ``model_name``, the confusion
    matrix, and the accuracy, precision, recall and F1 score computed by the
    scikit-learn metric functions with their default arguments.

    Parameters
    ----------
    model_name : label inserted into the report header.
    y_true : ground-truth labels.
    y_pred : predicted labels, compared against ``y_true``.

    Returns
    -------
    None -- the report is only printed.
    """
    print(f"Performance for {model_name}:\n")
    print(f"Confusion matrix: \n{confusion_matrix(y_true,y_pred)}")

    # (label, metric function, text appended after the value); each metric is
    # computed right before it is printed, one per line.
    report_rows = (
        ("Accuracy", accuracy_score, ""),
        ("Precision", precision_score, ""),
        ("Recall", recall_score, ""),
        ("F1-score", f1_score, "\n\n"),
    )
    for label, metric, trailer in report_rows:
        print(f"{label}: \t{metric(y_true, y_pred)}{trailer}")

# Report validation metrics first, then test metrics.
for split_name, truth, predicted in (
    ("Validation data", y_valid, y_valid_pred),
    ("Test data", y_test, y_test_pred),
):
    performance(split_name, truth, predicted)

Performance for Validation data:

Confusion matrix: 
[[69  2]
 [ 2 41]]
Accuracy: 	0.9649122807017544
Precision: 	0.9534883720930233
Recall: 	0.9534883720930233
F1-score: 	0.9534883720930233


Performance for Test data:

Confusion matrix: 
[[70  2]
 [ 3 39]]
Accuracy: 	0.956140350877193
Precision: 	0.9512195121951219
Recall: 	0.9285714285714286
F1-score: 	0.9397590361445782


