In [None]:
# -*- coding: utf-8 -*-
"""CS214 Lab 5: KNN, Bayes, and Naive Bayes Classifier.ipynb

Automatically generated by Colaboratory.

Original file is located at
    [Replace with your Colab link if applicable]

## Indian Institute of Technology Dharwad
## CS214: Artificial Intelligence Laboratory

## Lab 5: K-Nearest Neighbors, Bayes, and Naive Bayes Classifier

This notebook implements the tasks specified in Lab 5, focusing on KNN, Bayes, and Naive Bayes classifiers applied to the Wisconsin Diagnostic Breast Cancer (WDBC) dataset.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from scipy.stats import multivariate_normal
import warnings
warnings.filterwarnings('ignore')


"""## Part A: K-Nearest Neighbors (KNN) classification on original data"""

# Step 1 - read the train / validation / test splits of the original WDBC data.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("WDBC_Train.csv", "WDBC_Validation.csv", "WDBC_Test.csv")
)

# Features are every column except 'diagnosis'; 'diagnosis' itself is the target.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# Step 2 - fit a KNN classifier with K=7 on the training split
# (fit() hands back the estimator itself, so it can be bound directly).
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Step 3 - report the same metrics on the validation split, then on the test split.
# --- Validation split ---
y_valid_pred = knn.predict(X_valid)
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("KNN (K=7) - Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
y_test_pred = knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=7) - Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part B: Implementation of Bayes classifier (Gaussian Distribution) on original data"""

# 1. Read the train / validation / test splits of the original WDBC data.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("WDBC_Train.csv", "WDBC_Validation.csv", "WDBC_Test.csv")
)

# 2. Features are every column except 'diagnosis'; 'diagnosis' is the class vector.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 3. Partition the training features by class label (Benign = 0, Malignant = 1).
X_train_benign = X_train.loc[y_train == 0]
X_train_malignant = X_train.loc[y_train == 1]

# 4. Per-class Gaussian parameters: mean vector and covariance matrix.
mean_benign, cov_benign = X_train_benign.mean(), X_train_benign.cov()
mean_malignant, cov_malignant = X_train_malignant.mean(), X_train_malignant.cov()

# Class priors are the relative class frequencies in the training split.
prior_benign = X_train_benign.shape[0] / X_train.shape[0]
prior_malignant = X_train_malignant.shape[0] / X_train.shape[0]

# 5. Compute the likelihood for each class for each tuple in the validation and test data.
def calculate_likelihood(X, mean, cov):
    """Evaluate a multivariate Gaussian density at the samples in ``X``.

    Parameters
    ----------
    X : array-like
        Points at which the density is evaluated. ``len(X)`` is used as the
        number of samples when the zero fallback is produced.
    mean : array-like
        Mean vector of the Gaussian.
    cov : array-like
        Covariance matrix of the Gaussian. Singular matrices are accepted
        because ``allow_singular=True`` is passed to SciPy.

    Returns
    -------
    numpy.ndarray or float
        Whatever ``multivariate_normal.pdf`` returns for the inputs. If SciPy
        rejects the inputs with ``ValueError`` (e.g. mismatched dimensions) or
        a ``numpy.linalg.LinAlgError`` is raised, the error is printed and an
        all-zero array of length ``len(X)`` is returned instead.

    Notes
    -----
    Only those data/numerical errors are turned into the zero fallback. Any
    other exception (for example a ``TypeError`` caused by passing the wrong
    kind of object) now propagates, because silently returning zeros for both
    classes would make every sample look like a tie downstream.

    NOTE(review): raw densities can underflow to 0.0 for samples far from the
    mean in high dimensions; comparing ``logpdf`` values would be numerically
    safer if the callers are ever changed to work in log-space.
    """
    try:
        return multivariate_normal.pdf(X, mean=mean, cov=cov, allow_singular=True)
    except (ValueError, np.linalg.LinAlgError) as e:
        # Best-effort fallback kept from the original: report and return zeros.
        print(f"Error calculating likelihood: {e}")
        return np.zeros(len(X))

# Class-conditional densities of the validation samples, then of the test samples.
likelihood_valid_benign, likelihood_valid_malignant = (
    calculate_likelihood(X_valid, mean_benign, cov_benign),
    calculate_likelihood(X_valid, mean_malignant, cov_malignant),
)
likelihood_test_benign, likelihood_test_malignant = (
    calculate_likelihood(X_test, mean_benign, cov_benign),
    calculate_likelihood(X_test, mean_malignant, cov_malignant),
)

# 6. Compute the posterior score of each class (likelihood * prior; the shared evidence term is not divided out).
def calculate_posterior(likelihood, prior):
    """Weight a class likelihood by its class prior.

    The result is the unnormalized posterior (likelihood times prior); it is
    not divided by the evidence, so it is a score for comparing classes on
    the same sample rather than a probability that sums to one.
    """
    weighted_likelihood = likelihood * prior
    return weighted_likelihood

# Posterior scores per class for the validation samples, then the test samples.
posterior_valid_benign, posterior_valid_malignant = (
    calculate_posterior(likelihood_valid_benign, prior_benign),
    calculate_posterior(likelihood_valid_malignant, prior_malignant),
)
posterior_test_benign, posterior_test_malignant = (
    calculate_posterior(likelihood_test_benign, prior_benign),
    calculate_posterior(likelihood_test_malignant, prior_malignant),
)

# 7. Predict 1 (malignant) only where the malignant score is strictly larger;
#    ties therefore resolve to 0 (benign).
y_valid_pred = (posterior_valid_benign < posterior_valid_malignant).astype(int)
y_test_pred = (posterior_test_benign < posterior_test_malignant).astype(int)

# 8. Report the metrics on the validation split, then on the test split.
# --- Validation split ---
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBayes Classifier - Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBayes Classifier - Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part C: Implementation of Naive Bayes classifier (Gaussian Distribution) on original data"""

# 1. Read the train / validation / test splits of the original WDBC data.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("WDBC_Train.csv", "WDBC_Validation.csv", "WDBC_Test.csv")
)

# 2. Features are every column except 'diagnosis'; 'diagnosis' is the class vector.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 3. Fit scikit-learn's Gaussian Naive Bayes on the training split
#    (fit() hands back the estimator itself, so it can be bound directly).
naive_bayes = GaussianNB().fit(X_train, y_train)

# 4. Predict the class labels of the validation and test samples.
y_valid_pred = naive_bayes.predict(X_valid)
y_test_pred = naive_bayes.predict(X_test)

# 5. Report the metrics on the validation split, then on the test split.
# --- Validation split ---
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nNaive Bayes Classifier - Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nNaive Bayes Classifier - Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part D: K-Nearest Neighbors (KNN) classification on standardized data"""

# Step 1 - read the train / validation / test splits from the WDBC_Scaled_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_Scaled_Train.csv",
        "WDBC_Scaled_Validation.csv",
        "WDBC_Scaled_Test.csv",
    )
)

# Features are every column except 'diagnosis'; 'diagnosis' itself is the target.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# Step 2 - fit a KNN classifier with K=1 on the training split
# (fit() hands back the estimator itself, so it can be bound directly).
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Step 3 - report the same metrics on the validation split, then on the test split.
# --- Validation split ---
y_valid_pred = knn.predict(X_valid)
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=1) - Standardized Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
y_test_pred = knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=1) - Standardized Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part E: K-Nearest Neighbors (KNN) classification on PCA transformed (l=2) data"""

# Step 1 - read the train / validation / test splits from the WDBC_PCA2_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_PCA2_Train.csv",
        "WDBC_PCA2_Validation.csv",
        "WDBC_PCA2_Test.csv",
    )
)

# Features are every column except 'diagnosis'; 'diagnosis' itself is the target.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# Step 2 - fit a KNN classifier with K=11 on the training split
# (fit() hands back the estimator itself, so it can be bound directly).
knn = KNeighborsClassifier(n_neighbors=11).fit(X_train, y_train)

# Step 3 - report the same metrics on the validation split, then on the test split.
# --- Validation split ---
y_valid_pred = knn.predict(X_valid)
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=11) - PCA (l=2) Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
y_test_pred = knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=11) - PCA (l=2) Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part F: Implementation of Bayes classifier (Gaussian Distribution) on PCA transformed (l=2) data"""

# 1. Read the train / validation / test splits from the WDBC_PCA2_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_PCA2_Train.csv",
        "WDBC_PCA2_Validation.csv",
        "WDBC_PCA2_Test.csv",
    )
)

# 2. Features are every column except 'diagnosis'; 'diagnosis' is the class vector.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 3. Partition the training features by class label (Benign = 0, Malignant = 1).
X_train_benign = X_train.loc[y_train == 0]
X_train_malignant = X_train.loc[y_train == 1]

# 4. Per-class Gaussian parameters: mean vector and covariance matrix.
mean_benign, cov_benign = X_train_benign.mean(), X_train_benign.cov()
mean_malignant, cov_malignant = X_train_malignant.mean(), X_train_malignant.cov()

# Class priors are the relative class frequencies in the training split.
prior_benign = X_train_benign.shape[0] / X_train.shape[0]
prior_malignant = X_train_malignant.shape[0] / X_train.shape[0]

# 5. Compute the likelihood for each class for each tuple in the validation and test data.
def calculate_likelihood(X, mean, cov):
    """Evaluate a multivariate Gaussian density at the samples in ``X``.

    Parameters
    ----------
    X : array-like
        Points at which the density is evaluated. ``len(X)`` is used as the
        number of samples when the zero fallback is produced.
    mean : array-like
        Mean vector of the Gaussian.
    cov : array-like
        Covariance matrix of the Gaussian. Singular matrices are accepted
        because ``allow_singular=True`` is passed to SciPy.

    Returns
    -------
    numpy.ndarray or float
        Whatever ``multivariate_normal.pdf`` returns for the inputs. If SciPy
        rejects the inputs with ``ValueError`` (e.g. mismatched dimensions) or
        a ``numpy.linalg.LinAlgError`` is raised, the error is printed and an
        all-zero array of length ``len(X)`` is returned instead.

    Notes
    -----
    Only those data/numerical errors are turned into the zero fallback. Any
    other exception (for example a ``TypeError`` caused by passing the wrong
    kind of object) now propagates, because silently returning zeros for both
    classes would make every sample look like a tie downstream.

    NOTE(review): raw densities can underflow to 0.0 for samples far from the
    mean in high dimensions; comparing ``logpdf`` values would be numerically
    safer if the callers are ever changed to work in log-space.
    """
    try:
        return multivariate_normal.pdf(X, mean=mean, cov=cov, allow_singular=True)
    except (ValueError, np.linalg.LinAlgError) as e:
        # Best-effort fallback kept from the original: report and return zeros.
        print(f"Error calculating likelihood: {e}")
        return np.zeros(len(X))

# Class-conditional densities of the validation samples, then of the test samples.
likelihood_valid_benign, likelihood_valid_malignant = (
    calculate_likelihood(X_valid, mean_benign, cov_benign),
    calculate_likelihood(X_valid, mean_malignant, cov_malignant),
)
likelihood_test_benign, likelihood_test_malignant = (
    calculate_likelihood(X_test, mean_benign, cov_benign),
    calculate_likelihood(X_test, mean_malignant, cov_malignant),
)

# 6. Compute the posterior score of each class (likelihood * prior; the shared evidence term is not divided out).
def calculate_posterior(likelihood, prior):
    """Weight a class likelihood by its class prior.

    The result is the unnormalized posterior (likelihood times prior); it is
    not divided by the evidence, so it is a score for comparing classes on
    the same sample rather than a probability that sums to one.
    """
    weighted_likelihood = likelihood * prior
    return weighted_likelihood

# Posterior scores per class for the validation samples, then the test samples.
posterior_valid_benign, posterior_valid_malignant = (
    calculate_posterior(likelihood_valid_benign, prior_benign),
    calculate_posterior(likelihood_valid_malignant, prior_malignant),
)
posterior_test_benign, posterior_test_malignant = (
    calculate_posterior(likelihood_test_benign, prior_benign),
    calculate_posterior(likelihood_test_malignant, prior_malignant),
)

# 7. Predict 1 (malignant) only where the malignant score is strictly larger;
#    ties therefore resolve to 0 (benign).
y_valid_pred = (posterior_valid_benign < posterior_valid_malignant).astype(int)
y_test_pred = (posterior_test_benign < posterior_test_malignant).astype(int)

# 8. Report the metrics on the validation split, then on the test split.
# --- Validation split ---
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBayes Classifier - PCA (l=2) Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBayes Classifier - PCA (l=2) Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part G: Implementation of Naive Bayes classifier (Gaussian Distribution) on PCA transformed (l=2) data"""

# 1. Read the train / validation / test splits from the WDBC_PCA2_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_PCA2_Train.csv",
        "WDBC_PCA2_Validation.csv",
        "WDBC_PCA2_Test.csv",
    )
)

# 2. Features are every column except 'diagnosis'; 'diagnosis' is the class vector.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 3. Fit scikit-learn's Gaussian Naive Bayes on the training split
#    (fit() hands back the estimator itself, so it can be bound directly).
naive_bayes = GaussianNB().fit(X_train, y_train)

# 4. Predict the class labels of the validation and test samples.
y_valid_pred = naive_bayes.predict(X_valid)
y_test_pred = naive_bayes.predict(X_test)

# 5. Report the metrics on the validation split, then on the test split.
# --- Validation split ---
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nNaive Bayes Classifier - PCA (l=2) Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nNaive Bayes Classifier - PCA (l=2) Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part H: K-Nearest Neighbors (KNN) classification on PCA transformed (l=10) data"""

# Step 1 - read the train / validation / test splits from the WDBC_PCA10_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_PCA10_Train.csv",
        "WDBC_PCA10_Validation.csv",
        "WDBC_PCA10_Test.csv",
    )
)

# Features are every column except 'diagnosis'; 'diagnosis' itself is the target.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# Step 2 - fit a KNN classifier with K=7 on the training split
# (fit() hands back the estimator itself, so it can be bound directly).
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train, y_train)

# Step 3 - report the same metrics on the validation split, then on the test split.
# --- Validation split ---
y_valid_pred = knn.predict(X_valid)
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=7) - PCA (l=10) Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
y_test_pred = knn.predict(X_test)
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nKNN (K=7) - PCA (l=10) Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part I: Implementation of Bayes classifier (Gaussian Distribution) on PCA transformed (l=10) data"""

# 1. Read the train / validation / test splits from the WDBC_PCA10_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_PCA10_Train.csv",
        "WDBC_PCA10_Validation.csv",
        "WDBC_PCA10_Test.csv",
    )
)

# 2. Features are every column except 'diagnosis'; 'diagnosis' is the class vector.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 3. Partition the training features by class label (Benign = 0, Malignant = 1).
X_train_benign = X_train.loc[y_train == 0]
X_train_malignant = X_train.loc[y_train == 1]

# 4. Per-class Gaussian parameters: mean vector and covariance matrix.
mean_benign, cov_benign = X_train_benign.mean(), X_train_benign.cov()
mean_malignant, cov_malignant = X_train_malignant.mean(), X_train_malignant.cov()

# Class priors are the relative class frequencies in the training split.
prior_benign = X_train_benign.shape[0] / X_train.shape[0]
prior_malignant = X_train_malignant.shape[0] / X_train.shape[0]

# 5. Compute the likelihood for each class for each tuple in the validation and test data.
def calculate_likelihood(X, mean, cov):
    """Evaluate a multivariate Gaussian density at the samples in ``X``.

    Parameters
    ----------
    X : array-like
        Points at which the density is evaluated. ``len(X)`` is used as the
        number of samples when the zero fallback is produced.
    mean : array-like
        Mean vector of the Gaussian.
    cov : array-like
        Covariance matrix of the Gaussian. Singular matrices are accepted
        because ``allow_singular=True`` is passed to SciPy.

    Returns
    -------
    numpy.ndarray or float
        Whatever ``multivariate_normal.pdf`` returns for the inputs. If SciPy
        rejects the inputs with ``ValueError`` (e.g. mismatched dimensions) or
        a ``numpy.linalg.LinAlgError`` is raised, the error is printed and an
        all-zero array of length ``len(X)`` is returned instead.

    Notes
    -----
    Only those data/numerical errors are turned into the zero fallback. Any
    other exception (for example a ``TypeError`` caused by passing the wrong
    kind of object) now propagates, because silently returning zeros for both
    classes would make every sample look like a tie downstream.

    NOTE(review): raw densities can underflow to 0.0 for samples far from the
    mean in high dimensions; comparing ``logpdf`` values would be numerically
    safer if the callers are ever changed to work in log-space.
    """
    try:
        return multivariate_normal.pdf(X, mean=mean, cov=cov, allow_singular=True)
    except (ValueError, np.linalg.LinAlgError) as e:
        # Best-effort fallback kept from the original: report and return zeros.
        print(f"Error calculating likelihood: {e}")
        return np.zeros(len(X))

# Class-conditional densities of the validation samples, then of the test samples.
likelihood_valid_benign, likelihood_valid_malignant = (
    calculate_likelihood(X_valid, mean_benign, cov_benign),
    calculate_likelihood(X_valid, mean_malignant, cov_malignant),
)
likelihood_test_benign, likelihood_test_malignant = (
    calculate_likelihood(X_test, mean_benign, cov_benign),
    calculate_likelihood(X_test, mean_malignant, cov_malignant),
)

# 6. Compute the posterior score of each class (likelihood * prior; the shared evidence term is not divided out).
def calculate_posterior(likelihood, prior):
    """Weight a class likelihood by its class prior.

    The result is the unnormalized posterior (likelihood times prior); it is
    not divided by the evidence, so it is a score for comparing classes on
    the same sample rather than a probability that sums to one.
    """
    weighted_likelihood = likelihood * prior
    return weighted_likelihood

# Posterior scores per class for the validation samples, then the test samples.
posterior_valid_benign, posterior_valid_malignant = (
    calculate_posterior(likelihood_valid_benign, prior_benign),
    calculate_posterior(likelihood_valid_malignant, prior_malignant),
)
posterior_test_benign, posterior_test_malignant = (
    calculate_posterior(likelihood_test_benign, prior_benign),
    calculate_posterior(likelihood_test_malignant, prior_malignant),
)

# 7. Predict 1 (malignant) only where the malignant score is strictly larger;
#    ties therefore resolve to 0 (benign).
y_valid_pred = (posterior_valid_benign < posterior_valid_malignant).astype(int)
y_test_pred = (posterior_test_benign < posterior_test_malignant).astype(int)

# 8. Report the metrics on the validation split, then on the test split.
# --- Validation split ---
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBayes Classifier - PCA (l=10) Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# --- Test split ---
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test, precision_test, recall_test, f1_test = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nBayes Classifier - PCA (l=10) Test Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_test),
    ("Accuracy:", accuracy_test),
    ("Precision:", precision_test),
    ("Recall:", recall_test),
    ("F1-score:", f1_test),
):
    print(caption, value)

"""## Part J: Implementation of Naive Bayes classifier (Gaussian Distribution) on PCA transformed (l=10) data"""

# 1. Read the train / validation / test splits from the WDBC_PCA10_* files.
train_df, valid_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "WDBC_PCA10_Train.csv",
        "WDBC_PCA10_Validation.csv",
        "WDBC_PCA10_Test.csv",
    )
)

# 2. Features are every column except 'diagnosis'; 'diagnosis' is the class vector.
X_train, y_train = train_df.drop(columns='diagnosis'), train_df['diagnosis']
X_valid, y_valid = valid_df.drop(columns='diagnosis'), valid_df['diagnosis']
X_test, y_test = test_df.drop(columns='diagnosis'), test_df['diagnosis']

# 3. Fit scikit-learn's Gaussian Naive Bayes on the training split
#    (fit() hands back the estimator itself, so it can be bound directly).
naive_bayes = GaussianNB().fit(X_train, y_train)

# 4. Predict the class labels of the validation and test samples.
y_valid_pred = naive_bayes.predict(X_valid)
y_test_pred = naive_bayes.predict(X_test)

# 5. Report the metrics on the validation split.
# --- Validation split ---
cm_valid = confusion_matrix(y_valid, y_valid_pred)
accuracy_valid, precision_valid, recall_valid, f1_valid = (
    metric(y_valid, y_valid_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print("\nNaive Bayes Classifier - PCA (l=10) Validation Data:")
for caption, value in (
    ("Confusion Matrix:\n", cm_valid),
    ("Accuracy:", accuracy_valid),
    ("Precision:", precision_valid),
    ("Recall:", recall_valid),
    ("F1-score:", f1_valid),
):
    print(caption, value)

# Test Data
cm_test = confusion_matrix(y_test, y_test_pred)
accuracy_test = accuracy_score(y_test, y_
