In [4]:
# naive_bayes_tuned
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.inspection import permutation_importance
import io

# Load the cleaned dataset; the first CSV row supplies the column names.
df = pd.read_csv('breast-cancer-wisconsin_cleaned.csv', header=0)

# Predictor columns are labelled '1'..'9'; column '10' holds the target
# (expected to be binary 0/1).
features = [str(col) for col in range(1, 10)]
X = df[features]
y = df['10']

# Hold out 25% of the rows for evaluation, with a fixed seed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian Naive Bayes with an explicit var_smoothing setting.
model = GaussianNB(var_smoothing=1e-9)
model.fit(X_train_scaled, y_train)

# Probability in the second class column (index 1) for every test row.
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Decision threshold applied to the class-1 probability instead of the
# default 0.5; a low cut-off favours recall over precision.
# NOTE(review): this value appears to be picked from test-set results —
# confirm it was validated on data not used for the metrics reported here.
custom_threshold = 0.1
print(f"Using custom threshold: {custom_threshold}")

# A sample is labelled 1 only when its probability is strictly above the
# threshold.
y_pred = np.where(y_prob > custom_threshold, 1, 0)

# Metrics with custom threshold (rounded to 3 decimal places).
# zero_division=0 keeps F1 defined (as 0) without warnings if no positives
# are predicted. ROC-AUC is computed from the raw probabilities, so it does
# not depend on the threshold.
accuracy = round(accuracy_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred, zero_division=0), 3)
roc_auc = round(roc_auc_score(y_test, y_prob), 3)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

# Confusion Matrix with custom threshold (tab-delimited for Word conversion).
# labels=[0, 1] pins the matrix to 2x2 so it always matches the hard-coded
# row/column names below, even if one class is absent from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_text = cm_df.to_csv(sep='\t', index=True, header=True, float_format='%.0f')
print("Confusion Matrix (Custom Threshold):\n", cm_text)

# Thresholding Table for reference (tab-delimited for Word conversion, rounded to 3 decimal places)
thresholds = np.arange(0.1, 1.0, 0.1)  # [0.1, 0.2, ..., 0.9]
threshold_results = []
best_f1 = 0
best_threshold = 0.5  # Default, kept if no threshold achieves F1 > 0

# ROC-AUC is computed from the probabilities alone, so it is identical for
# every threshold; compute it once and repeat it in each table row.
roc_auc_thresh = round(roc_auc_score(y_test, y_prob), 3)

for raw_threshold in thresholds:
    # np.arange accumulates floating-point error (e.g. 0.30000000000000004);
    # round once so the cut-off applied, tabulated and reported are the same.
    threshold = round(float(raw_threshold), 3)
    y_pred_thresh = np.where(y_prob > threshold, 1, 0)
    acc_thresh = round(accuracy_score(y_test, y_pred_thresh), 3)
    precision_thresh = round(precision_score(y_test, y_pred_thresh, zero_division=0), 3)
    recall_thresh = round(recall_score(y_test, y_pred_thresh, zero_division=0), 3)
    # zero_division=0 matches the precision/recall calls above.
    f1_thresh = round(f1_score(y_test, y_pred_thresh, zero_division=0), 3)
    threshold_results.append([threshold, acc_thresh, precision_thresh, recall_thresh, f1_thresh, roc_auc_thresh])
    # Strict '>' keeps the lowest threshold when several tie on F1.
    if f1_thresh > best_f1:
        best_f1 = f1_thresh
        best_threshold = threshold

threshold_df = pd.DataFrame(threshold_results, columns=['Threshold', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC'])
# Ensure consistent tab-delimited output with standardized formatting
threshold_text = threshold_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Thresholding Table (Standardized):\n", threshold_text)
print(f"Optimum Threshold (max F1): {best_threshold} with F1: {best_f1}")

# Permutation feature importance on the scaled test split, scored with F1.
# Each feature is shuffled 10 times (fixed seed); the mean and spread of the
# resulting score drop are reported, rounded to 3 decimal places.
# NOTE(review): the 'f1' scorer goes through model.predict, i.e. the model's
# default decision rule rather than custom_threshold — confirm this is intended.
perm_importance = permutation_importance(
    model,
    X_test_scaled,
    y_test,
    scoring='f1',
    n_repeats=10,
    random_state=42,
)
mean_drop = np.round(perm_importance.importances_mean, 3).tolist()
std_drop = np.round(perm_importance.importances_std, 3).tolist()
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance Mean': mean_drop,
    'Importance Std': std_drop,
})
# Tab-delimited text so the table can be pasted into a document.
importance_text = importance_df.to_csv(
    sep='\t', index=False, header=True, float_format='%.3f'
)
print("Permutation Feature Importance (F1):\n", importance_text)

# Log-Likelihood Ratio Feature Importance (rounded to 3 decimal places)
def _gaussian_log_density(x, mean, var):
    """Return the element-wise log of the normal density N(x; mean, var).

    Evaluated directly in log space: exponentiating the density first and
    taking the log afterwards underflows to 0 for samples far from the mean,
    which turns the ratio into inf/NaN.
    """
    return -0.5 * (np.log(2 * np.pi * var) + (x - mean) ** 2 / var)


# Per-feature Gaussian parameters fitted by the model; row 0 is treated as the
# negative class and row 1 as the positive class.
theta_pos = model.theta_[1, :]  # Mean for positive class (class 1)
theta_neg = model.theta_[0, :]  # Mean for negative class (class 0)
var_pos = model.var_[1, :]      # Variance for positive class
var_neg = model.var_[0, :]      # Variance for negative class

# Log-likelihood ratio per sample and feature:
# log p(x | positive) - log p(x | negative)
log_likelihood_diff = (
    _gaussian_log_density(X_test_scaled, theta_pos, var_pos)
    - _gaussian_log_density(X_test_scaled, theta_neg, var_neg)
)

# Average absolute difference across samples (one value per feature)
llr_importance = np.abs(log_likelihood_diff).mean(axis=0)
llr_importance_rounded = np.round(llr_importance, 3).tolist()
llr_df = pd.DataFrame({
    'Feature': features,
    'Log-Likelihood Ratio Importance': llr_importance_rounded
})
# Most discriminative features first.
llr_df = llr_df.sort_values('Log-Likelihood Ratio Importance', ascending=False)
llr_text = llr_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Log-Likelihood Ratio Feature Importance:\n", llr_text)

Using custom threshold: 0.1
Accuracy: 0.966
F1 Score: 0.949
ROC-AUC: 0.992
Confusion Matrix (Custom Threshold):
 	Predicted 0	Predicted 1
Actual 0	113	5
Actual 1	1	56

Thresholding Table (Standardized):
 Threshold	Accuracy	Precision	Recall	F1	ROC-AUC
0.100	0.966	0.918	0.982	0.949	0.992
0.200	0.960	0.917	0.965	0.940	0.992
0.300	0.960	0.917	0.965	0.940	0.992
0.400	0.960	0.917	0.965	0.940	0.992
0.500	0.960	0.917	0.965	0.940	0.992
0.600	0.960	0.917	0.965	0.940	0.992
0.700	0.960	0.917	0.965	0.940	0.992
0.800	0.960	0.917	0.965	0.940	0.992
0.900	0.960	0.917	0.965	0.940	0.992

Optimum Threshold (max F1): 0.1 with F1: 0.949
Permutation Feature Importance (F1):
 Feature	Importance Mean	Importance Std
1	-0.000	0.004
2	0.119	0.013
3	0.097	0.019
4	0.085	0.014
5	0.041	0.010
6	0.133	0.010
7	0.021	0.012
8	0.141	0.016
9	0.040	0.010

Log-Likelihood Ratio Feature Importance:
 Feature	Log-Likelihood Ratio Importance
2	8.341
8	7.499
6	7.020
3	6.598
4	5.258
9	5.064
5	3.992
7	3.616
1	2.403

