In [2]:
# kernel_svm_tuned
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.inspection import permutation_importance
import shap
import io

# Read the cleaned data set; the first row of the CSV supplies the column labels.
df = pd.read_csv('breast-cancer-wisconsin_cleaned.csv', header=0)

# Predictor columns are labelled with the strings '1' .. '9'; column '10' is the target.
features = [str(column) for column in range(1, 10)]
X = df[features]
y = df['10']  # Target (should be binary 0/1)

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Standardise the features: the scaling statistics are learned from the training
# rows only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel SVM with the tuned hyper-parameters. probability=True enables
# predict_proba, and class_weight='balanced' reweights classes by their frequency.
svm_params = {
    'kernel': 'rbf',
    'C': 1.75,
    'gamma': 'scale',
    'probability': True,
    'class_weight': 'balanced',
    'random_state': 42,
}
model = SVC(**svm_params)
model.fit(X_train_scaled, y_train)

# Probability of the positive class for every held-out row.
# predict_proba columns follow model.classes_, so column 1 is label 1 for a 0/1 target.
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Decision threshold applied to the positive-class probability (chosen for best recall).
# NOTE(review): the threshold sweep further down is evaluated on this same test split;
# if 0.1 was picked from it, the metrics below are optimistic. Confirm the threshold
# was selected on separate validation data.
custom_threshold = 0.1  # Can change to 0.2 to test
print(f"Using custom threshold: {custom_threshold}")

# A row is labelled positive only when its probability is strictly above the threshold.
y_pred = np.where(y_prob > custom_threshold, 1, 0)

# Metrics with custom threshold (rounded to 3 decimal places).
# ROC-AUC is computed from the raw probabilities, so it does not depend on the threshold.
accuracy = round(accuracy_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred), 3)
roc_auc = round(roc_auc_score(y_test, y_prob), 3)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

# Confusion Matrix with custom threshold (tab-delimited for Word conversion).
# Pin the label order so the matrix is always 2x2 and lines up with the captions
# below; without `labels`, a split where only one class occurs in y_test and y_pred
# would produce a 1x1 matrix and the DataFrame construction would raise.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_text = cm_df.to_csv(sep='\t', index=True, header=True, float_format='%.0f')
print("Confusion Matrix (Custom Threshold):\n", cm_text)

# Thresholding Table for reference (tab-delimited for Word conversion, rounded to 3 decimal places).
# Build 0.1 ... 0.9 from integer steps: np.arange with a fractional step accumulates
# floating-point error (e.g. 0.30000000000000004), which would leak into the
# "Optimum Threshold" message below.
thresholds = np.arange(1, 10) / 10

# ROC-AUC is computed from the probabilities and is therefore identical for every
# threshold; evaluate it once and repeat the value in each row.
roc_auc_thresh = round(roc_auc_score(y_test, y_prob), 3)

threshold_results = []
best_f1 = 0
best_threshold = 0.5  # Default, reported when no threshold reaches F1 > 0
for threshold in thresholds:
    y_pred_thresh = np.where(y_prob > threshold, 1, 0)
    acc_thresh = round(accuracy_score(y_test, y_pred_thresh), 3)
    precision_thresh = round(precision_score(y_test, y_pred_thresh, zero_division=0), 3)
    recall_thresh = round(recall_score(y_test, y_pred_thresh, zero_division=0), 3)
    # zero_division=0 matches precision/recall above, so a threshold that predicts
    # no positives scores 0 silently instead of emitting a warning.
    f1_raw = f1_score(y_test, y_pred_thresh, zero_division=0)
    f1_thresh = round(f1_raw, 3)
    threshold_results.append([round(float(threshold), 3), acc_thresh, precision_thresh, recall_thresh, f1_thresh, roc_auc_thresh])
    # Rank on the unrounded F1 so differences below 0.001 are not lost to display
    # rounding; the strict '>' keeps the lowest threshold when scores tie exactly.
    if f1_raw > best_f1:
        best_f1 = f1_raw
        best_threshold = float(threshold)
threshold_df = pd.DataFrame(threshold_results, columns=['Threshold', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC'])
# Ensure consistent tab-delimited output with standardized formatting
threshold_text = threshold_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Thresholding Table (Standardized):\n", threshold_text)
print(f"Optimum Threshold (max F1): {best_threshold} with F1: {round(best_f1, 3)}")

# Permutation feature importance on the held-out split: each feature column is
# shuffled 10 times (fixed seed) and the change in F1 is recorded. Values are
# rounded to 3 decimal places.
# NOTE(review): scoring='f1' scores model.predict(), i.e. the classifier's default
# decision rule, not the custom probability threshold used earlier in this script.
# Confirm that is the intended operating point.
perm_importance = permutation_importance(
    model,
    X_test_scaled,
    y_test,
    scoring='f1',
    n_repeats=10,
    random_state=42,
)
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance Mean': np.round(perm_importance.importances_mean, 3),
    'Importance Std': np.round(perm_importance.importances_std, 3),
})
# Tab-delimited text so the table can be pasted straight into a document.
importance_text = importance_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Permutation Feature Importance (F1):\n", importance_text)


# SHAP Feature Importance: mean absolute SHAP value per feature for the positive
# class, rounded to 3 decimal places.
# The background data is the training set summarised to 100 k-means centroids, and
# link='logit' makes the attributions additive in log-odds rather than probability.
# NOTE(review): no random seed is set for the KernelExplainer sampling here; results
# may vary slightly between runs. Confirm against the installed shap version.
explainer = shap.KernelExplainer(model.predict_proba, shap.kmeans(X_train_scaled, 100), link='logit')  # Summarize background
shap_values = explainer.shap_values(X_test_scaled, nsamples=100)  # Limit samples for speed
print(f"SHAP values shape: {np.array(shap_values).shape if isinstance(shap_values, list) else shap_values.shape}")  # Debug shape

# Reduce the explainer output to a 2-D (samples, features) array for the positive
# class. Depending on the shap version, a two-column predict_proba yields either a
# list with one array per class or a single (samples, features, classes) array;
# both are handled the same way so the reported importance always refers to class 1.
if isinstance(shap_values, list):
    if len(shap_values) != 2:
        raise ValueError(f"Expected SHAP values for 2 classes, got a list of length {len(shap_values)}")
    shap_values = shap_values[1]  # Use SHAP values for positive class
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3:
    if shap_values.shape[2] != 2:
        raise ValueError(f"Expected SHAP values for 2 classes, got shape {shap_values.shape}")
    shap_values = shap_values[:, :, 1]  # Use SHAP values for positive class
if shap_values.ndim != 2 or shap_values.shape[1] != len(features):
    raise ValueError(f"SHAP values shape ({shap_values.shape}) does not match number of features ({len(features)})")

# Mean absolute SHAP across samples gives one non-negative score per feature.
shap_importance = np.abs(shap_values).mean(axis=0)
shap_importance_rounded = np.round(shap_importance, 3).tolist()  # Round and convert to list
shap_df = pd.DataFrame({
    'Feature': features,
    'SHAP Importance': shap_importance_rounded
})
# Most influential features first.
shap_df = shap_df.sort_values('SHAP Importance', ascending=False)
shap_text = shap_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("SHAP Feature Importance (Mean Absolute):\n", shap_text)

Using custom threshold: 0.1
Accuracy: 0.971
F1 Score: 0.958
ROC-AUC: 0.996
Confusion Matrix (Custom Threshold):
 	Predicted 0	Predicted 1
Actual 0	113	5
Actual 1	0	57

Thresholding Table (Standardized):
 Threshold	Accuracy	Precision	Recall	F1	ROC-AUC
0.100	0.971	0.919	1.000	0.958	0.996
0.200	0.971	0.933	0.982	0.957	0.996
0.300	0.971	0.933	0.982	0.957	0.996
0.400	0.960	0.931	0.947	0.939	0.996
0.500	0.960	0.946	0.930	0.938	0.996
0.600	0.966	0.964	0.930	0.946	0.996
0.700	0.966	0.964	0.930	0.946	0.996
0.800	0.949	0.962	0.877	0.917	0.996
0.900	0.920	0.978	0.772	0.863	0.996

Optimum Threshold (max F1): 0.1 with F1: 0.958
Permutation Feature Importance (F1):
 Feature	Importance Mean	Importance Std
1	0.031	0.015
2	-0.000	0.004
3	0.027	0.009
4	0.052	0.013
5	-0.005	0.004
6	0.134	0.014
7	0.003	0.014
8	0.008	0.006
9	0.003	0.005





  0%|          | 0/175 [00:00<?, ?it/s]

SHAP values shape: (175, 9, 2)
SHAP Feature Importance (Mean Absolute):
 Feature	SHAP Importance
6	1.094
3	0.754
1	0.749
4	0.534
7	0.401
8	0.329
2	0.246
5	0.199
9	0.191

