In [32]:
# linear_svm_tuned
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.calibration import CalibratedClassifierCV
import io

# Load data
df = pd.read_csv('breast-cancer-wisconsin_cleaned.csv', header=0)  # Use header row as column names
features = ['1', '2', '3', '4', '5', '6', '7', '8', '9']  # Features 1 through 9 as string indices
X = df[features]
y = df['10']  # Target (should be binary 0/1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use tuned model with best parameters and L2 regularization.
best_model = LinearSVC(C=0.5, max_iter=100, penalty='l2', dual=False, class_weight='balanced', random_state=42)
best_model.fit(X_train_scaled, y_train)

# RFE for feature importance.
rfe = RFE(estimator=best_model, n_features_to_select=1, step=1) 
rfe.fit(X_train_scaled, y_train)
rfe_ranking = pd.DataFrame({'Feature': features, 'Ranking': rfe.ranking_})
rfe_ranking = rfe_ranking.sort_values('Ranking')
rfe_text = rfe_ranking.to_csv(sep='\t', index=False)
print("RFE Feature Importance Ranking:\n", rfe_text)

# Predict probabilities (LinearSVC does not support probability=True, use CalibratedClassifierCV)
calibrated_model = CalibratedClassifierCV(best_model, method='sigmoid', cv='prefit')
calibrated_model.fit(X_train_scaled, y_train)
y_prob = calibrated_model.predict_proba(X_test_scaled)[:, 1]

# Set custom threshold (optimum based on best recall)
custom_threshold = 0.1
print(f"Using custom threshold: {custom_threshold}")

# Predict with custom threshold
y_pred = np.where(y_prob > custom_threshold, 1, 0)

# Metrics with custom threshold (rounded to 3 decimal places)
accuracy = round(accuracy_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred), 3)
roc_auc = round(roc_auc_score(y_test, y_prob), 3)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

# Confusion Matrix with custom threshold (tab-delimited for Word conversion)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_text = cm_df.to_csv(sep='\t', index=True, header=True, float_format='%.0f')
print("Confusion Matrix (Custom Threshold):\n", cm_text)

# Thresholding Table for reference (tab-delimited for Word conversion, rounded to 3 decimal places)
thresholds = np.arange(0.1, 1.0, 0.1)  # [0.1, 0.2, ..., 0.9]
threshold_results = []
best_f1 = 0
best_threshold = 0.5  # Default
for threshold in thresholds:
    y_pred_thresh = np.where(y_prob > threshold, 1, 0)
    acc_thresh = round(accuracy_score(y_test, y_pred_thresh), 3)
    precision_thresh = round(precision_score(y_test, y_pred_thresh, zero_division=0), 3)
    recall_thresh = round(recall_score(y_test, y_pred_thresh, zero_division=0), 3)
    f1_thresh = round(f1_score(y_test, y_pred_thresh), 3)
    roc_auc_thresh = round(roc_auc_score(y_test, y_prob), 3)
    threshold_results.append([round(threshold, 3), acc_thresh, precision_thresh, recall_thresh, f1_thresh, roc_auc_thresh])
    if f1_thresh > best_f1:
        best_f1 = f1_thresh
        best_threshold = threshold
threshold_df = pd.DataFrame(threshold_results, columns=['Threshold', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC'])
# Ensure consistent tab-delimited output with standardized formatting
threshold_text = threshold_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Thresholding Table (Standardized):\n", threshold_text)
print(f"Optimum Threshold (max F1): {best_threshold} with F1: {best_f1}")

# Permutation Feature Importance with F1 scoring (rounded to 3 decimal places)
perm_importance = permutation_importance(best_model, X_test_scaled, y_test, n_repeats=10, random_state=42, scoring='f1')
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance Mean': [round(val, 3) for val in perm_importance.importances_mean],
    'Importance Std': [round(val, 3) for val in perm_importance.importances_std]
})
importance_text = importance_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Permutation Feature Importance (F1):\n", importance_text)

RFE Feature Importance Ranking:
 Feature	Ranking
3	1
6	2
1	3
7	4
4	5
9	6
5	7
8	8
2	9

Using custom threshold: 0.1
Accuracy: 0.977
F1 Score: 0.966
ROC-AUC: 0.997
Confusion Matrix (Custom Threshold):
 	Predicted 0	Predicted 1
Actual 0	114	4
Actual 1	0	57

Thresholding Table (Standardized):
 Threshold	Accuracy	Precision	Recall	F1	ROC-AUC
0.100	0.977	0.934	1.000	0.966	0.997
0.200	0.977	0.949	0.982	0.966	0.997
0.300	0.960	0.946	0.930	0.938	0.997
0.400	0.966	0.964	0.930	0.946	0.997
0.500	0.966	0.981	0.912	0.945	0.997
0.600	0.954	0.980	0.877	0.926	0.997
0.700	0.949	0.980	0.860	0.916	0.997
0.800	0.937	0.979	0.825	0.895	0.997
0.900	0.920	1.000	0.754	0.860	0.997

Optimum Threshold (max F1): 0.1 with F1: 0.966




Permutation Feature Importance (F1):
 Feature	Importance Mean	Importance Std
1	0.047	0.018
2	0.000	0.000
3	0.022	0.015
4	0.008	0.006
5	0.001	0.003
6	0.065	0.024
7	0.015	0.009
8	0.000	0.000
9	0.005	0.004

