In [None]:
# logistic_regression_tuned
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.inspection import permutation_importance
import statsmodels.api as sm
import io

# Load data
df = pd.read_csv('breast-cancer-wisconsin_cleaned.csv', header=0)  # Use header row as column names
features = ['1', '2', '3', '4', '5', '6', '7', '8', '9']  # Features 1 through 9 as string indices
X = df[features]
y = df['10']  # Target (should be binary 0/1)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Used tuned model with best parameters and L1 regularization.
model = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', max_iter=100, class_weight='balanced', random_state=42)
model.fit(X_train_scaled, y_train)

# Statsmodels summary
X_train_sm = sm.add_constant(X_train_scaled)  # Add intercept
logit_model = sm.Logit(y_train, X_train_sm).fit()
print(logit_model.summary())

# Predict probabilities
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Set custom threshold (optimum based on best recall)
custom_threshold = 0.1
print(f"Using custom threshold: {custom_threshold}")

# Predict with custom threshold
y_pred = np.where(y_prob > custom_threshold, 1, 0)

# Metrics with custom threshold (rounded to 3 decimal places)
accuracy = round(accuracy_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred), 3)
roc_auc = round(roc_auc_score(y_test, y_prob), 3)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

# Confusion Matrix with custom threshold (tab-delimited for Word conversion)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_text = cm_df.to_csv(sep='\t', index=True, header=True, float_format='%.0f')
print("Confusion Matrix (Custom Threshold 0.1):\n", cm_text)

# Thresholding Table for reference (tab-delimited for Word conversion, rounded to 3 decimal places)
thresholds = np.arange(0.1, 1.0, 0.1)  # [0.1, 0.2, ..., 0.9]
threshold_results = []
best_f1 = 0
best_threshold = 0.5  # Default
for threshold in thresholds:
    y_pred_thresh = np.where(y_prob > threshold, 1, 0)
    acc_thresh = round(accuracy_score(y_test, y_pred_thresh), 3)
    precision_thresh = round(precision_score(y_test, y_pred_thresh, zero_division=0), 3)
    recall_thresh = round(recall_score(y_test, y_pred_thresh, zero_division=0), 3)
    f1_thresh = round(f1_score(y_test, y_pred_thresh), 3)
    roc_auc_thresh = round(roc_auc_score(y_test, y_prob), 3)
    threshold_results.append([round(threshold, 3), acc_thresh, precision_thresh, recall_thresh, f1_thresh, roc_auc_thresh])
    if f1_thresh > best_f1:
        best_f1 = f1_thresh
        best_threshold = threshold
threshold_df = pd.DataFrame(threshold_results, columns=['Threshold', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC'])
# Ensure consistent tab-delimited output with standardized formatting
threshold_text = threshold_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Thresholding Table (Standardized):\n", threshold_text)
print(f"Optimum Threshold (max F1): {best_threshold} with F1: {best_f1}")

# Permutation Feature Importance with F1 scoring (rounded to 3 decimal places)
perm_importance = permutation_importance(model, X_test_scaled, y_test, n_repeats=10, random_state=42, scoring='f1')

# Combined Table (tab-delimited for Word conversion, rounded to 3 decimal places)
coefficients = logit_model.params[1:]  # Exclude intercept
p_values = logit_model.pvalues[1:]  # Exclude intercept
combined_df = pd.DataFrame({
    'Feature': features,
    'Coefficient': [round(val, 3) for val in coefficients],
    'P-Value': [round(val, 3) for val in p_values],
    'Importance Mean': [round(val, 3) for val in perm_importance.importances_mean],
    'Importance Std': [round(val, 3) for val in perm_importance.importances_std]
})
combined_text = combined_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Combined Coefficients, P-Values, and Permutation Importance:\n", combined_text)

Optimization terminated successfully.
         Current function value: 0.085135
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                     10   No. Observations:                  524
Model:                          Logit   Df Residuals:                      514
Method:                           MLE   Df Model:                            9
Date:                Sat, 09 Aug 2025   Pseudo R-squ.:                  0.8686
Time:                        00:00:17   Log-Likelihood:                -44.611
converged:                       True   LL-Null:                       -339.63
Covariance Type:            nonrobust   LLR p-value:                2.871e-121
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.2974      0.355     -3.655      0.000      -1.993      -0.602
x1             1.4203      0.

In [15]:
df[10]

0      10
1       0
2       0
3       0
4       0
       ..
695     0
696     0
697     1
698     1
699     1
Name: 10, Length: 700, dtype: int64