In [None]:
# random_forest_tuned
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from sklearn.inspection import permutation_importance
import io

# Read the cleaned dataset; the first CSV row supplies the column names.
df = pd.read_csv('breast-cancer-wisconsin_cleaned.csv', header=0)

# Columns '1'..'9' are the predictors; column '10' is the label, which the
# rest of this script expects to be binary 0/1.
features = [str(col) for col in range(1, 10)]
X, y = df[features], df['10']

# Hold out 25% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply that same transform
# to both partitions so no test-set statistics enter the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyper-parameters of the re-optimized forest, gathered in one mapping so the
# tuned configuration can be read (and logged) at a glance.
rf_params = {
    'n_estimators': 10,
    'max_depth': 5,
    'min_samples_split': 10,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'bootstrap': False,
    'class_weight': 'balanced',
    'random_state': 42,
}

# fit() returns the estimator itself, so construction and training chain.
best_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

# Predicted probability of the positive class for every held-out sample.
# NOTE(review): column 1 is the positive class only if best_model.classes_
# is [0, 1] — confirm the target encoding of column '10'.
y_prob = best_model.predict_proba(X_test_scaled)[:, 1]

# Set custom threshold (chosen by the author as the optimum for recall).
custom_threshold = 0.5
print(f"Using custom threshold: {custom_threshold}")

# Predict with custom threshold: strictly above the cut-off -> class 1.
y_pred = np.where(y_prob > custom_threshold, 1, 0)

# Metrics with custom threshold (rounded to 3 decimal places).
# zero_division=0 is passed to every ratio-based metric so that a degenerate
# prediction (no positives predicted) yields 0 consistently instead of a
# warning for F1 only.
accuracy = round(accuracy_score(y_test, y_pred), 3)
f1 = round(f1_score(y_test, y_pred, zero_division=0), 3)
# ROC-AUC is computed from the raw probabilities, not the thresholded labels.
roc_auc = round(roc_auc_score(y_test, y_prob), 3)
precision = round(precision_score(y_test, y_pred, zero_division=0), 3)
recall = round(recall_score(y_test, y_pred, zero_division=0), 3)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC-AUC:", roc_auc)

# Confusion Matrix with custom threshold (tab-delimited for Word conversion).
# labels=[0, 1] pins the matrix to 2x2 even when one class is absent from
# y_test/y_pred; otherwise the fixed row/column names below would not fit.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
cm_text = cm_df.to_csv(sep='\t', index=True, header=True, float_format='%.0f')
print("Confusion Matrix (Custom Threshold):\n", cm_text)

# Thresholding Table for reference (tab-delimited for Word conversion, rounded to 3 decimal places).
# NOTE: the sweep is scored on the test split (y_test / y_prob), so the
# reported optimum is tuned on the same data it is evaluated on.
#
# linspace + round gives exactly [0.1, 0.2, ..., 0.9]; np.arange with a float
# step accumulates error (e.g. 0.30000000000000004), which would leak into
# both the comparison below and the printed optimum.
thresholds = np.round(np.linspace(0.1, 0.9, 9), 1)

# ROC-AUC depends only on the probabilities, not on the cut-off, so it is
# computed once and repeated in every row of the table.
roc_auc_thresh = round(roc_auc_score(y_test, y_prob), 3)

threshold_results = []
best_f1 = 0          # rounded F1 of the best threshold (for reporting)
best_f1_raw = 0.0    # unrounded F1 used for the actual comparison
best_threshold = 0.5  # Default, kept if no threshold achieves F1 > 0
for threshold in thresholds:
    threshold = float(threshold)  # plain float for clean printing
    y_pred_thresh = np.where(y_prob > threshold, 1, 0)
    acc_thresh = round(accuracy_score(y_test, y_pred_thresh), 3)
    precision_thresh = round(precision_score(y_test, y_pred_thresh, zero_division=0), 3)
    recall_thresh = round(recall_score(y_test, y_pred_thresh, zero_division=0), 3)
    f1_raw = f1_score(y_test, y_pred_thresh, zero_division=0)
    f1_thresh = round(f1_raw, 3)
    threshold_results.append([threshold, acc_thresh, precision_thresh, recall_thresh, f1_thresh, roc_auc_thresh])
    # Compare unrounded scores so two thresholds that merely round to the
    # same 3-decimal F1 are still ranked correctly; ties keep the earlier
    # (lower) threshold.
    if f1_raw > best_f1_raw:
        best_f1_raw = f1_raw
        best_f1 = f1_thresh
        best_threshold = threshold
threshold_df = pd.DataFrame(threshold_results, columns=['Threshold', 'Accuracy', 'Precision', 'Recall', 'F1', 'ROC-AUC'])
threshold_text = threshold_df.to_csv(sep='\t', index=False, header=True, float_format='%.3f')
print("Thresholding Table (Standardized):\n", threshold_text)
print(f"Optimum Threshold (max F1): {best_threshold} with F1: {best_f1}")

# Impurity-based importances reported by the fitted forest, one per entry in
# `features`, rounded to 3 decimal places and emitted as a tab-delimited table.
rf_importance_df = pd.DataFrame(
    {
        'Feature': features,
        'Feature Importance': np.round(best_model.feature_importances_, 3),
    }
)
rf_importance_text = rf_importance_df.to_csv(
    sep='\t',
    index=False,
    header=True,
    float_format='%.3f',
)
print("Random Forest Feature Importance:\n", rf_importance_text)

# Permutation importance on the held-out split, scored by F1: each feature
# column is shuffled 10 times (fixed seed) and the mean/std of the resulting
# score change is reported, rounded to 3 decimal places.
perm_importance = permutation_importance(
    best_model,
    X_test_scaled,
    y_test,
    scoring='f1',
    n_repeats=10,
    random_state=42,
)
importance_df = pd.DataFrame(
    {
        'Feature': features,
        'Importance Mean': np.round(perm_importance.importances_mean, 3),
        'Importance Std': np.round(perm_importance.importances_std, 3),
    }
)
importance_text = importance_df.to_csv(
    sep='\t',
    index=False,
    header=True,
    float_format='%.3f',
)
print("Permutation Feature Importance (F1):\n", importance_text)

Using custom threshold: 0.5
Accuracy: 0.971
Precision: 0.919
Recall: 1.0
F1 Score: 0.958
ROC-AUC: 0.993
Confusion Matrix (Custom Threshold):
 	Predicted 0	Predicted 1
Actual 0	113	5
Actual 1	0	57

Thresholding Table (Standardized):
 Threshold	Accuracy	Precision	Recall	F1	ROC-AUC
0.100	0.920	0.803	1.000	0.891	0.993
0.200	0.954	0.877	1.000	0.934	0.993
0.300	0.954	0.877	1.000	0.934	0.993
0.400	0.966	0.905	1.000	0.950	0.993
0.500	0.971	0.919	1.000	0.958	0.993
0.600	0.971	0.948	0.965	0.957	0.993
0.700	0.949	0.944	0.895	0.919	0.993
0.800	0.943	0.943	0.877	0.909	0.993
0.900	0.903	1.000	0.702	0.825	0.993

Optimum Threshold (max F1): 0.5 with F1: 0.958
Random Forest Feature Importance:
 Feature	Feature Importance
1	0.019
2	0.464
3	0.130
4	0.004
5	0.073
6	0.066
7	0.162
8	0.077
9	0.004

Permutation Feature Importance (F1):
 Feature	Importance Mean	Importance Std
1	0.002	0.004
2	0.030	0.006
3	0.045	0.013
4	0.000	0.000
5	0.013	0.009
6	0.204	0.016
7	0.004	0.008
8	0.021	0.009
9	0.002	0.003

