In [12]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# Load and preprocess data
breast_cancer = fetch_ucirepo(id=15)
X = breast_cancer.data.features.replace('?', np.nan).apply(pd.to_numeric).dropna()
y = breast_cancer.data.targets.loc[X.index].replace({2:0, 4:1})

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Configure Random Forest
rf = RandomForestClassifier(n_estimators=10, max_depth=5, max_features='sqrt', oob_score=True, random_state=42)
rf.fit(X_train, y_train.values.ravel())

# Ensemble analysis
oob_score = rf.oob_score_
cv_scores = cross_val_score(rf, X_train, y_train.values.ravel(), cv=5)

# Predictions and evaluation
y_pred = rf.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Performance metrics
metrics = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred),
    'OOB Score': oob_score,
    'CV Mean Score': cv_scores.mean()
}

# Display results
print("Random Forest Performance:")
print(f"- Out-of-Bag Accuracy: {metrics['OOB Score']:.4f}")
print(f"- Cross-Validation Accuracy (5-fold): {metrics['CV Mean Score']:.4f}")
print(f"- Test Accuracy: {metrics['Accuracy']:.4f}")
print(f"- Precision: {metrics['Precision']:.3f} | Recall: {metrics['Recall']:.3f}")
print(f"- F1-Score: {metrics['F1-Score']:.3f}\n")

# Get feature importances and sort them
importances = rf.feature_importances_
sorted_idx = np.argsort(importances)[::-1] 

print("Top 5 Features:")
for i in sorted_idx[:5]:
    print(f"{X.columns[i]}: {importances[i]:.4f}")

pd.DataFrame(conf_matrix,
             index=['Actual Benign (TN/FP)', 'Actual Malignant (FN/TP)'],
             columns=['Predicted Benign', 'Predicted Malignant'])

  warn(


Random Forest Performance:
- Out-of-Bag Accuracy: 0.9570
- Cross-Validation Accuracy (5-fold): 0.9706
- Test Accuracy: 0.9649
- Precision: 0.922 | Recall: 0.983
- F1-Score: 0.952

Top 5 Features:
Uniformity_of_cell_shape: 0.3427
Bland_chromatin: 0.1566
Uniformity_of_cell_size: 0.1530
Bare_nuclei: 0.1197
Normal_nucleoli: 0.0805


Unnamed: 0,Predicted Benign,Predicted Malignant
Actual Benign (TN/FP),106,5
Actual Malignant (FN/TP),1,59
