In [3]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

# --- Data loading and cleaning ---------------------------------------------
# Fetch UCI dataset id 15. '?' placeholders are treated as missing values,
# every feature column is converted to a numeric dtype, and rows with any
# missing value are dropped.
breast_cancer = fetch_ucirepo(id=15)
X = breast_cancer.data.features
X = X.replace('?', np.nan)
X = X.apply(pd.to_numeric)
X = X.dropna()

# Keep only the targets whose rows survived cleaning, then relabel the class
# codes 2 -> 0 and 4 -> 1 (displayed as benign / malignant in the table at
# the end of this cell).
y = breast_cancer.data.targets.loc[X.index]
y = y.replace({2: 0, 4: 1})

# --- Train / test split ----------------------------------------------------
# 75/25 split, stratified on the label and seeded for reproducibility.
# Features are passed through unscaled: decision trees do not need scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# --- Model fitting ---------------------------------------------------------
# fit() returns the estimator itself, so construction and training are
# chained. The label frame is flattened to 1-D before being handed over.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=20,
    random_state=42,
).fit(X_train, y_train.values.ravel())

# --- Feature importances ---------------------------------------------------
# Importances are labelled with the feature column names; the five largest
# are kept for the report.
feature_imp = pd.Series(dt.feature_importances_, index=X.columns)
top_features = feature_imp.sort_values(ascending=False).head(5)

# --- Evaluation on the held-out set ----------------------------------------
y_pred = dt.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)

# Every scorer is called as scorer(y_test, y_pred); the mapping's insertion
# order determines the key order of `metrics`.
_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}
metrics = {label: scorer(y_test, y_pred) for label, scorer in _scorers.items()}
metrics['Top Features'] = top_features

# --- Report ----------------------------------------------------------------
print(
    "Decision Tree Performance:",
    f"- Accuracy: {metrics['Accuracy']:.4f}",
    f"- Precision: {metrics['Precision']:.3f} | Recall: {metrics['Recall']:.3f}",
    f"- F1-Score: {metrics['F1-Score']:.3f}\n",
    sep="\n",
)

print(
    "Top 5 Important Features:",
    metrics['Top Features'].to_string(),
    sep="\n",
)

# Confusion matrix as a labelled table (rows: actual class, columns:
# predicted class). The expression is intentionally left unassigned as the
# last statement so that a notebook displays it as the cell's result.
pd.DataFrame(
    data=conf_matrix,
    columns=['Predicted Benign', 'Predicted Malignant'],
    index=['Actual Benign (TN/FP)', 'Actual Malignant (FN/TP)'],
)

Decision Tree Performance:
- Accuracy: 0.9532
- Precision: 0.882 | Recall: 1.000
- F1-Score: 0.938

Top 5 Important Features:
Uniformity_of_cell_shape    0.816252
Bare_nuclei                 0.123832
Uniformity_of_cell_size     0.044221
Marginal_adhesion           0.015695
Clump_thickness             0.000000


Unnamed: 0,Predicted Benign,Predicted Malignant
Actual Benign (TN/FP),103,8
Actual Malignant (FN/TP),0,60
