In [None]:
# Ch04-4 - Using Decision Trees to Explore Breast Cancer data

In [None]:
# Install the Seaborn plotting library (%pip installs into the running kernel's environment)
%pip install seaborn

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# Load the breast cancer dataset, then pull out the feature matrix (X)
# and the target labels (y) that the later cells work with
data = load_breast_cancer()
X = data.data
y = data.target

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps
# the split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Configure the decision tree, then fit it on the training split.
#   criterion    - split quality measure ('entropy' is the usual alternative)
#   max_depth    - cap on tree depth, used here to limit overfitting
#   random_state - fixed seed for reproducibility
dt_classifier = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=42)

# Left as the cell's final expression so the notebook displays the fitted estimator
dt_classifier.fit(X_train, y_train)

In [None]:
# Make predictions: class labels predicted by the fitted tree for the
# held-out test features. y_pred is reused by the metric, report and
# confusion-matrix cells.
y_pred = dt_classifier.predict(X_test)

In [None]:
# Headline metrics on the held-out test set, each printed to four decimals.
# NOTE(review): precision/recall/F1 are computed with scikit-learn's default
# pos_label=1; going by the dataset's target_names order, label 1 appears to be
# 'benign' -- confirm that this is the class these scores are meant to describe.
print("Decision Tree Performance Metrics:")
print("-" * 30)
print(
    "\n".join(
        f"{label}: {scorer(y_test, y_pred):.4f}"
        for label, scorer in (
            ("Accuracy", accuracy_score),
            ("Precision", precision_score),
            ("Recall", recall_score),
            ("F1 Score", f1_score),
        )
    )
)

In [None]:
# Per-class precision / recall / F1 table, labelled with the dataset's class names
print(
    "\nDetailed Classification Report:",
    classification_report(y_test, y_pred, target_names=data.target_names),
    sep="\n",
)

In [None]:
# Confusion matrix heatmap: rows are the true classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=data.target_names,
    yticklabels=data.target_names,
    ax=ax,
)
ax.set_title('Confusion Matrix for Decision Tree')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
fig.tight_layout()
plt.show()

In [None]:
# Feature importance chart: one horizontal bar per feature, sorted so the
# most important features end up at the top of the plot
feature_importance = dt_classifier.feature_importances_
sorted_idx = np.argsort(feature_importance)   # ascending order of importance
pos = np.arange(len(sorted_idx)) + 0.5        # bar centres along the y-axis

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels([data.feature_names[i] for i in sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('Decision Tree Feature Importance')
fig.tight_layout()
plt.show()

In [None]:
# Visualize the fitted decision tree: nodes are colour-filled (filled=True)
# and drawn with rounded corners (rounded=True).
#
# feature_names / class_names are converted to plain lists before being
# passed in: some scikit-learn releases validate these two parameters as
# `list` and reject other array-likes, while a list is accepted by every
# release. The drawing itself is unchanged.
plt.figure(figsize=(20, 10))
plot_tree(
    dt_classifier,
    feature_names=list(data.feature_names),
    class_names=list(data.target_names),
    filled=True,
    rounded=True,
)
plt.title('Decision Tree Classifier')
plt.show()

In [None]:
# Optional: 5-fold cross-validation for a more robust performance estimate.
# The estimator is scored on the full dataset (X, y) rather than on the single
# train/test split used above; the mean and spread of the per-fold scores are
# reported. cross_val_score is imported with the other scikit-learn imports in
# the import cell at the top of the notebook.
cv_scores = cross_val_score(dt_classifier, X, y, cv=5)
print("\nCross-Validation Scores:")
print(f"Mean CV Score: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")

In [None]:
## End of Notebook ##