# Midterm Project — Decision Trees on Breast Cancer Diagnostic (WDBC)

**Author:** Palden Arya  
**Task:** Supervised classification with numeric features and a categorical target.  
**Dataset:** Breast Cancer Wisconsin (Diagnostic) — loaded from `sklearn.datasets` (UCI source).

This notebook reproduces the full workflow required by the assignment:
- Data description
- Cleaning & preprocessing
- Preliminary analysis & EDA (plots)
- Three decision tree models (low complexity / high complexity / medium complexity tuned via grid search)
- Over/underfitting comparison
- Ten-percent-worse model (the smallest tree whose test accuracy stays within 0.10 of the unrestricted tree)
- Evaluation metrics & confidence intervals
- Brief interpretation

> Note: Figures and outputs are saved into `./figures` and `./outputs` for easy download.

In [None]:
# If running in Google Colab, uncomment the next line to install dependencies:
# !pip install scikit-learn matplotlib pandas reportlab

In [None]:
import os, math, json, numpy as np, pandas as pd, matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, confusion_matrix

# Create the output folders up front so later cells can write into them.
for out_dir in ("figures", "outputs"):
    os.makedirs(out_dir, exist_ok=True)

# Default size for every figure created below.
plt.rcParams["figure.figsize"] = (6, 4)

In [None]:
# Load the dataset as pandas objects and work on private copies.
data_bunch = load_breast_cancer(as_frame=True)
X_full, y_full = data_bunch.data.copy(), data_bunch.target.copy()
feature_names = X_full.columns.tolist()
target_mapping = {0: "malignant", 1: "benign"}

# One frame holding the features, the readable class name and the integer code.
df = X_full.assign(target=y_full.map(target_mapping), _target_int=y_full)

n_rows_cols = df.shape
class_tally = df["target"].value_counts().to_dict()
print("Rows/Cols:", n_rows_cols)
print("Classes:", class_tally)

In [None]:
# Coerce each feature column to numeric (unparseable entries become NaN);
# a column that ends up with gaps is filled with its own median.
for col in feature_names:
    coerced = pd.to_numeric(df[col], errors="coerce")
    df[col] = coerced.fillna(coerced.median()) if coerced.isna().any() else coerced

# Report whatever is still missing anywhere in the frame.
missing_counts = df.isna().sum()
still_missing = missing_counts[missing_counts > 0]
print("Missing values per column (should be 0):")
print("None" if still_missing.empty else still_missing)

In [None]:
# Per-feature summary statistics, one row per feature.
describe_df = df[feature_names].describe().transpose()
display(describe_df.head(12))

# Class frequencies and their share of all rows.
class_counts = (
    df["target"]
    .value_counts()
    .rename_axis("class")
    .reset_index(name="count")
)
total_rows = class_counts["count"].sum()
class_counts["proportion"] = class_counts["count"] / total_rows
display(class_counts)

In [None]:
def save_fig(path):
    """Tighten the current figure's layout, write it to *path*, then close it."""
    fig = plt.gcf()
    fig.tight_layout()
    fig.savefig(path, bbox_inches="tight")
    plt.close(fig)

# Class distribution
plt.figure()
plt.pie(class_counts["count"], labels=class_counts["class"], autopct="%1.1f%%")
plt.title("Class Distribution")
save_fig("figures/pie_class_distribution.png")

# Density of a key feature
plt.figure()
plt.hist(df["mean radius"], bins=30, density=True)
plt.xlabel("mean radius")
plt.ylabel("density")
plt.title("Density: mean radius")
save_fig("figures/density_mean_radius.png")

# Scatter matrix (subset); scatter_matrix opens its own figure.
subset_cols = ["mean radius", "mean texture", "mean perimeter", "mean area"]
pd.plotting.scatter_matrix(df[subset_cols], figsize=(8, 8))
plt.suptitle("Scatter Matrix (subset)", y=1.02)
save_fig("figures/scatter_matrix_subset.png")

# Boxplots vs target
class_names = ["malignant", "benign"]
for col in subset_cols:
    plt.figure()
    groups = [df.loc[df["target"] == cls, col].to_numpy() for cls in class_names]
    plt.boxplot(groups)
    # Name the boxes through the tick labels instead of boxplot's `labels=`
    # keyword, which newer Matplotlib deprecates in favour of `tick_labels=`;
    # xticks works on old and new versions (boxes sit at positions 1..N).
    plt.xticks(range(1, len(class_names) + 1), class_names)
    plt.title(f"Boxplot: {col} by target")
    plt.ylabel(col)
    save_fig(f"figures/boxplot_{col.replace(' ','_')}_by_target.png")

print("Saved figures to ./figures")

In [None]:
# Feature matrix and integer labels as NumPy arrays.
X = df[feature_names].to_numpy()
y = df["_target_int"].to_numpy()

# Stratified split holding out 25% of the rows, with a fixed seed.
split = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

In [None]:
# High-complexity model: no limits on tree growth.
tree_high = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Low-complexity model: depth 1 with at least 10 samples per leaf.
stump_settings = {"max_depth": 1, "min_samples_leaf": 10, "random_state": 42}
tree_low = DecisionTreeClassifier(**stump_settings)
tree_low.fit(X_train, y_train)

def eval_model(name, model):
    """Score a fitted model on the notebook-level train and test splits.

    Returns a dict with the model label plus plain and balanced accuracy
    for both ``X_train``/``y_train`` and ``X_test``/``y_test``.
    """
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    row = {"model": name}
    row["train_acc"] = accuracy_score(y_train, pred_train)
    row["test_acc"] = accuracy_score(y_test, pred_test)
    row["train_bal_acc"] = balanced_accuracy_score(y_train, pred_train)
    row["test_bal_acc"] = balanced_accuracy_score(y_test, pred_test)
    return row

# Score the two hand-configured trees; later cells append to `scores`.
baseline_models = [("high_unrestricted", tree_high), ("low_complexity", tree_low)]
scores = [eval_model(label, fitted) for label, fitted in baseline_models]
pd.DataFrame(scores)

In [None]:
# Hyperparameter grid for the "medium" tree.
param_grid = {
    "max_depth": [2, 3, 4, 5, None],
    "min_samples_split": [2, 4, 8],
    "min_samples_leaf": [1, 2, 4, 8],
    "criterion": ["gini", "entropy"],
}
# Shuffled, stratified 5-fold CV with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=1,
    refit=True,  # best_estimator_ is refit on the whole training split
    return_train_score=False,
)
grid.fit(X_train, y_train)
best_tree = grid.best_estimator_
best_params = grid.best_params_
best_cv_score = grid.best_score_

# Replace (rather than blindly append) the grid-search row so that
# re-running this cell does not leave duplicate "medium_grid_best" entries.
scores = [row for row in scores if row["model"] != "medium_grid_best"]
scores.append(eval_model("medium_grid_best", best_tree))
scores_df = pd.DataFrame(scores)
display(scores_df)
print("Best params:", best_params)
print("CV mean accuracy:", best_cv_score)

In [None]:
# Reference accuracy: test accuracy of the unrestricted tree.
is_high = scores_df["model"] == "high_unrestricted"
A = scores_df.loc[is_high, "test_acc"].iloc[0]
# Lowest test accuracy still accepted: 0.10 below the reference.
threshold = A - 0.10

def node_count(dt):
    """Return the ``node_count`` of the fitted tree structure ``dt.tree_``."""
    fitted_tree = dt.tree_
    return fitted_tree.node_count

# Fit one tree per (max_depth, min_samples_leaf) pair and keep those whose
# test accuracy reaches the threshold.
candidates = []
for d in [1, 2, 3, 4, 5, None]:
    depth_label = "None" if d is None else d
    for leaf in [1, 2, 4, 8, 16, 32]:
        m = DecisionTreeClassifier(max_depth=d, min_samples_leaf=leaf, random_state=42)
        m.fit(X_train, y_train)
        acc = accuracy_score(y_test, m.predict(X_test))
        if acc >= threshold:
            candidates.append(
                dict(d=depth_label, leaf=leaf, nodes=node_count(m), test_acc=acc, model=m)
            )

if not candidates:
    # Nothing reached the threshold: fall back to a depth-2 tree.
    ten_model = DecisionTreeClassifier(max_depth=2, random_state=42)
    ten_model = ten_model.fit(X_train, y_train)
    fallback_acc = accuracy_score(y_test, ten_model.predict(X_test))
    ten_meta = {"d": 2, "leaf": 1, "nodes": node_count(ten_model), "test_acc": fallback_acc, "note": "Fallback"}
else:
    # Fewest nodes first; ties go to the higher test accuracy.
    candidates.sort(key=lambda c: (c["nodes"], -c["test_acc"]))
    chosen = candidates[0]
    ten_model = chosen["model"]
    # Metadata is everything except the estimator object itself.
    ten_meta = dict(chosen)
    ten_meta.pop("model")

def metrics(name, model):
    """Compute test-set metrics for a fitted binary classifier.

    Predicts on the notebook-level ``X_test`` and scores against ``y_test``.

    Args:
        name: Label stored under the "model" key of the result.
        model: Fitted estimator exposing ``predict``.

    Returns:
        dict with accuracy, balanced accuracy, precision, recall, the
        false-negative count and the confusion matrix as nested lists.
    """
    ypred = model.predict(X_test)
    # Pin both labels so the matrix is always 2x2, even if only one class
    # shows up in y_test and ypred (otherwise it collapses to 1x1 and the
    # false-negative lookup below would fail).
    cm = confusion_matrix(y_test, ypred, labels=[0, 1])
    # Rows are true labels, columns are predictions: [1, 0] is "true 1,
    # predicted 0". NOTE(review): label 1 is "benign" in this notebook, so
    # this counts benign cases predicted malignant - confirm that is the
    # error of interest.
    fn = cm[1, 0]
    return {
        "model": name,
        "accuracy": accuracy_score(y_test, ypred),
        "balanced_accuracy": balanced_accuracy_score(y_test, ypred),
        "precision": precision_score(y_test, ypred, zero_division=0),
        "recall": recall_score(y_test, ypred, zero_division=0),
        "false_negatives": int(fn),
        "confusion_matrix": cm.tolist()
    }

# Test-set metrics per model; the list order fixes the row order shown below.
met_low = metrics("low_complexity", tree_low)
met_med = metrics("medium_grid_best", best_tree)
met_high = metrics("high_unrestricted", tree_high)
met_ten = metrics("ten_percent_worse", ten_model)
all_metrics = [met_low, met_med, met_high, met_ten]

# Everything worth keeping from the run, in JSON-friendly form.
summary = dict(
    scores=scores_df.to_dict(orient="records"),
    grid_best_params=best_params,
    grid_best_cv_mean_accuracy=float(best_cv_score),
    metrics=all_metrics,
    ten_percent_worse_meta=ten_meta,
)
with open("outputs/results_summary.json", "w") as out_file:
    json.dump(summary, out_file, indent=2)

display(pd.DataFrame(summary["metrics"]))
print("10%-worse meta:", ten_meta)
print("Saved outputs/results_summary.json")

In [None]:
def summarize_depth1(model, feat_names, labels=None):
    """Describe a depth-1 decision tree as a single if/else rule.

    Args:
        model: Fitted tree estimator exposing ``tree_`` (and, when available,
            ``classes_``).
        feat_names: Feature names indexed by the feature id stored in the tree.
        labels: Mapping from class label to display name. Defaults to
            ``{0: 'malignant', 1: 'benign'}``.

    Returns:
        ``{"note": "not depth=1"}`` when the tree's depth is not 1; otherwise
        a dict with "comparisons_made", "if_rule" and "else_rule".

    Raises:
        KeyError: If a predicted class has no entry in ``labels``.
    """
    # Build the default per call rather than sharing one mutable dict.
    if labels is None:
        labels = {0: 'malignant', 1: 'benign'}

    tr = model.tree_
    if tr.max_depth != 1:
        return {"note": "not depth=1"}

    # argmax yields a column index into the class axis; translate it through
    # classes_ so the lookup uses the real class label, not its position.
    classes = getattr(model, "classes_", None)

    def majority_name(node):
        col = int(np.argmax(tr.value[node][0]))
        cls = col if classes is None else classes[col]
        return labels[cls]

    # Node 0 is the root; its two children are the only leaves.
    fidx = tr.feature[0]
    thr = tr.threshold[0]
    left = majority_name(tr.children_left[0])
    right = majority_name(tr.children_right[0])
    return {
        "comparisons_made": 1,
        "if_rule": f"If {feat_names[fidx]} <= {thr:.4f} then class={left}",
        "else_rule": f"Else class={right}"
    }

# Summarise the low-complexity tree fitted earlier instead of refitting a
# second tree from copy-pasted hyperparameters, so the printed rule always
# describes the model that was actually evaluated.
low_rules = summarize_depth1(tree_low, feature_names)
low_rules

In [None]:
def bootstrap_ci_acc(model, Xte, yte, B=500, alpha=0.05, seed=0):
    """Percentile-bootstrap confidence interval for test accuracy.

    The model predicts once; the test rows are then resampled with
    replacement ``B`` times and accuracy is recomputed on each resample.

    Args:
        model: Fitted estimator exposing ``predict``.
        Xte: Test features, passed straight to ``model.predict``.
        yte: True labels (array, list or pandas Series), one per test row.
        B: Number of bootstrap resamples.
        alpha: Total tail mass; the interval covers ``1 - alpha``.
        seed: Seed for the NumPy random generator.

    Returns:
        ``(lower, upper)`` accuracy quantiles as Python floats.

    Raises:
        ValueError: If ``yte`` is empty or the predictions do not have the
            same shape as ``yte``.
    """
    # Positional NumPy view of the labels: lists become indexable and a
    # pandas Series is no longer indexed by its labels.
    y_true = np.asarray(yte)
    n = len(y_true)
    if n == 0:
        raise ValueError("bootstrap_ci_acc needs at least one test sample")

    preds = np.asarray(model.predict(Xte))
    if preds.shape != y_true.shape:
        raise ValueError("predictions and yte must have the same shape")

    # Per-row correctness never changes between resamples, so compute it
    # once; each resample's accuracy is the mean of the drawn entries.
    hits = preds == y_true
    rng = np.random.default_rng(seed)
    accs = [hits[rng.integers(0, n, n)].mean() for _ in range(B)]

    lower, upper = np.quantile(accs, [alpha / 2, 1 - alpha / 2])
    return float(lower), float(upper)

# Bootstrap intervals (default alpha=0.05) for the tuned and the weaker tree.
ci_med, ci_ten = (
    bootstrap_ci_acc(fitted, X_test, y_test) for fitted in (best_tree, ten_model)
)

for tag, interval in (("medium", ci_med), ("10%-worse", ci_ten)):
    print(f"95% CI (accuracy) — {tag}:", interval)

In [None]:
# Optional: Create a compact PDF report (uncomment to run)
# from reportlab.lib.pagesizes import letter
# from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, TableStyle
# from reportlab.lib.styles import getSampleStyleSheet
# from reportlab.lib import colors
# import json, os
#
# with open("outputs/results_summary.json") as f:
#     res = json.load(f)
# pdf_path = "outputs/Midterm_Report_PaldenArya.pdf"
# doc = SimpleDocTemplate(pdf_path, pagesize=letter)
# styles = getSampleStyleSheet()
# elements = [Paragraph("Data Science Midterm Project — Palden Arya", styles["Title"]), Spacer(1,12)]
# # (Add tables/figures similar to the script we used earlier)
# doc.build(elements)
# print("Saved PDF:", pdf_path)