In [2]:
# Day 7 - Decision Tree Classifier Example (Breast Cancer Dataset)

import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()

# Build a single DataFrame: named feature columns first, with the class
# label appended as the last column, "target"
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Quick sanity check: dimensions, first rows, and per-class counts
print(
    df.shape,
    df.head(),
    df["target"].value_counts(),
    sep="\n",
)


(569, 31)
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0   

In [3]:
# Separate the label column from the feature matrix
y = df["target"]                 # label
X = df.drop(columns="target")    # every remaining column is a feature

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions (approximately) equal in both splits, which matters when the
# classes are imbalanced; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (455, 30)
Test shape: (114, 30)


In [4]:
# Baseline: a decision tree with no growth constraints (likely to overfit)

# fit() returns the estimator itself, so construction and training are chained;
# the fixed seed yields the same tree on every run
dt_basic = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict on both splits so train and test performance can be compared
y_train_pred, y_test_pred = (
    dt_basic.predict(features) for features in (X_train, X_test)
)

print("Train Accuracy (basic):", accuracy_score(y_train, y_train_pred))
print("Test Accuracy  (basic):", accuracy_score(y_test, y_test_pred))

# Header and body are emitted by a single print call, separated by a newline
print(
    "\nClassification Report (Test - basic):",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

print(
    "\nConfusion Matrix (Test - basic):",
    confusion_matrix(y_test, y_test_pred),
    sep="\n",
)


Train Accuracy (basic): 1.0
Test Accuracy  (basic): 0.9122807017543859

Classification Report (Test - basic):
              precision    recall  f1-score   support

           0       0.85      0.93      0.89        42
           1       0.96      0.90      0.93        72

    accuracy                           0.91       114
   macro avg       0.90      0.92      0.91       114
weighted avg       0.92      0.91      0.91       114


Confusion Matrix (Test - basic):
[[39  3]
 [ 7 65]]


In [7]:
# Constrained decision tree: limits on depth and node sizes to curb overfitting

# fit() returns the estimator itself, so construction and training are chained
dt_tuned = DecisionTreeClassifier(
    criterion="gini",        # split-quality measure; "entropy" is the alternative
    max_depth=4,             # cap the depth of the tree
    min_samples_split=10,    # a node needs at least 10 samples to be split
    min_samples_leaf=5,      # every leaf must keep at least 5 samples
    random_state=42,         # same tree on every run
).fit(X_train, y_train)

# Predict on both splits so train and test performance can be compared
y_train_pred_tuned, y_test_pred_tuned = (
    dt_tuned.predict(features) for features in (X_train, X_test)
)

print("Train Accuracy (tuned):", accuracy_score(y_train, y_train_pred_tuned))
print("Test Accuracy  (tuned):", accuracy_score(y_test, y_test_pred_tuned))

# Header and body are emitted by a single print call, separated by a newline
print(
    "\nClassification Report (Test - tuned):",
    classification_report(y_test, y_test_pred_tuned),
    sep="\n",
)

print(
    "\nConfusion Matrix (Test - tuned):",
    confusion_matrix(y_test, y_test_pred_tuned),
    sep="\n",
)


Train Accuracy (tuned): 0.9758241758241758
Test Accuracy  (tuned): 0.9210526315789473

Classification Report (Test - tuned):
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        42
           1       0.94      0.93      0.94        72

    accuracy                           0.92       114
   macro avg       0.91      0.92      0.92       114
weighted avg       0.92      0.92      0.92       114


Confusion Matrix (Test - tuned):
[[38  4]
 [ 5 67]]
