In [2]:
# Run this cell in the same notebook where you saw the error
# Purpose: install the packages this notebook imports into the environment of
# the interpreter that is running this kernel.
import sys
# Show which interpreter the kernel is using, so the install target is visible.
print("Python executable:", sys.executable)

# Install scikit-learn and commonly used libs (pandas, matplotlib)
# `!` runs a shell command and `{sys.executable}` is interpolated by IPython, so
# pip is invoked through the kernel's own interpreter rather than whichever
# `pip` happens to be first on PATH. pip itself is upgraded first.
# NOTE(review): no versions are pinned, so a later re-run may install different
# releases than the ones shown in the recorded output — consider pinning.
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install scikit-learn pandas matplotlib


Python executable: /opt/anaconda3/envs/ML-Homework-1/bin/python
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp312-cp312-macosx_12_0_arm64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m6.1 MB/s[0m  [33m0:00:01[0m eta [36m0:00:01[0m
[?25hUsing cached joblib-1.5.2-py3-none-any.whl (308 kB)
Downloading scipy-1.16.2-cp312-cp312-macosx_14_0_arm64.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m6.2 MB/s[0m  [33m0:00:03[0mm0:00:01[0m00:01[0m
[?25hD

In [11]:
# Imports (third-party), ordered by module name.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Q1: Use sklearn.tree.DecisionTreeClassifier on the Iris dataset.
# This cell loads the data and produces the train/test split.
iris = load_iris()
X = iris.data
y = iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Hold out 30% of the samples for testing. `stratify=y` preserves the class
# proportions in both splits, and the fixed `random_state` makes the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    stratify=y,
    random_state=42,
)

print(f"Loaded Iris dataset. Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")


Loaded Iris dataset. Train size: 105, Test size: 45


In [12]:
# Q2: Fit one decision tree per depth limit (max_depth = 1, 2, 3).
from sklearn.tree import DecisionTreeClassifier

depths = [1, 2, 3]

# Maps each max_depth value to the classifier trained with that limit.
models = {}
for d in depths:
    # fit() returns the fitted estimator, so building and training chain into
    # one expression. The fixed random_state keeps the trees repeatable.
    clf = DecisionTreeClassifier(random_state=42, max_depth=d).fit(X_train, y_train)
    models[d] = clf

print("Trained DecisionTreeClassifier models for max_depth =", depths)


Trained DecisionTreeClassifier models for max_depth = [1, 2, 3]


In [13]:
# Q3: Tabulate training and test accuracy for every fitted depth.
import pandas as pd
from sklearn.metrics import accuracy_score

rows = []
for d, clf in sorted(models.items()):
    # Score the same tree on the data it was fitted on and on the held-out data.
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    rows.append(dict(max_depth=d, train_accuracy=train_acc, test_accuracy=test_acc))

# One row per depth, ordered and indexed by max_depth.
results_df = (
    pd.DataFrame(rows)
    .sort_values("max_depth")
    .set_index("max_depth")
)

print("Train and Test accuracies (rounded):")
print(results_df.round(4))


Train and Test accuracies (rounded):
           train_accuracy  test_accuracy
max_depth                               
1                  0.6667         0.6667
2                  0.9714         0.8889
3                  0.9810         0.9778


In [14]:
# Q4: signs of underfitting vs overfitting
"""
Signs of underfitting
- Low train accuracy (model can't even fit training data).
  Example: max_depth = 1 -> train ≈ 66.7%.
- Train ≈ Test and both low. If both are low and close, the model has high bias
  and is too simple.
- Simple decision boundaries, high residual errors.
- Learning curves: training and validation curves both low and close (flat, low
  performance as data increases).

Remedies for underfitting:
- Increase model capacity (higher max_depth), add informative features,
  remove excessive regularization, or use more expressive models.

Signs of overfitting
- Large positive gap: train accuracy ≫ test accuracy (e.g., train ~100%, test much lower).
- High variance in cross-validation: large CV std or CV mean much lower than training score.
- Learning curves: training score high, validation score low and not improving with more data.
- Complex decision boundaries that conform to noise.

Remedies for overfitting:
- Reduce complexity (lower max_depth), prune the tree (ccp_alpha), increase
  min_samples_leaf / min_samples_split.
- Use cross-validation (GridSearchCV) to select hyperparameters.
- Gather more data or use ensemble methods (RandomForest) to reduce variance.

Signs of good fit (balanced)
- High train and test accuracy and small gap (e.g., train ≈ test ≈ high).
- Cross-validation mean ≈ train/test and low std.
- Learning curves: both curves high and converging.
"""


def diagnose_fit(train_acc: float, test_acc: float,
                 low_score: float = 0.80, max_gap: float = 0.10) -> str:
    """Return the diagnosis line for one train/test accuracy pair.

    The rules mirror the notes above, with the cut-offs exposed as parameters
    instead of being buried in the branch conditions.

    Args:
        train_acc: Accuracy on the training split, in [0, 1].
        test_acc: Accuracy on the held-out test split, in [0, 1].
        low_score: Accuracies strictly below this value count as "low".
        max_gap: Largest train-minus-test difference still treated as "similar".

    Returns:
        The message to print, including its leading indent and arrow.
    """
    gap = train_acc - test_acc  # positive => train > test

    if train_acc < low_score and test_acc < low_score:
        return "  → Diagnosis: UNDERFITTING (both train & test are low). Action: increase complexity / add features."
    if gap > max_gap:
        return "  → Diagnosis: OVERFITTING (train much higher than test). Action: prune, reduce max_depth, or increase min_samples_leaf."
    # Only claim "both high and similar" when that is actually true. Without
    # this check, a pair such as train=0.82/test=0.75 (one score low) or
    # train=0.70/test=0.85 (test far above train) would be labelled a good fit.
    if min(train_acc, test_acc) >= low_score and abs(gap) <= max_gap:
        return "  → Diagnosis: GOOD FIT (train & test are both reasonably high and similar)."
    return "  → Diagnosis: INCONCLUSIVE (scores match none of the patterns above). Action: re-check with cross-validation."


for d in sorted(models):
    clf = models[d]
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc  = accuracy_score(y_test, clf.predict(X_test))
    gap = train_acc - test_acc  # positive => train > test

    print(f"\nmax_depth = {d} | train = {train_acc:.4f} | test = {test_acc:.4f} | gap = {gap:.4f}")
    print(diagnose_fit(train_acc, test_acc))
        



max_depth = 1 | train = 0.6667 | test = 0.6667 | gap = 0.0000
  → Diagnosis: UNDERFITTING (both train & test are low). Action: increase complexity / add features.

max_depth = 2 | train = 0.9714 | test = 0.8889 | gap = 0.0825
  → Diagnosis: GOOD FIT (train & test are both reasonably high and similar).

max_depth = 3 | train = 0.9810 | test = 0.9778 | gap = 0.0032
  → Diagnosis: GOOD FIT (train & test are both reasonably high and similar).
