In [1]:
# --------------------------------------------
# Decision Tree (Manual Implementation)
# --------------------------------------------

# Toy dataset: one (study_hours, outcome) pair per student,
# where outcome 0 = fail and 1 = pass.
data = list(zip(
    [2, 4, 6, 8, 10],  # hours studied
    [0, 0, 1, 1, 1],   # outcome for the matching entry above
))

# Function to calculate Gini impurity
def calculate_gini(groups):
    """Return the size-weighted Gini impurity of a candidate split.

    Each non-empty group contributes ``1 - sum(p_c ** 2)`` taken over the
    class labels ``c`` that occur in it, weighted by that group's share of
    all samples. Labels are discovered from the data, so any hashable label
    set works (not only 0/1).

    Args:
        groups: Sequence of groups (e.g. ``[left, right]``). Each group is a
            sequence of rows whose element at index 1 is the class label.

    Returns:
        The weighted impurity as a float. It is 0.0 when every non-empty
        group is pure, and also 0.0 when all groups are empty.
    """
    # Local import keeps the function usable on its own.
    from collections import Counter

    total = sum(len(group) for group in groups)  # samples across all groups
    gini_value = 0.0

    for group in groups:
        size = len(group)
        if size == 0:
            continue  # an empty side adds nothing and would divide by zero

        # Count every label in a single pass rather than rebuilding the
        # label list once per class.
        label_counts = Counter(row[1] for row in group)
        score = sum((count / size) ** 2 for count in label_counts.values())

        # Weight this group's impurity by its share of the samples.
        gini_value += (1 - score) * (size / total)

    return gini_value


# Try all possible split points
split_points = [3, 5, 7, 9]
print("Gini impurity for different splits:\n")

# Track the winner while scanning so the summary below reports the computed
# result instead of a hard-coded answer that could go stale.
best_split = None
best_gini = float("inf")

for split in split_points:
    # Rows at or below the threshold go left, the rest go right.
    left_group = [row for row in data if row[0] <= split]
    right_group = [row for row in data if row[0] > split]
    gini_val = calculate_gini([left_group, right_group])
    print(f"If we split at {split} hours → Gini impurity = {gini_val:.3f}")

    # Strict '<' keeps the earliest threshold when two splits tie.
    if gini_val < best_gini:
        best_split, best_gini = split, gini_val

print(f"\nBest split is at {best_split} hours (lowest Gini = {best_gini:.3f})")
# NOTE: the Fail/Pass wording assumes the low-hours side is class 0, which
# holds for this dataset.
print(
    f"Meaning: Students who study ≤ {best_split} hours → Fail, "
    f"and > {best_split} hours → Pass."
)


# --------------------------------------------
# Optional: Cross-check using sklearn
# --------------------------------------------
print("\n--- Checking with sklearn ---")

from sklearn.tree import DecisionTreeClassifier, export_text
import numpy as np

# Same toy dataset, with the hours reshaped into a single-feature column
X = np.array([2, 4, 6, 8, 10]).reshape(-1, 1)
y = np.array([0, 0, 1, 1, 1])

# max_depth=1 limits the tree to one split, matching the single-threshold search
tree = DecisionTreeClassifier(criterion='gini', max_depth=1).fit(X, y)

# Render the fitted tree as indented text rules and print them
tree_rules = export_text(tree, feature_names=["Study Hours"])
print(tree_rules)


Gini impurity for different splits:

If we split at 3 hours → Gini impurity = 0.300
If we split at 5 hours → Gini impurity = 0.000
If we split at 7 hours → Gini impurity = 0.267
If we split at 9 hours → Gini impurity = 0.400

Best split is at 5 hours (lowest Gini = 0.000)
Meaning: Students who study ≤ 5 hours → Fail, and > 5 hours → Pass.

--- Checking with sklearn ---
|--- Study Hours <= 5.00
|   |--- class: 0
|--- Study Hours >  5.00
|   |--- class: 1

