In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the iris dataset that ships with scikit-learn. Later cells read
# `iris.data` and `iris.target` from the object returned here.
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
class Node():
    """One node of a binary decision tree.

    A node is either a leaf or an internal split:

    * leaf: ``value`` holds the predicted class; ``left``/``right`` are None.
    * internal: ``value`` is None and a sample is routed to ``left`` when
      ``sample[feature_index] <= threshold``, otherwise to ``right``.
    """

    def __init__(self, predicted_class=None, feature=None, threshold=None,
                 left=None, right=None, info_gain=0):
        """Create a leaf (pass ``predicted_class``) or a split node.

        Args:
            predicted_class: class label returned by a leaf; leave as None for
                an internal node.
            feature: index of the feature column the split tests.
            threshold: split value compared against that feature.
            left: child node for samples with ``feature <= threshold``.
            right: child node for samples with ``feature > threshold``.
            info_gain: optional impurity reduction recorded for the split.
        """
        self.feature_index = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain

        # for the leaf node, majority class of the leaf node (predicted class).
        # It must stay None on internal nodes: tree traversal treats any
        # non-None value (including class 0) as "this is a leaf".
        self.value = predicted_class

In [None]:
def get_gini_impurity(y):
    """Return the Gini impurity of the labels in ``y``.

    Computed as ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of entries
    of ``y`` that belong to class ``k``. A single-class input gives 0.
    """
    _, class_counts = np.unique(y, return_counts=True)
    class_fractions = class_counts / len(y)
    return 1 - (class_fractions ** 2).sum()

In [None]:
def find_best_split(X, y):
    """Search every feature/threshold pair for the lowest weighted Gini split.

    Each distinct value of each column of ``X`` is tried as a threshold; rows
    with ``value <= threshold`` go left and the rest go right. Candidates that
    leave one side empty are ignored. The score of a split is the sample-count
    weighted mean of the two sides' Gini impurities, and on ties the first
    candidate encountered wins.

    Returns:
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate separates the rows into two non-empty groups.
    """
    n_total = len(y)
    lowest_gini = float('inf')
    split_feature = None
    split_threshold = None

    for feature in range(X.shape[1]):
        column = X[:, feature]
        for candidate in np.unique(column):
            goes_left = column <= candidate
            goes_right = column > candidate
            n_left = np.sum(goes_left)
            n_right = np.sum(goes_right)

            # a usable split needs at least one sample on each side
            if n_left == 0 or n_right == 0:
                continue

            gini_left = get_gini_impurity(y[goes_left])
            gini_right = get_gini_impurity(y[goes_right])
            weighted_gini = (n_left * gini_left + n_right * gini_right) / n_total

            # strict comparison: keep the earliest candidate among equals
            if weighted_gini < lowest_gini:
                lowest_gini = weighted_gini
                split_feature, split_threshold = feature, candidate

    return split_feature, split_threshold

In [None]:
def _majority_leaf(y):
    """Return a leaf node that predicts the most frequent label in ``y``."""
    predicted = np.argmax(np.bincount(y))
    leaf = Node(predicted)
    # Assign explicitly so the leaf carries its prediction regardless of what
    # the constructor does with its argument.
    leaf.value = predicted
    return leaf


def build_tree(X, y, depth=0, max_depth=5, min_samples_split=2):
    """Recursively grow a decision tree and return its root node.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: 1-D array of non-negative integer class labels (as required by
            ``np.bincount``), one per row of ``X``.
        depth: depth of the node being built; callers normally leave it at 0.
        max_depth: nodes at this depth become leaves.
        min_samples_split: nodes with fewer samples than this become leaves.

    Returns:
        A ``Node``. Leaves have ``value`` set to the majority class; internal
        nodes have ``value`` None and ``feature_index``, ``threshold``,
        ``left`` and ``right`` filled in.
    """
    n_samples = X.shape[0]
    n_classes = len(np.unique(y))

    # stopping criteria
    if depth >= max_depth or n_samples < min_samples_split or n_classes == 1:
        return _majority_leaf(y)

    best_feature, best_threshold = find_best_split(X, y)
    if best_feature is None:
        # No threshold separates the rows (e.g. identical feature vectors with
        # different labels), so the node cannot be split any further.
        return _majority_leaf(y)

    left_indices = X[:, best_feature] <= best_threshold
    right_indices = X[:, best_feature] > best_threshold

    left_child = build_tree(X[left_indices], y[left_indices], depth + 1, max_depth, min_samples_split)
    right_child = build_tree(X[right_indices], y[right_indices], depth + 1, max_depth, min_samples_split)

    # Populate the split through attributes so this works with a Node whose
    # constructor only takes the predicted class.
    node = Node(None)
    node.feature_index = best_feature
    node.threshold = best_threshold
    node.left = left_child
    node.right = right_child
    # value must be None on internal nodes; a non-None value marks a leaf.
    node.value = None
    return node

In [None]:
def predict(node, X):
    """Classify a single sample ``X`` by descending the tree from ``node``.

    At every internal node (``value`` is None) the sample moves to the left
    child when ``X[feature_index] <= threshold`` and to the right child
    otherwise. The ``value`` of the first node whose ``value`` is not None is
    returned.
    """
    current = node
    while current.value is None:
        goes_left = X[current.feature_index] <= current.threshold
        current = current.left if goes_left else current.right
    return current.value

In [None]:
class DecisionTree:
    """Thin estimator-style wrapper around ``build_tree`` and ``predict``."""

    def __init__(self, max_depth=5, min_samples_split=2):
        """Store the growth limits; no tree exists until ``fit`` is called.

        Args:
            max_depth: depth limit forwarded to ``build_tree``.
            min_samples_split: minimum node size forwarded to ``build_tree``.
        """
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """Grow the tree on ``X``/``y`` and keep its root in ``self.root``."""
        # Pass the limits by keyword: build_tree's third positional parameter
        # is `depth`, so positional passing would shift max_depth into depth
        # and min_samples_split into max_depth.
        self.root = build_tree(
            X,
            y,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
        )

    def predict(self, X):
        """Return a list with one predicted class per row of ``X``."""
        return [predict(self.root, x) for x in X]


In [None]:
from sklearn.model_selection import train_test_split

# Inputs and targets come from the iris object loaded in an earlier cell.
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # Split data: test_size=0.3 held out, fixed seed for repeatability

# Fit the hand-written tree on the training part; max_depth=3 is the
# requested depth limit.
tree = DecisionTree(max_depth=3)
tree.fit(X_train, y_train)

# One predicted class per row of X_test (DecisionTree.predict returns a list).
predictions = tree.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Compare the tree's predictions with the held-out labels.
# precision/recall/F1 are requested with average='weighted' because the
# target has more than two classes.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, average='weighted')  # For multi-class
recall = recall_score(y_test, predictions, average='weighted')      # For multi-class
f1 = f1_score(y_test, predictions, average='weighted')            # For multi-class
conf_matrix = confusion_matrix(y_test, predictions)

# Report the scores, then the confusion matrix.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.4222222222222222
Precision: 0.1782716049382716
Recall: 0.4222222222222222
F1-Score: 0.25069444444444444
Confusion Matrix:
[[19  0  0]
 [13  0  0]
 [13  0  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
