<a href="https://colab.research.google.com/github/RehaanAzam369/Practice/blob/main/Decision_Tree_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from collections import Counter

1. Gini Impurity Function

In [3]:
def gini(y):
  """Return the Gini impurity of the labels in ``y``.

  The result is 1 minus the sum of the squared proportion of each distinct
  label, so a collection holding a single distinct label scores 0.0.
  An empty ``y`` returns 1.0 because nothing is subtracted.
  """
  total = float(len(y))
  remaining = 1.0
  # Subtract one squared class proportion at a time, in first-seen label order.
  for class_count in Counter(y).values():
    remaining -= (class_count / total) ** 2
  return remaining

2. Splitting the Dataset

In [5]:
def split_data(X, y, feature_indices, threshold):
  """Partition ``X`` and ``y`` by comparing one feature column to ``threshold``.

  Rows whose value in column ``feature_indices`` is <= ``threshold`` form the
  left part; rows whose value is > ``threshold`` form the right part. ``y`` is
  indexed with the same row positions as ``X``.

  Returns ``(X_left, X_right, y_left, y_right)``.
  """
  column = X[:, feature_indices]
  left_rows = np.nonzero(column <= threshold)
  # Computed with its own comparison rather than by negating the left test, so a
  # value for which both comparisons are False (e.g. NaN) ends up in neither part.
  right_rows = np.nonzero(column > threshold)
  return X[left_rows], X[right_rows], y[left_rows], y[right_rows]

3. Finding the best split

In [13]:
def best_split(X, y):
  """Find the feature/threshold pair with the lowest weighted Gini impurity.

  Every distinct value in every column of ``X`` is tried as a threshold.
  Candidates that leave either side empty are skipped. On ties the earliest
  candidate wins, because only a strictly lower impurity replaces the best.

  Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
  candidate yields two non-empty sides.
  """
  n_total = len(y)
  winner = (None, None)
  lowest_impurity = float("inf")
  for column in range(X.shape[1]):
    for candidate in np.unique(X[:, column]):
      # Only the label halves are needed for scoring; the feature halves are unused.
      _, _, labels_left, labels_right = split_data(X, y, column, candidate)
      n_left, n_right = len(labels_left), len(labels_right)
      if not (n_left and n_right):
        continue
      # Each side's impurity is weighted by its share of the samples.
      impurity = (n_left / n_total) * gini(labels_left) + (n_right / n_total) * gini(labels_right)
      if impurity < lowest_impurity:
        lowest_impurity = impurity
        winner = (column, candidate)
  return winner

4. Tree Node Class

In [8]:
class Node:
  """One node of the decision tree; the constructor only stores its arguments.

  ``left`` / ``right`` hold the child nodes, ``feature`` / ``threshold``
  describe the split, and ``value`` holds the class label. ``value`` is
  keyword-only. ``predict`` in this notebook treats a node whose ``value``
  is not None as a leaf.
  """

  def __init__(
      self,
      left=None,
      right=None,
      feature=None,
      threshold=None,
      *,
      value=None,
  ):
    # Child subtrees.
    self.left, self.right = left, right
    # Split description: which column is compared against which cut-off.
    self.feature, self.threshold = feature, threshold
    # Class label carried by the node (None when not supplied).
    self.value = value

5. Recursive Tree Builder

In [14]:
def build_tree(X, y, depth=0, max_depth=5):
  """Recursively grow a decision tree and return its root ``Node``.

  A leaf holding the most frequent label in ``y`` is returned when ``depth``
  has reached ``max_depth``, when ``y`` holds a single distinct label, or when
  ``best_split`` reports no usable split. Otherwise the data is divided with
  ``split_data`` and each part is grown one level deeper.
  """
  label_counts = Counter(y)
  # Among equally frequent labels, max() keeps the first one encountered.
  majority = max(label_counts, key=label_counts.get)
  is_pure = len(label_counts) == 1

  split_feature = split_threshold = None
  if not (depth >= max_depth or is_pure):
    split_feature, split_threshold = best_split(X, y)

  # No split was searched for, or none was found: stop with a leaf.
  if split_feature is None:
    return Node(value=majority)

  X_lo, X_hi, y_lo, y_hi = split_data(X, y, split_feature, split_threshold)
  next_depth = depth + 1
  return Node(
    left=build_tree(X_lo, y_lo, next_depth, max_depth),
    right=build_tree(X_hi, y_hi, next_depth, max_depth),
    feature=split_feature,
    threshold=split_threshold,
  )

6. Prediction for one sample

In [10]:
def predict(sample, tree):
  """Classify one sample by walking from ``tree`` down to a leaf.

  At every node whose ``value`` is None, ``sample[node.feature]`` is compared
  with ``node.threshold``: <= moves to the left child, anything else to the
  right child. The first non-None ``value`` reached is returned.
  """
  node = tree
  while node.value is None:
    goes_left = sample[node.feature] <= node.threshold
    node = node.left if goes_left else node.right
  return node.value

7. Prediction for many samples

In [11]:
def predict_batch(X, tree):
  """Return a list with the ``predict`` result for each sample obtained by iterating ``X``."""
  predictions = []
  for row in X:
    predictions.append(predict(row, tree))
  return predictions

Full Example Recap

In [15]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seed for the train/test shuffle so the split, and therefore the reported
# accuracy, is identical on every Restart & Run All. An accuracy recorded from an
# earlier unseeded run may differ from what this seeded split produces.
SPLIT_SEED = 42

iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.2, random_state=SPLIT_SEED
)

# Grow a depth-limited tree on the training part and score it on the held-out part.
tree = build_tree(X_train, y_train, max_depth=3)
y_pred = predict_batch(X_test, tree)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9
