<a href="https://colab.research.google.com/github/Sameersah/decision-trees-ensemble/blob/main/Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Decision Tree Implementation

Decision trees split data recursively to minimize impurity (e.g., Gini index or entropy).

Steps:

1. Calculate impurity: score candidate splits with a criterion such as Gini impurity or entropy.
2. Split data: find the feature and threshold that give the best split.
3. Build tree: recursively split until a stopping condition is met (e.g., maximum depth reached or no further improvement).
4. Make predictions: traverse the tree from the root to a leaf.

In [2]:
class DecisionTree:
    """Binary classification tree grown greedily with the Gini criterion.

    Internal nodes are dicts with the keys ``"feature"``, ``"threshold"``,
    ``"left"`` and ``"right"``; a sample goes left when
    ``x[feature] < threshold`` and right otherwise. Leaves store the majority
    class label of the training samples that reached them.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
                ``None`` grows the tree until every node is pure or cannot
                be split any further.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: Array-like of shape (n_samples, n_features).
            y: Array-like of shape (n_samples,) holding class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D with shape (n_samples, n_features)")
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError("y must be 1-D with one label per row of X")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on zero samples")
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the (non-empty) samples X, y."""
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        # A pure node cannot be improved; stopping here also guarantees
        # termination when max_depth is None.
        if depth_exhausted or np.all(y == y[0]):
            return self._majority_class(y)

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            # Labels are mixed but no feature separates the samples.
            return self._majority_class(y)

        left_mask = X[:, best_feature] < best_threshold
        right_mask = ~left_mask
        return {
            "feature": best_feature,
            "threshold": best_threshold,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def _majority_class(self, y):
        """Return the most frequent label in y (smallest label wins ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[np.argmax(counts)]

    def _find_best_split(self, X, y):
        """Return the (feature, threshold) with the lowest weighted Gini.

        Only splits that leave samples on both sides are considered.
        Returns ``(None, None)`` when no such split exists.
        """
        best_feature, best_threshold, best_impurity = None, None, float("inf")
        for feature in range(X.shape[1]):
            column = X[:, feature]
            # The smallest value would send nothing left under `<`, so skip it.
            for threshold in np.unique(column)[1:]:
                left_mask = column < threshold
                # Guards against NaN thresholds, which compare False everywhere.
                if not left_mask.any() or left_mask.all():
                    continue
                impurity = self._gini_impurity(y[left_mask], y[~left_mask])
                if impurity < best_impurity:
                    best_impurity = impurity
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _gini_impurity(self, left, right):
        """Return the size-weighted Gini impurity of two label groups."""
        def gini(group):
            if len(group) == 0:
                return 0.0
            # np.unique handles arbitrary labels, not just non-negative ints.
            _, counts = np.unique(group, return_counts=True)
            proportions = counts / len(group)
            return 1 - np.sum(proportions ** 2)

        total = len(left) + len(right)
        if total == 0:
            return 0.0
        return (len(left) * gini(left) + len(right) * gini(right)) / total

    def predict(self, X):
        """Predict a class label for every row of X.

        Args:
            X: Array-like of shape (n_samples, n_features).

        Returns:
            A NumPy array with one predicted label per row.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree must be fitted before calling predict")

        def traverse(x):
            node = self.tree
            # Walk down until a leaf (any non-dict value) is reached.
            while isinstance(node, dict):
                branch = "left" if x[node["feature"]] < node["threshold"] else "right"
                node = node[branch]
            return node

        return np.array([traverse(x) for x in np.asarray(X)])

# Demonstration: fit a depth-limited tree on the training split and report
# its accuracy on the test split.
# NOTE(review): X_train, y_train, X_test, y_test and accuracy_score are not
# defined in this cell; presumably an earlier cell builds the split and imports
# accuracy_score — confirm and run it first. The recorded output below is a
# NameError for X_train, so those names were not in scope when this ran.
dt = DecisionTree(max_depth=3)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


NameError: name 'X_train' is not defined