<a href="https://colab.research.google.com/github/Sabirbinsakander/Data_Science_Practice_Folder/blob/main/Decision_tree_classifier_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


class Node:
    """A single node of a binary decision tree.

    A node is a leaf when ``klass`` is not None; otherwise it is an internal
    node that routes a sample by comparing one feature against ``threshold``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None, klass=None):
        """Create a node.

        Args:
            feature: Index of the feature this node tests (stored as
                ``feat_idx``); None for a leaf.
            threshold: Split threshold for the tested feature; None for a leaf.
            left: Child node for the left branch.
            right: Child node for the right branch.
            value: Optional extra payload (keyword-only). It is stored as-is
                on ``self.value`` and otherwise not interpreted by this class.
            klass: Class label carried by a leaf (keyword-only); None for an
                internal node.
        """
        # The constructor parameter is called `feature`, but the attribute is
        # `feat_idx`.
        self.feat_idx = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        # Previously accepted but silently dropped; keep it so callers that
        # pass `value=` can read it back.
        self.value = value
        self.klass = klass

class DecisionTree:
    """Binary decision-tree classifier that picks splits by information gain.

    Every internal node tests ``x[feat_idx] <= threshold``: samples satisfying
    the test go to the left child, all others to the right child. Leaves carry
    the majority class label of the training samples that reached them.
    """

    def __init__(self, tree=None, max_depth=None, min_sample_sz=None):
        """Store the hyperparameters and an optional pre-built tree.

        Args:
            tree: Root node of an already-built tree, or None until ``fit``
                has been called.
            max_depth: Nodes at this depth (root is depth 0) become leaves.
                None means the depth is not limited.
            min_sample_sz: A node holding fewer samples than this becomes a
                leaf. None means there is no minimum.
        """
        self.tree = tree
        self.max_depth = max_depth
        # NOTE: the attribute is `min_samples_sz` while the parameter is
        # `min_sample_sz`; both names are kept for backward compatibility.
        self.min_samples_sz = min_sample_sz

    def _should_stop(self, depth, num_samples, label_count):
        """Return True when the node being built must become a leaf.

        A hyperparameter left at None is treated as "no limit" instead of
        being compared against an int, which would raise a TypeError.
        """
        if label_count == 1:
            return True
        if self.max_depth is not None and depth >= self.max_depth:
            return True
        if self.min_samples_sz is not None and num_samples < self.min_samples_sz:
            return True
        return False

    def build_tree(self, dataset, depth=0):
        """Recursively grow the subtree for ``dataset``.

        Args:
            dataset: 2-D array whose last column holds the labels and whose
                remaining columns hold the features.
            depth: Depth of the node being built; the root is 0.

        Returns:
            A ``Node``: either a leaf carrying the majority class, or an
            internal node with the chosen split and its two subtrees.
        """
        x, y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_feats = x.shape

        label_count = np.unique(y).shape[0]

        # Termination conditions for the recursion.
        if self._should_stop(depth, num_samples, label_count):
            return Node(klass=self.major_class(y))

        max_gain = -1
        left_data, right_data, best_feat, best_threshold = None, None, None, None

        # Exhaustive search: every distinct value of every feature is tried
        # as a threshold, and the split with the highest gain is kept.
        for feat_idx in range(num_feats):
            feat_values = np.unique(dataset[:, feat_idx])
            for value in feat_values:
                left, right = self.split_data(dataset, feat_idx, value)
                # A split that leaves one side empty separates nothing.
                if left.shape[0] > 0 and right.shape[0] > 0:
                    y_left, y_right = left[:, -1], right[:, -1]
                    ig = self.info_gain(y, y_left, y_right)
                    if ig > max_gain:
                        max_gain = ig
                        left_data, right_data = left, right
                        best_feat, best_threshold = feat_idx, value

        # No valid split, or none that reduces entropy: emit a leaf.
        if max_gain <= 0:
            return Node(klass=self.major_class(y))

        # Recursively build both subtrees one level deeper.
        left_subtree = self.build_tree(left_data, depth + 1)
        right_subtree = self.build_tree(right_data, depth + 1)

        return Node(feature=best_feat, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def info_gain(self, parent, l_child, r_child):
        """Return the entropy reduction achieved by splitting ``parent``.

        The children's entropies are weighted by their share of the parent's
        samples and subtracted from the parent's entropy.
        """
        lw = l_child.shape[0] / parent.shape[0]
        rw = r_child.shape[0] / parent.shape[0]
        return self.entropy(parent) - (lw * self.entropy(l_child) + rw * self.entropy(r_child))

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        klasses = np.unique(y)
        sum_entropy = 0
        for k in klasses:
            p = np.count_nonzero(y == k) / y.shape[0]
            if p > 0:  # guard against log2(0)
                sum_entropy += -p * np.log2(p)

        return sum_entropy

    def split_data(self, dataset, feat_idx, threshold):
        """Partition the rows of ``dataset`` on one feature.

        Returns:
            ``(left, right)`` where ``left`` holds the rows whose feature
            value is ``<= threshold`` and ``right`` the rows where it is
            ``> threshold``.
        """
        left = dataset[dataset[:, feat_idx] <= threshold]
        right = dataset[dataset[:, feat_idx] > threshold]
        return left, right

    def major_class(self, y):
        """Return the most frequent label in ``y``.

        Ties go to the label that ``np.unique`` lists first, because
        ``np.argmax`` returns the first maximum.
        """
        classes, counts = np.unique(y, return_counts=True)
        idx = np.argmax(counts)
        return classes[idx]

    def fit(self, x, y):
        """Build the tree from features ``x`` and labels ``y``.

        Args:
            x: 2-D array-like of shape (n_samples, n_features).
            y: Labels, either 1-D of length n_samples or 2-D of shape
                (n_samples, 1).
        """
        # Accept lists and other array-likes, not only ndarrays.
        x = np.asarray(x)
        y = np.asarray(y)
        # Fixed-width string labels would make np.concatenate promote the
        # numeric features to strings; object dtype keeps each value as-is.
        if y.dtype.kind in ("U", "S"):
            y = y.astype(object)
        # Labels must be a column so they can be appended to the features.
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        data = np.concatenate((x, y), axis=1)
        self.tree = self.build_tree(data)

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Returns:
            Array of shape (n_samples, 1) holding the predicted labels.
        """
        x = np.array(x)
        return np.array([self.traverse_tree(x_i, self.tree) for x_i in x]).reshape(-1, 1)

    def traverse_tree(self, x, node):
        """Follow one sample ``x`` from ``node`` down to a leaf; return its label."""
        if node.klass is not None:
            return node.klass
        if x[node.feat_idx] <= node.threshold:
            return self.traverse_tree(x, node.left)
        else:
            return self.traverse_tree(x, node.right)

    def print_tree(self, tree=None, indent=""):
        """Print the tree rooted at ``tree`` (default: the fitted tree).

        Args:
            tree: Node to start from; None means ``self.tree``.
            indent: Prefix printed before the ``left:`` / ``right:`` labels at
                this level; it grows by two spaces per level.
        """
        if tree is None:
            tree = self.tree
        if tree.klass is not None:
            print(tree.klass)
        else:
            print("X_" + str(tree.feat_idx), "<=", tree.threshold, "?")
            # end="" keeps the child's own first line on the same output line.
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + "  ")
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + "  ")

# Read the Iris CSV and keep every column except 'Id'
# (presumably a row identifier -- confirm against the CSV).
df = pd.read_csv('/content/sample_data/Iris.csv').loc[:, lambda frame: frame.columns != 'Id']

# All columns but the last are features; the last column is the label,
# reshaped into a column vector.
x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values.reshape(-1, 1)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a tree limited to depth 4 whose nodes need at least 3 samples to split.
classifier = DecisionTree(max_depth=4, min_sample_sz=3)
classifier.fit(x_train, y_train)

# Show the learned splits.
classifier.print_tree()

# Predict the held-out rows and report the share predicted correctly.
y_pred = classifier.predict(x_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

X_2 <= 1.9 ?
left:Iris-setosa
right:X_2 <= 4.7 ?
  left:X_3 <= 1.6 ?
    left:Iris-versicolor
    right:Iris-virginica
  right:X_3 <= 1.7 ?
    left:X_2 <= 4.9 ?
      left:Iris-versicolor
      right:Iris-virginica
    right:X_2 <= 4.8 ?
      left:Iris-virginica
      right:Iris-virginica
Accuracy: 1.0
