<a href="https://colab.research.google.com/github/NitinVerma2027/PRML-Apr2025/blob/main/decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

def load_mnist_csv(filename):
    """Load an MNIST-style CSV file into scaled pixel rows and integer labels.

    The file is read without a header row. Rows containing any missing
    value are discarded, then every remaining cell is converted to float.

    Args:
        filename: Path (or anything else ``pandas.read_csv`` accepts) of
            the CSV file.

    Returns:
        ``(images, labels)`` where ``labels`` is the first column cast to
        int and ``images`` is the remaining columns divided by 255.0.
        If the float conversion raises ``ValueError``, an error message is
        printed and ``(None, None)`` is returned instead.
    """
    # Drop incomplete rows up front so the numeric conversion sees no NaN.
    frame = pd.read_csv(filename, header=None).dropna()

    try:
        # Fails with ValueError when any cell is not parseable as a number.
        matrix = frame.astype(float).to_numpy()
    except ValueError:
        print("Error: CSV me non-numeric values hain!")
        return None, None

    # Column 0 holds the label; everything after it is pixel data.
    label_column, pixel_columns = matrix[:, 0], matrix[:, 1:]
    return pixel_columns.astype(float) / 255.0, label_column.astype(int)

# Compute the Shannon entropy (in bits) of a label array
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Args:
        y: Array-like of labels (anything ``np.unique`` accepts).

    Returns:
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is each
        label's relative frequency. The result is 0 for an empty or
        single-class input and is never negative.
    """
    _, counts = np.unique(y, return_counts=True)
    if counts.size == 0:
        # No samples means no uncertainty.
        return np.float64(0.0)

    # np.unique only reports labels that actually occur, so every count is
    # >= 1 and every probability is > 0: log2 never sees a zero and no
    # epsilon is needed. (An epsilon would bias the result and make the
    # entropy of a pure set slightly negative.)
    probabilities = counts / counts.sum()

    # Adding 0.0 turns the -0.0 produced for a pure set into +0.0.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

# Find the best split (feature and threshold) for a dataset
def best_split(X, y):
    """Find the (feature, threshold) pair with the highest information gain.

    Every distinct value of every column of ``X`` is considered as a
    threshold; rows with ``X[:, feature] < threshold`` go to the left side
    and all remaining rows go to the right side.

    Args:
        X: 2-D array with one row per sample and one column per feature.
        y: 1-D label array aligned with the rows of ``X``.

    Returns:
        ``(feature, threshold)`` for the split with the largest gain, or
        ``(None, None)`` when no split has a strictly positive gain.
    """
    best_feature, best_threshold, best_gain = None, None, 0
    n_samples = len(y)

    # The parent entropy is the same for every candidate split, so compute
    # it once instead of once per threshold.
    parent_entropy = entropy(y)

    for feature in range(X.shape[1]):
        column = X[:, feature]
        # np.unique returns sorted values. No row can be strictly below the
        # smallest one, so that threshold always gives an empty left side
        # and is skipped without evaluating it.
        for threshold in np.unique(column)[1:]:
            left_mask = column < threshold
            n_left = np.count_nonzero(left_mask)
            n_right = n_samples - n_left
            if n_left == 0 or n_right == 0:
                # Still possible for NaN thresholds, where nothing compares below.
                continue

            left_entropy = entropy(y[left_mask])
            right_entropy = entropy(y[~left_mask])
            # Information gain: parent entropy minus the size-weighted
            # entropies of the two children.
            gain = (
                parent_entropy
                - (n_left / n_samples) * left_entropy
                - (n_right / n_samples) * right_entropy
            )
            if gain > best_gain:
                best_feature, best_threshold, best_gain = feature, threshold, gain

    return best_feature, best_threshold

# Node class for the decision tree
class Node:
    """A single node of the decision tree.

    The node simply stores what it is given: a split description
    (``feature``, ``threshold``, ``left``, ``right``) and/or a ``value``.
    Every field defaults to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: which feature column is compared against which threshold.
        self.feature, self.threshold = feature, threshold
        # Child nodes for the two sides of the split.
        self.left, self.right = left, right
        # Label stored on the node (set by build_tree for leaves).
        self.value = value

# Recursively build the decision tree
def build_tree(X, y, depth=0, max_depth=10):
    """Recursively grow a decision tree and return its root ``Node``.

    A leaf is produced when all labels are identical, when ``depth`` has
    reached ``max_depth``, or when ``best_split`` finds no split with
    positive gain. Mixed leaves store the most frequent label; ties go to
    the smallest label.

    Args:
        X: 2-D array-like with one row per sample and one column per feature.
        y: 1-D array-like of labels aligned with the rows of ``X``.
        depth: Depth of the node being built (0 for the root).
        max_depth: Depth at which growth stops and a leaf is forced.

    Returns:
        A ``Node``: either a leaf with ``value`` set, or an internal node
        with ``feature``, ``threshold``, ``left`` and ``right`` set.

    Raises:
        ValueError: If ``y`` is empty.
    """
    # Accept lists as well as arrays; boolean-mask indexing below needs ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # One pass gives both the purity check and the majority vote.
    labels, counts = np.unique(y, return_counts=True)
    if labels.size == 0:
        raise ValueError("build_tree needs at least one sample; got an empty label array")
    if labels.size == 1:
        return Node(value=labels[0])

    # labels is sorted and argmax returns the first maximum, so ties resolve
    # to the smallest label. Unlike np.bincount this also works for labels
    # that are not non-negative integers.
    majority_label = labels[np.argmax(counts)]

    if depth >= max_depth:
        return Node(value=majority_label)

    feature, threshold = best_split(X, y)
    if feature is None:
        # No split improves on the current node; stop here.
        return Node(value=majority_label)

    left_mask = X[:, feature] < threshold
    right_mask = ~left_mask
    left_subtree = build_tree(X[left_mask], y[left_mask], depth + 1, max_depth)
    right_subtree = build_tree(X[right_mask], y[right_mask], depth + 1, max_depth)

    return Node(feature=feature, threshold=threshold, left=left_subtree, right=right_subtree)


