In [1]:
import numpy as np
from collections import Counter
from sklearn.datasets import (load_iris, load_breast_cancer, load_wine, load_digits, load_diabetes, fetch_covtype, fetch_kddcup99, make_classification)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [2]:
class DecisionNode:
    """A single node of a binary decision tree.

    An internal node carries a split rule (``feature_idx``, ``threshold``) and
    two children (``left``, ``right``). A leaf carries only ``value``, the
    label it predicts. A node counts as a leaf whenever ``value`` is set.
    """

    def __init__(self, feature_idx=None, threshold=None, left=None, right=None, value=None):
        # Split rule (internal nodes only).
        self.feature_idx, self.threshold = feature_idx, threshold
        # Child subtrees (internal nodes only).
        self.left, self.right = left, right
        # Predicted label (leaves only).
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a prediction instead of a split."""
        return not (self.value is None)

In [3]:
class DecisionTreeClassifier:
    """Binary-split decision tree classifier grown greedily by information gain.

    Every internal node tests ``x[feature_idx] <= threshold``; samples that
    satisfy the test go left, all others go right. Leaves store the most
    common label of the training samples that reached them.

    Parameters
    ----------
    max_depth : int or None, default=100
        Maximum depth of the tree. ``None`` disables the depth limit.
    min_samples_split : int, default=2
        A node holding fewer samples than this is turned into a leaf.

    Notes
    -----
    Labels must be non-negative integers because entropy is computed with
    ``np.bincount``.
    """

    def __init__(self, max_depth=100, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None  # set by fit()

    def _entropy(self, y):
        """Return the Shannon entropy (in bits) of the label array ``y``."""
        hist = np.bincount(y)
        ps = hist / len(y)
        # Classes with zero count are skipped so log2(0) is never evaluated.
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _information_gain(self, X, y, feature_idx, threshold):
        """Return the entropy reduction obtained by splitting on a threshold.

        Returns 0 when the split would leave one side empty.
        """
        left_mask = X[:, feature_idx] <= threshold
        # Everything that fails the `<=` test goes right, exactly as in
        # _traverse_tree, so training and prediction route samples the same way.
        right_mask = ~left_mask

        n = len(y)
        n_left, n_right = np.sum(left_mask), np.sum(right_mask)

        if n_left == 0 or n_right == 0:
            return 0

        parent_entropy = self._entropy(y)
        e_left = self._entropy(y[left_mask])
        e_right = self._entropy(y[right_mask])

        # Children are weighted by the fraction of samples they receive.
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
        return parent_entropy - child_entropy

    def _best_split(self, X, y):
        """Find the (feature_idx, threshold) pair with the highest gain.

        Returns ``(None, None)`` when no threshold separates the samples into
        two non-empty groups (for example when every feature is constant).
        """
        best_gain = -1
        best_feature_idx, best_threshold = None, None

        for feature_idx in range(X.shape[1]):
            # np.unique returns sorted values. The largest one is dropped
            # because `<= max` sends every sample left and leaves the right
            # child empty, which is not a usable split.
            thresholds = np.unique(X[:, feature_idx])[:-1]
            for threshold in thresholds:
                gain = self._information_gain(X, y, feature_idx, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature_idx = feature_idx
                    best_threshold = threshold

        return best_feature_idx, best_threshold

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X`` / ``y``."""
        n_samples = X.shape[0]
        n_classes = len(np.unique(y))

        # Stopping criteria: depth limit reached, pure node, or too few samples.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if (depth_exhausted or n_classes <= 1
                or n_samples < self.min_samples_split):
            return DecisionNode(value=self._most_common_label(y))

        feature_idx, threshold = self._best_split(X, y)
        if feature_idx is None:
            # No feature can separate these samples; predict the majority label.
            return DecisionNode(value=self._most_common_label(y))

        left_mask = X[:, feature_idx] <= threshold
        right_mask = ~left_mask
        if not left_mask.any() or not right_mask.any():
            # Defensive: never recurse into an empty partition.
            return DecisionNode(value=self._most_common_label(y))

        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return DecisionNode(feature_idx, threshold, left, right)

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (``y`` must be non-empty)."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def fit(self, X, y):
        """Build the decision tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,), non-negative integer labels

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or disagrees with ``y`` in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        if X.shape[0] != len(y):
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {len(y)}"
            )
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")

        self.root = self._build_tree(X, y)
        return self

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf and return its label for sample ``x``."""
        if node.is_leaf():
            return node.value

        if x[node.feature_idx] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def predict(self, X):
        """Predict a class label for every row of ``X``; returns a NumPy array."""
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

In [4]:
def test_on_dataset(dataset_name, max_depth=None, test_size=0.2):
    """Train and evaluate the custom decision tree on a named dataset.

    The dataset is loaded by name, reduced to 10,000 samples when larger,
    split into train/test parts, and the resulting accuracy plus a few sample
    predictions are printed.

    Parameters
    ----------
    dataset_name : str
        One of "iris", "breast_cancer", "wine", "digits", "diabetes",
        "forest_cover", "network_intrusion" or "synthetic".
    max_depth : int or None
        Tree depth limit. When None, 5 is used for datasets with more than
        10 features and 3 otherwise.
    test_size : float
        Passed through to ``train_test_split`` as the test fraction.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not one of the known names.
    """
    separator = '=' * 50
    print(f"\n{separator}")
    print(f"Testing on {dataset_name} dataset")

    def _load_diabetes_binned():
        # Regression target turned into classes via fixed bin edges.
        bunch = load_diabetes()
        bunch.target = np.digitize(bunch.target, bins=[0, 100, 200, np.inf])
        return bunch

    def _load_intrusion_binary():
        # Collapsed to a binary problem: 1 for 'normal.' records, 0 otherwise.
        bunch = fetch_kddcup99()
        bunch.target = (bunch.target == b'normal.').astype(int)
        return bunch

    def _make_synthetic():
        features, labels = make_classification(
            n_samples=1000, n_features=10,
            n_classes=3, n_clusters_per_class=1,
            random_state=42
        )
        # Anonymous object exposing the same attributes as the sklearn loaders.
        return type('', (), {'data': features, 'target': labels, 'feature_names': None})()

    loaders = {
        "iris": load_iris,
        "breast_cancer": load_breast_cancer,
        "wine": load_wine,
        "digits": load_digits,
        "diabetes": _load_diabetes_binned,
        "forest_cover": fetch_covtype,
        "network_intrusion": _load_intrusion_binary,
        "synthetic": _make_synthetic,
    }
    if dataset_name not in loaders:
        raise ValueError(f"Unknown dataset: {dataset_name}")
    data = loaders[dataset_name]()

    X, y = data.data, data.target

    # Large datasets are cut down to keep the pure-Python tree tractable.
    if len(y) > 10000:
        X, _, y, _ = train_test_split(X, y, train_size=10000, random_state=42)
        print("Note: Using subset of 10,000 samples for faster execution")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    n_features = X.shape[1]
    if max_depth is None:
        # Wider datasets get a slightly deeper default tree.
        max_depth = 5 if n_features > 10 else 3

    print(f"\nTraining Decision Tree (max_depth={max_depth})...")
    clf = DecisionTreeClassifier(max_depth=max_depth)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print("\nResults:")
    print(f"- Accuracy: {accuracy:.4f}")
    print(f"- Tree depth used: {max_depth}")
    print(f"- Dataset shape: {X.shape}")
    print(f"- Features: {n_features}")
    print(f"- Classes: {len(np.unique(y))}")

    # Show at most the first five test samples.
    print("\nSample predictions (True -> Predicted):")
    shown = min(5, len(X_test))
    for actual, predicted in zip(y_test[:shown], y_pred[:shown]):
        print(f"{actual} -> {predicted}")

In [5]:
def test_on_csv(filepath, target_column, max_depth=3):
    """Test on custom CSV dataset"""
    print(f"\n{'='*50}")
    print(f"Testing on custom dataset from {filepath}")
    
    try:
        # Load CSV
        df = pd.read_csv(filepath)
        
        # Separate features and target
        if target_column not in df.columns:
            raise ValueError(f"Target column '{target_column}' not found in CSV")
            
        X = df.drop(columns=[target_column]).values
        y = df[target_column].values
        
        # Encode labels if needed
        if y.dtype == object:
            le = LabelEncoder()
            y = le.fit_transform(y)
            print("Note: Encoded string labels to numerical values")
        
        # Split into train and test
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )
        
        # Train our decision tree
        print(f"\nTraining Decision Tree (max_depth={max_depth})...")
        clf = DecisionTreeClassifier(max_depth=max_depth)
        clf.fit(X_train, y_train)
        
        # Make predictions
        y_pred = clf.predict(X_test)
         # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nResults:")
        print(f"- Accuracy: {accuracy:.4f}")
        print(f"- Dataset shape: {X.shape}")
        print(f"- Features: {X.shape[1]}")
        print(f"- Classes: {len(np.unique(y))}")
        
        # Print some example predictions
        print("\nSample predictions (True -> Predicted):")
        for i in range(min(5, len(X_test))):
            print(f"{y_test[i]} -> {y_pred[i]}")
    
    except Exception as e:
        print(f"\nError processing CSV file: {str(e)}")
        print("Make sure the file exists and is properly formatted")

In [6]:
def main():
    """Run the decision tree against every built-in benchmark dataset."""
    print("Decision Tree Classifier Implementation")
    print("=" * 50)

    # (dataset name, keyword overrides) in the order they are evaluated.
    # An empty dict means test_on_dataset picks its default depth.
    benchmark_runs = (
        ("iris", {}),
        ("breast_cancer", {"max_depth": 4}),
        ("wine", {}),
        ("digits", {"max_depth": 10}),
        ("diabetes", {}),
        ("forest_cover", {"max_depth": 5}),
        ("network_intrusion", {"max_depth": 5}),
        ("synthetic", {}),
    )
    for dataset_name, overrides in benchmark_runs:
        test_on_dataset(dataset_name, **overrides)

    # To evaluate your own CSV file, add a call such as:
    # test_on_csv("path/to/your_data.csv", target_column="column_name")

# Run the full benchmark suite when this code executes as the top-level
# module; the guard keeps main() from running if the code is imported.
if __name__ == "__main__":
    main()

Decision Tree Classifier Implementation

Testing on iris dataset

Training Decision Tree (max_depth=3)...

Results:
- Accuracy: 0.9667
- Tree depth used: 3
- Dataset shape: (150, 4)
- Features: 4
- Classes: 3

Sample predictions (True -> Predicted):
1 -> 1
0 -> 0
2 -> 2
1 -> 1
1 -> 2

Testing on breast_cancer dataset

Training Decision Tree (max_depth=4)...

Results:
- Accuracy: 0.9386
- Tree depth used: 4
- Dataset shape: (569, 30)
- Features: 30
- Classes: 2

Sample predictions (True -> Predicted):
1 -> 1
0 -> 0
0 -> 0
1 -> 1
1 -> 1

Testing on wine dataset

Training Decision Tree (max_depth=5)...

Results:
- Accuracy: 0.9167
- Tree depth used: 5
- Dataset shape: (178, 13)
- Features: 13
- Classes: 3

Sample predictions (True -> Predicted):
0 -> 0
0 -> 0
2 -> 1
0 -> 0
1 -> 1

Testing on digits dataset

Training Decision Tree (max_depth=10)...

Results:
- Accuracy: 0.8833
- Tree depth used: 10
- Dataset shape: (1797, 64)
- Features: 64
- Classes: 10

Sample predictions (True -> Predic