<a href="https://colab.research.google.com/github/Vidarr22/BML-LCA-2-3-./blob/main/Exp_8_(Decision_Tree).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Experiment 8: Build a Decision Tree model on a suitable dataset.


Breast Cancer Wisconsin Dataset

Built-in implementation (scikit-learn `DecisionTreeClassifier`)

In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import tree
import matplotlib.pyplot as plt

# Load the Breast Cancer Wisconsin data: feature matrix X and target vector y
data = load_breast_cancer()
X, y = data.data, data.target

# Hold out 25% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit a Gini-criterion tree limited to depth 4 (fit() returns the fitted estimator)
dt = DecisionTreeClassifier(
    criterion="gini", max_depth=4, random_state=42
).fit(X_train, y_train)

# Predict labels for the held-out samples
y_pred = dt.predict(X_test)

# Report accuracy, the confusion matrix and the per-class precision/recall/F1 table
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.951048951048951

Confusion Matrix:
 [[51  3]
 [ 4 85]]

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94        54
           1       0.97      0.96      0.96        89

    accuracy                           0.95       143
   macro avg       0.95      0.95      0.95       143
weighted avg       0.95      0.95      0.95       143



Manual implementation (from scratch, using entropy and information gain)

In [3]:

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the Breast Cancer Wisconsin data: feature matrix X and target vector y
data = load_breast_cancer()
X, y = data.data, data.target

# Tabular view of the features with the target appended as a trailing 'label' column
df = pd.DataFrame(X, columns=data.feature_names).assign(label=y)

# Entropy Function
def entropy(y):
    """Return the Shannon entropy (in bits) of the labels in ``y``.

    The class frequencies are obtained with ``np.unique`` and normalised to
    probabilities; the result is ``-sum(p * log2(p))`` over the classes
    that actually occur in ``y``.
    """
    _, class_counts = np.unique(y, return_counts=True)
    class_probs = class_counts / class_counts.sum()
    weighted_logs = class_probs * np.log2(class_probs)
    return -np.sum(weighted_logs)

# Information Gain
def information_gain(y, left_y, right_y):
    """Return the entropy reduction achieved by splitting ``y`` in two.

    The gain is the entropy of ``y`` minus the size-weighted sum of the
    entropies of ``left_y`` and ``right_y``; each weight is the child's
    length divided by ``len(y)``.
    """
    parent_entropy = entropy(y)
    total = len(y)
    left_term = (len(left_y) / total) * entropy(left_y)
    right_term = (len(right_y) / total) * entropy(right_y)
    return parent_entropy - (left_term + right_term)

# Best Split Function

def best_split(X, y):
    """Exhaustively search for the split with the highest information gain.

    Every unique value of every column of ``X`` is tried as a threshold;
    samples with ``value <= threshold`` go left, the rest go right.
    Candidates that leave either side empty are ignored.

    Returns a ``(feature_index, threshold, gain)`` tuple. When no valid
    candidate exists the tuple is ``(None, None, -1)``.
    """
    top_feature, top_threshold, top_gain = None, None, -1

    for col_idx in range(X.shape[1]):
        column = X[:, col_idx]
        for candidate in np.unique(column):
            left_labels = y[column <= candidate]
            right_labels = y[column > candidate]

            # Only score candidates that put at least one sample on each side
            if len(left_labels) != 0 and len(right_labels) != 0:
                score = information_gain(y, left_labels, right_labels)
                # Strict comparison: the first candidate reaching a given gain wins
                if score > top_gain:
                    top_feature, top_threshold, top_gain = col_idx, candidate, score

    return top_feature, top_threshold, top_gain

# Tree Node Class
class Node:
    """A single node of the decision tree.

    All constructor arguments default to ``None`` and are stored unchanged
    as attributes of the same name: ``feature``, ``threshold``, ``left``,
    ``right`` and ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule: which feature is tested and against which threshold
        self.feature, self.threshold = feature, threshold
        # Child nodes for the two outcomes of the test
        self.left, self.right = left, right
        # Prediction stored at the node
        self.value = value

# Build Tree
def build_tree(X, y, depth=0, max_depth=3):
    """Recursively grow a decision tree and return its root ``Node``.

    Recursion stops with a leaf when:
      * every label in ``y`` is identical (the leaf stores that label),
      * ``depth`` has reached ``max_depth`` (the leaf stores the majority label), or
      * ``best_split`` finds no valid split (the leaf stores the majority label).

    Otherwise the samples are partitioned on ``value <= threshold`` and the
    left subtree is built before the right one, each one level deeper.
    The majority label is computed with ``np.bincount``.
    """

    def majority_leaf():
        # Leaf predicting the most frequent label among the current samples
        return Node(value=np.argmax(np.bincount(y)))

    # Pure node: nothing left to separate
    if len(np.unique(y)) == 1:
        return Node(value=y[0])

    # Depth budget exhausted
    if depth >= max_depth:
        return majority_leaf()

    feature, threshold, gain = best_split(X, y)

    # best_split reports gain == -1 when it found no usable candidate
    if gain == -1:
        return majority_leaf()

    goes_left = X[:, feature] <= threshold
    goes_right = X[:, feature] > threshold

    left_child = build_tree(X[goes_left], y[goes_left], depth + 1, max_depth)
    right_child = build_tree(X[goes_right], y[goes_right], depth + 1, max_depth)

    return Node(feature, threshold, left_child, right_child)


# Predict one sample
def predict_one(node, x):
    """Walk the tree from ``node`` and return the prediction for sample ``x``.

    A node whose ``value`` is not ``None`` ends the walk and its ``value`` is
    returned. Otherwise the walk moves to ``left`` when
    ``x[feature] <= threshold`` and to ``right`` when it is not.
    """
    current = node
    while True:
        if current.value is not None:
            return current.value
        take_left = x[current.feature] <= current.threshold
        current = current.left if take_left else current.right


# Predict
def predict(tree, X):
    """Return an array holding the prediction of ``tree`` for each row of ``X``."""
    labels = []
    for sample in X:
        labels.append(predict_one(tree, sample))
    return np.array(labels)

# Hold out 25% of the samples for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Grow the from-scratch tree on the training split, limited to depth 3
manual_tree = build_tree(X_train, y_train, max_depth=3)

# Predict labels for the held-out samples
y_pred_manual = predict(manual_tree, X_test)

# Accuracy = fraction of held-out samples whose prediction matches the true label
accuracy_manual = (y_pred_manual == y_test).mean()

print("Manual Decision Tree Accuracy:", accuracy_manual)

Manual Decision Tree Accuracy: 0.951048951048951
