In [1]:
from sklearn.model_selection import train_test_split
from sklearn import datasets
import numpy as np

In [2]:
class Node:
    
    def __init__(self, index, t, true_branch, false_branch):
        self.index = index
        self.t = t  # порог
        self.true_branch = true_branch
        self.false_branch = false_branch
        

class Leaf:
    
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels
        self.prediction_classification = self.predict()
        self.prediction_regression = self.predict_reg()
        
    def predict(self):
        classes = {}
        for label in self.labels:
            if label not in classes:
                classes[label] = 0
            classes[label] += 1
        # класс, количество объектов которого максимальное в листе
        prediction = max(classes, key=classes.get)
        return prediction  

    def predict_reg(self):
        prediction = np.mean(self.labels)
        return prediction
    
# Расчет критерия Джини
def gini(labels):
    classes = {}
    for label in labels:
        if label not in classes:
            classes[label] = 0
        classes[label] += 1

    impurity = 1
    for label in classes:
        p = classes[label] / len(labels)
        impurity -= p ** 2
        
    return impurity

# Расчет качества для задачи классификации
def quality(left_labels, right_labels, current_gini):

    # доля выбоки в левое поддерево
    p = float(left_labels.shape[0]) / (left_labels.shape[0] + right_labels.shape[0])
    
    return current_gini - p * gini(left_labels) - (1 - p) * gini(right_labels)

# Разбиение в узле
def split(data, labels, index, t):
    
    left = np.where(data[:, index] <= t)
    right = np.where(data[:, index] > t)
        
    true_data = data[left]
    false_data = data[right]
    true_labels = labels[left]
    false_labels = labels[right]
        
    return true_data, false_data, true_labels, false_labels

# Нахождение наилучшего разбиения
def find_best_split(data, labels):

    min_leaf = 5

    current_gini = gini(labels)

    best_quality = 0
    best_t = None
    best_index = None
    
    n_features = data.shape[1]
    
    for index in range(n_features):
        t_values = np.unique([row[index] for row in data])
        
        for t in t_values:
            true_data, false_data, true_labels, false_labels = split(data, labels, index, t)
            if len(true_data) < min_leaf or len(false_data) < min_leaf:
                continue
            
            current_quality = quality(true_labels, false_labels, current_gini)

            if current_quality > best_quality:
                best_quality, best_t, best_index = current_quality, t, index

    return best_quality, best_t, best_index

# Построение дерева классификации
def build_tree_classification(data, labels, tree_depth=1, max_depth=50):

    quality, t, index = find_best_split(data, labels)

    if quality == 0:
        return Leaf(data, labels)

    if tree_depth >= max_depth:
        return Leaf(data, labels)

    tree_depth += 1

    true_data, false_data, true_labels, false_labels = split(data, labels, index, t)

    true_branch = build_tree_classification(true_data, true_labels, tree_depth, max_depth)
    false_branch = build_tree_classification(false_data, false_labels, tree_depth, max_depth)

    return Node(index, t, true_branch, false_branch)

def classify_object(obj, node):

    if isinstance(node, Leaf):
        answer = node.prediction_classification
        return answer

    if obj[node.index] <= node.t:
        return classify_object(obj, node.true_branch)
    else:
        return classify_object(obj, node.false_branch)
    
def predict_class(data, tree):
    
    classes = []
    for obj in data:
        prediction = classify_object(obj, tree)
        classes.append(prediction)
    return classes

def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y)


In [3]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:

custom_tree = build_tree_classification(X_train, y_train, max_depth=3)

y_train_pred = predict_class(X_train, custom_tree)
y_test_pred = predict_class(X_test, custom_tree)

print(f'Accuracy score (original train data): {np.round(accuracy(y_train, y_train_pred), 4)}')
print(f'Accuracy score (original test data): {np.round(accuracy(y_test, y_test_pred), 4)}')

Accuracy score (original train data): 0.7733
Accuracy score (original test data): 0.1867
