In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.datasets import make_classification

In [22]:
# Генерация синтетических данных
np.random.seed(42)
X, y = make_classification(n_samples=3000, n_features=20, random_state=42)

# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
class DecisionTree():
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def fit(self, X, y):
        self.X = X
        self.y = y
        self.n_classes_ = len(np.unique(y))
        self.n_samples, self.n_features = X.shape
        self.tree_ = self._grow_tree(X, y, depth=0)

    def predict(self, X):
        return np.array([self._predict(inputs) for inputs in X])

    def _predict(self, inputs):
        node = self.tree_
        while not node.is_leaf:
            if inputs[node.feature_index] < node.threshold:
                node = node.left_child
            else:
                node = node.right_child
        return node.value

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (n_labels == 1) or (n_samples < self.min_samples_split) or \
           (self.max_depth is not None and depth >= self.max_depth):
            leaf_value = self._most_common_label(y)
            return DecisionTreeNode(value=leaf_value)

        feature_indices = np.arange(n_features)
        feature_index, threshold = self._best_split(X, y, feature_indices)

        if feature_index is None:
            leaf_value = self._most_common_label(y)
            return DecisionTreeNode(value=leaf_value)

        left_indices = X[:, feature_index] < threshold
        right_indices = ~left_indices
        left_child = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._grow_tree(X[right_indices], y[right_indices], depth + 1)
        return DecisionTreeNode(feature_index=feature_index, threshold=threshold,
                                left_child=left_child, right_child=right_child)

    def _best_split(self, X, y, feature_indices):
        m, n = X.shape
        if m <= 1:
            return None, None

        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None

        for idx in feature_indices:
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1
                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )
                gini = (i * gini_left + (m - i) * gini_right) / m
                if thresholds[i] == thresholds[i - 1]:
                    continue
                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2
        return best_idx, best_thr

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

class DecisionTreeNode:
    def __init__(self, feature_index=None, threshold=None,
                 left_child=None, right_child=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left_child = left_child
        self.right_child = right_child
        self.value = value

    @property
    def is_leaf(self):
        return self.value is not None
    

def print_tree(node, depth=0):
    indent = "  " * depth
    if node.is_leaf:
        print(indent + f"Class: {node.value}")
    else:
        print(indent + f"Feature {node.feature_index} <= {node.threshold}")
        print_tree(node.left_child, depth + 1)
        print_tree(node.right_child, depth + 1)

In [24]:
def gradient_boosting(X_train, y_train, X_test, n_estimators, learning_rate, max_depth):
    # Инициализируем список для хранения базовых моделей
    base_models = []
    
    # Инициализируем список для хранения весов базовых моделей
    model_weights = []
    
    # Инициализируем предсказания для обучающей и тестовой выборок
    train_predictions = np.zeros(len(X_train))
    test_predictions = np.zeros(len(X_test))
    
    for i in range(n_estimators):
        # Создаем и обучаем базовую модель (DecisionTree)
        base_model = DecisionTree(max_depth=max_depth)
        base_model.fit(X_train, y_train)
        
        # Вычисляем ошибку (разницу между реальными метками и предсказаниями)
        errors = y_train - train_predictions
        
        # Вычисляем вес базовой модели как learning_rate умноженное на ошибку
        model_weight = learning_rate * errors.mean()
        
        # Обновляем предсказания для обучающей и тестовой выборок
        train_predictions += model_weight * base_model.predict(X_train)
        test_predictions += model_weight * base_model.predict(X_test)
        
        # Добавляем базовую модель и ее вес в списки
        base_models.append(base_model)
        model_weights.append(model_weight)
    
    # Вычисляем финальные предсказания модели
    final_predictions = np.sign(test_predictions)
    
    return base_models, model_weights, final_predictions

n_estimators = 10
learning_rate = 0.1
max_depth = 2

base_models, model_weights, final_predictions = gradient_boosting(X_train, y_train, X_test, n_estimators, learning_rate, max_depth)

In [25]:
# вычисляем точность (accuracy) модели
accuracy = accuracy_score(y_test, final_predictions)
# вычисляем точность (precision)
precision = precision_score(y_test, final_predictions)
# вычисляем полноту (recall)
recall = recall_score(y_test, final_predictions)
# вычисляем F1-меру
f1 = f1_score(y_test, final_predictions)

print("Accuracy:", accuracy)
print("Точность (Precision):", precision)
print("Полнота (Recall):", recall)
print("F1-мера:", f1)

print("\nmodel_weights:\n", model_weights)

for i,m in enumerate(base_models):
    print("\n")
    print(f"Structure of Decision Tree {i + 1}:")
    print_tree(m.tree_)

Accuracy: 0.9666666666666667
Точность (Precision): 0.9824561403508771
Полнота (Recall): 0.9491525423728814
F1-мера: 0.9655172413793103

model_weights:
 [0.050333333333333334, 0.04781666666666667, 0.04542583333333333, 0.04315454166666666, 0.04099681458333333, 0.03894697385416667, 0.036999625161458334, 0.035149643903385416, 0.03339216170821614, 0.03172255362280534]


Structure of Decision Tree 1:
Feature 19 <= 0.2086028319948805
  Feature 19 <= -0.04151514966828276
    Class: 0
    Class: 0
  Feature 19 <= 0.31243929964520023
    Class: 0
    Class: 1


Structure of Decision Tree 2:
Feature 19 <= 0.2086028319948805
  Feature 19 <= -0.04151514966828276
    Class: 0
    Class: 0
  Feature 19 <= 0.31243929964520023
    Class: 0
    Class: 1


Structure of Decision Tree 3:
Feature 19 <= 0.2086028319948805
  Feature 19 <= -0.04151514966828276
    Class: 0
    Class: 0
  Feature 19 <= 0.31243929964520023
    Class: 0
    Class: 1


Structure of Decision Tree 4:
Feature 19 <= 0.2086028319948805