<a href="https://colab.research.google.com/github/Polinysha/-/blob/main/%D0%B4%D0%B5%D1%80%D0%B5%D0%B2%D1%8C%D1%8F_%D0%B8_%D0%B1%D1%83%D1%81%D1%82%D0%B8%D0%BD%D0%B3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from operator import itemgetter

In [None]:
VERY_BIG = 1e90

In [None]:
def split_criterion(x, x_treshold, target, type = 'entropy'):
  assert len(x) == len(target)
  if type not in ['MSE', 'entropy', 'gini_impurity']:
    return 'incorrect criterion type'
  x = pd.Series(x)
  target = pd.Series(target, index = x.index)
  target_left, target_right = target[x < x_treshold], target[x >= x_treshold]
  if (not len(target_left)) or (not len(target_left)):
    return VERY_BIG
  if type == 'MSE':
    x_left, x_right = x[x < x_treshold], x[x >= x_treshold]
    prediction_left, prediction_right = np.mean(x_left), np.mean(x_right)
    mse_left, mse_right = sum((target_left - prediction_left) ** 2), sum((target_right - prediction_right) ** 2)
    return (len(target_left) * mse_left + len(target_right) * mse_right) / len(target)

  p_left, p_right = np.mean(target_left), np.mean(target_right)
  if type == 'gini_impurity':
    return 1 - (p_left ** 2 + p_right ** 2)
  if type == 'entropy':
    if (not p_left) or (not p_right):
      return VERY_BIG
    else:
      return -p_left * np.log2(p_left) -p_right * np.log2(p_right)

In [None]:
feature = [5, 6, 7, 8, 9, 10, 2]
target = [1, 0, 1, 0, 1, 1, 1]
t = []
for feature_treshold in feature[0:-1]:
  t.append([feature_treshold, split_criterion(feature, feature_treshold, target, type = 'entropy')])
t = sorted(t, reverse = True, key = itemgetter(1))
print(tabulate(t, headers = ['Порог', 'Значение критерия']))

  Порог    Значение критерия
-------  -------------------
      7             0.701253
      8             0.701253
      6             0.442179
      9             0.442179
      5             0.389975
     10             0.389975


In [None]:
feature = pd.Series(feature)
target = pd.Series(target, index = feature.index)
print(np.mean(target[feature < 5]), np.mean(target[feature >= 5]))
print(np.mean(target[feature < 10]), np.mean(target[feature >= 10]))

1.0 0.6666666666666666
0.6666666666666666 1.0


In [None]:
from sklearn.datasets import make_classification

In [None]:
random_state = 88
x, y = make_classification(5_000, 10, random_state = random_state)

In [None]:
%%time
best_splits = []
for idx_feature, feature in enumerate(x.T):
  t = []
  for feature_treshold in feature[0:-1]:
    t.append([feature_treshold, split_criterion(feature, feature_treshold, y, type = 'entropy')])
  best_split_index = np.nanargmin(np.array(t)[:, 1], axis = 0)
  best_splits.append([idx_feature, t[best_split_index][0], t[best_split_index][1]])
best_splits = sorted(best_splits, reverse = True, key = itemgetter(1))
print(tabulate(best_splits, headers = ['Feature', 'Порог', 'Значение критерия']))
print('the winner is')
best_of_best_split = np.nanargmin(np.array(best_splits)[:,2], axis = 0)
print(tabulate([best_splits[best_of_best_split]], headers = ['Feature', 'Порог', 'Значение критерия']))

  Feature     Порог    Значение критерия
---------  --------  -------------------
        1   4.34112             0.499956
        9   4.18103             0.499956
        6   3.51631             0.499956
        2   3.46727             0.499956
        5   3.24306             0.964164
        4   2.29289             0.499956
        7  -1.10045             0.468756
        8  -3.53499             0.499956
        3  -3.54405             0.499956
        0  -3.85811             0.499956
the winner is
  Feature     Порог    Значение критерия
---------  --------  -------------------
        7  -1.10045             0.468756
CPU times: user 34.3 s, sys: 25 ms, total: 34.3 s
Wall time: 34.7 s


In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from operator import itemgetter
from tabulate import tabulate

random_state = 88
x, y = make_classification(n_samples=5000, n_features=10, random_state=random_state)

def split_criterion(feature, threshold, target, type='entropy'):
    left_mask = feature < threshold
    right_mask = feature >= threshold
    if type == 'entropy':
        def entropy(y_sub):
            probs = np.bincount(y_sub) / len(y_sub)
            probs = probs[probs > 0]
            return -np.sum(probs * np.log2(probs))
        return (entropy(target[left_mask]) * np.sum(left_mask)/len(target) +
                entropy(target[right_mask]) * np.sum(right_mask)/len(target))
    else:
        raise ValueError("Unsupported type")

n_bins = 128
best_splits = []

for idx_feature, feature in enumerate(x.T):
    t = []
    feature_binned, bins = pd.qcut(feature, q=n_bins, retbins=True, duplicates='drop')

    for feature_treshold in bins[1:-1]:
        t.append([feature_treshold, split_criterion(feature, feature_treshold, y, type='entropy')])
    best_split_index = np.nanargmin(np.array(t)[:,1])
    best_splits.append([idx_feature, t[best_split_index][0], t[best_split_index][1]])

best_splits = sorted(best_splits, reverse=True, key=itemgetter(1))
print(tabulate(best_splits, headers=['Feature', 'Порог', 'Значение критерия']))

best_of_best_split = np.nanargmin(np.array(best_splits)[:,2])
print('the winner is')
print(tabulate([best_splits[best_of_best_split]], headers=['Feature', 'Порог', 'Значение критерия']))


  Feature       Порог    Значение критерия
---------  ----------  -------------------
        1   2.3817                0.99547
        6   2.37214               0.999767
        5   0.556681              0.999118
        8   0.481077              0.999311
        9   0.24978               0.999606
        7  -0.0214578             0.515004
        0  -0.104406              0.650301
        3  -0.28008               0.998993
        4  -1.25302               0.995699
        2  -2.42812               0.999467
the winner is
  Feature       Порог    Значение критерия
---------  ----------  -------------------
        7  -0.0214578             0.515004


In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

class simple_gradient_boosting_regressor():
    def __init__(self, n_estimators=20, max_depth=5, learning_rate=0.2, random_state=55):
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.n_estimators = n_estimators
        self.trees = []

    def fit(self, x, y, verbose=True):
        preds = np.full(y.shape, np.mean(y))

        for idx in range(self.n_estimators):
            residuals = y - preds

            current_tree = DecisionTreeRegressor(max_depth=self.max_depth, random_state=self.random_state)
            current_tree.fit(x, residuals)

            self.trees.append(current_tree)

            preds += self.learning_rate * current_tree.predict(x)

            if verbose:
                mse = mean_squared_error(y, preds)
                print(f'MSE after {idx+1} tree is {mse:.3f}')

    def predict(self, x):
        preds = np.zeros(x.shape[0])
        for tree in self.trees:
            preds += self.learning_rate * tree.predict(x)
        return preds

random_state = 55
x, y = make_regression(n_samples=1000, n_features=5, noise=10.0, random_state=random_state)

gbr = simple_gradient_boosting_regressor()
gbr.fit(x, y)

preds = gbr.predict(x)
mse_final = mean_squared_error(y, preds)
print(f"Final MSE: {mse_final:.3f}")


MSE after 1 tree is 11231.569
MSE after 2 tree is 7905.176
MSE after 3 tree is 5671.326
MSE after 4 tree is 4103.798
MSE after 5 tree is 3015.955
MSE after 6 tree is 2236.107
MSE after 7 tree is 1696.025
MSE after 8 tree is 1293.273
MSE after 9 tree is 1000.082
MSE after 10 tree is 786.090
MSE after 11 tree is 633.393
MSE after 12 tree is 511.637
MSE after 13 tree is 421.296
MSE after 14 tree is 353.368
MSE after 15 tree is 297.650
MSE after 16 tree is 254.148
MSE after 17 tree is 221.110
MSE after 18 tree is 193.798
MSE after 19 tree is 174.570
MSE after 20 tree is 159.151
Final MSE: 167.269


In [None]:
random_state = 55
x, y = make_regression(1000, 5, random_state = random_state)

In [None]:
gbr = simple_gradient_boosting_regressor()
gbr.fit(x, y)

MSE after 1 tree is 11217.737
MSE after 2 tree is 7910.404
MSE after 3 tree is 5650.280
MSE after 4 tree is 4118.589
MSE after 5 tree is 3032.511
MSE after 6 tree is 2233.586
MSE after 7 tree is 1676.377
MSE after 8 tree is 1283.969
MSE after 9 tree is 984.848
MSE after 10 tree is 774.297
MSE after 11 tree is 603.629
MSE after 12 tree is 483.340
MSE after 13 tree is 392.615
MSE after 14 tree is 321.182
MSE after 15 tree is 267.361
MSE after 16 tree is 226.771
MSE after 17 tree is 193.463
MSE after 18 tree is 169.164
MSE after 19 tree is 149.315
MSE after 20 tree is 132.892
