In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split 

# Load the Iris dataset once; the same object is reused below for the split.
iris_dataset = load_iris()

# Hold out 20% of the samples for testing (test_size=0.2).
# The targets are reshaped to a single column, so y_train / y_test are 2-D.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target.reshape(-1, 1),
    random_state=0,
    test_size=0.2,
)

# Report the shapes of all four arrays as a quick sanity check on the split.
print(
    f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape},\n'
    f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}'
)

X_train shape: (120, 4), y_train shape: (120, 1),
X_test shape: (30, 4), y_test shape: (30, 1)


In [3]:
# Tabular view of the Iris data: one column per feature (labelled with
# feature_names) plus a 'type' column holding the target value of each row.
df = pd.DataFrame(
    data=iris_dataset.data,
    columns=iris_dataset.feature_names,
).assign(type=iris_dataset.target)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),type
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


- Entropy (Энтропия)\
$E = -\sum_{i} p_i \cdot \log_{2}{p_i}$, где $p_i =$ *probability of class i*

- Gini Index (Индекс Джини)\
$G = 1 - \sum_{i} p_i^2$

- MSE (квадратичная ошибка)\
$\text{squared error} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \widetilde{y}_i)^2$

- MAE (абсолютная ошибка)\
$\text{absolute error} = \frac{1}{n}\sum_{i=1}^{n}|y_i - \widetilde{y}_i|$

- Information Gain (Прирост информации)\
$IG = E(\text{parent}) - \sum_{i} w_i \cdot E(\text{child}_i)$

In [4]:
# Execute the CART module via IPython's %run magic (-m selects a module by name).
# NOTE(review): presumably this is what brings ClassificationAndRegressionTrees
# into the notebook namespace for the cells below — confirm against the CART module.
%run -m CART

In [5]:
# Fit a tree on the Iris training split using the entropy criterion
# (max_depth=3, min_samples_split=3), then print the learned splits.
classifier = ClassificationAndRegressionTrees(
    criterion="entropy",
    max_depth=3,
    min_samples_split=3,
)
classifier.fit(X_train, y_train)
classifier.print_tree()
# NOTE(review): a feature_names=list(iris_dataset.feature_names) argument was
# left disabled here — confirm whether print_tree accepts it.

X_2 <= 1.700
|   left: 0.0000
|   right: X_3 <= 1.700
|   |   left: X_2 <= 4.900
|   |   |   left: X_3 <= 1.600
|   |   |   |   left: 1.0000
|   |   |   |   right: 2.0000
|   |   |   right: X_3 <= 1.500
|   |   |   |   left: 2.0000
|   |   |   |   right: 1.0000
|   |   right: X_2 <= 4.800
|   |   |   left: 2.0000
|   |   |   right: 2.0000


In [6]:
from sklearn.metrics import accuracy_score

# Predict on the held-out samples and display the accuracy against y_test
# as the cell's output.
y_pred = classifier.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9333333333333333

In [7]:
from sklearn.datasets import make_regression
# Synthetic regression problem: 1000 samples with 20 features.
# random_state is pinned so the generated data — and therefore the tree fitted
# on it and every number printed from it — is identical on each fresh run.
x, y = make_regression(n_samples=1000, n_features=20, random_state=0)


In [8]:
# Split the synthetic regression data 80/20 with a fixed seed. The target is
# reshaped to a single column so y_train / y_test are 2-D.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the Iris split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y.reshape(-1, 1),
    test_size=0.2,
    random_state=0,
)

# The two f-strings are joined by adjacent-literal concatenation. Passing them
# as separate print() arguments would insert the default separator (a space)
# right after the newline and indent the second line of the report.
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape},\n'
      f'X_test shape: {X_test.shape}, y_test shape: {y_test.shape}')

X_train shape: (800, 20), y_train shape: (800, 1),
 X_test shape: (200, 20), y_test shape: (200, 1)


In [9]:
# Fit a tree on the regression training split using the squared-error
# criterion (max_depth=3, min_samples_split=3), then print the learned splits.
regressor = ClassificationAndRegressionTrees(
    criterion="squared_error",
    max_depth=3,
    min_samples_split=3,
)
regressor.fit(X_train, y_train)
regressor.print_tree()

X_3 <= -0.330
|   left: X_18 <= -0.017
|   |   left: X_15 <= -0.813
|   |   |   left: X_3 <= -1.254
|   |   |   |   left: -422.4918
|   |   |   |   right: -241.0743
|   |   |   right: X_0 <= -0.636
|   |   |   |   left: -232.3643
|   |   |   |   right: -101.2620
|   |   right: X_0 <= -0.371
|   |   |   left: X_12 <= 1.109
|   |   |   |   left: -157.8232
|   |   |   |   right: 52.7832
|   |   |   right: X_18 <= 1.154
|   |   |   |   left: -15.5741
|   |   |   |   right: 175.5184
|   right: X_18 <= -0.118
|   |   left: X_0 <= -0.711
|   |   |   left: X_15 <= 1.101
|   |   |   |   left: -166.1384
|   |   |   |   right: 21.5687
|   |   |   right: X_15 <= -0.565
|   |   |   |   left: -94.0148
|   |   |   |   right: 60.1510
|   |   right: X_0 <= -0.111
|   |   |   left: X_16 <= -0.793
|   |   |   |   left: -56.7804
|   |   |   |   right: 84.1746
|   |   |   right: X_3 <= 0.946
|   |   |   |   left: 154.6824
|   |   |   |   right: 325.1030


In [10]:
y_pred = regressor.predict(X_test)