In [1]:
from sklearn import datasets
import numpy as np

from tree.classes import DecisionTreeClassifier, DecisionTreeRegressor
from model_selection.kfold import KFold
from ensemble.bagging import Bagging

## Classification

In [2]:
# Load the iris dataset and pull out the feature matrix and class labels
# that the classification cells below operate on.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [3]:
# Fit an entropy-criterion tree on the full data set, then print its
# predictions for those same training samples.
clf = DecisionTreeClassifier(criterion="entropy").fit(X, y)
print(clf.predict(X))

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [4]:
# Score the fitted tree on the data it was trained on; the value is
# reported under the "Error rate" label.
score = clf.score(X, y)
print("Error rate: %.2f" % score)

Error rate: 0.00


### KFold Cross Validation

In [5]:
# Cross-validate a gini-criterion tree: refit on each training split and
# print the score obtained on the held-out split as that fold's loss.
clf = DecisionTreeClassifier(criterion="gini")
kf = KFold(5)
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    clf = clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("Fold %d, loss = %.4f" % (fold, score))

Fold 1, loss = 0.0667
Fold 2, loss = 0.1000
Fold 3, loss = 0.0000
Fold 4, loss = 0.0667
Fold 5, loss = 0.0667


### Bagging

In [6]:
# Wrap a default decision tree in a 10-estimator Bagging ensemble, fit it
# on the full data set and print its predictions for the training samples.
# NOTE(review): no random seed is set in this notebook; if Bagging resamples
# randomly the printed predictions may vary between runs -- confirm.
base_tree = DecisionTreeClassifier()
clf = Bagging(base_tree, n_estimators=10)
clf.fit(X, y)
y_hat = clf.predict(X)
print(y_hat)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2
 2 2]


In [7]:
# Score the fitted ensemble on the data it was trained on; the value is
# reported under the "Error rate" label.
score = clf.score(X, y)
print("Error rate: %.2f" % score)

Error rate: 0.02


In [8]:
# Cross-validate a 10-estimator Bagging ensemble of decision trees: refit
# on each training split and print the held-out score as the fold's loss.
clf = Bagging(DecisionTreeClassifier(), n_estimators=10)
kf = KFold(5)
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    clf = clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("Fold %d, loss = %.4f" % (fold, score))

Fold 1, loss = 0.0667
Fold 2, loss = 0.1333
Fold 3, loss = 0.0000
Fold 4, loss = 0.0333
Fold 5, loss = 0.1000


## Regression

In [9]:
# Load the Boston housing data as feature matrix X and target vector y.
#
# `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2. On older versions the loader is used unchanged; on newer ones the
# same arrays are rebuilt from the original StatLib file, following the
# recipe given in scikit-learn's removal message (requires network access).
try:
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target
except (ImportError, AttributeError):
    import pandas as pd  # only needed on the fallback path

    raw_df = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    # Each record spans two physical rows of the raw file: the first row
    # holds 11 features, the second holds 2 more features and the target.
    X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    y = raw_df.values[1::2, 2]

In [10]:
# Fit a default regression tree on the full data set and print its
# predictions for those same training samples.
reg = DecisionTreeRegressor()
reg.fit(X, y)  # return value is discarded; `reg` stays bound to the estimator created above
y_hat = reg.predict(X)
print(y_hat)

[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 50.
 33.3 3

In [11]:
# Score the fitted regression tree on the data it was trained on; the
# value is reported under the "NMSE" label.
score = reg.score(X, y)
print("NMSE: %.2f" % score)

NMSE: 0.00


In [12]:
# Cross-validate a default regression tree: refit on each training split
# and print the score on the held-out split as that fold's loss.
reg = DecisionTreeRegressor()
kf = KFold(5)
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    reg = reg.fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    print("Fold %d, loss = %.4f" % (fold, score))

Fold 1, loss = 0.4951
Fold 2, loss = 0.1695
Fold 3, loss = 0.2865
Fold 4, loss = 0.5666
Fold 5, loss = 0.1316


### Bagging

In [13]:
# Wrap a default regression tree in a 10-estimator Bagging ensemble, fit it
# on the full data set and print its predictions for the training samples.
# NOTE(review): no random seed is set in this notebook; if Bagging resamples
# randomly the printed predictions may vary between runs -- confirm.
base_tree = DecisionTreeRegressor()
reg = Bagging(base_tree, n_estimators=10)
reg.fit(X, y)
y_hat = reg.predict(X)
print(y_hat)

[24.  21.6 34.7 33.4 36.2 28.7 22.8 27.1 16.5 18.9 15.  22.8 21.7 19.6
 22.6 19.9 23.1 17.5 16.1 19.6 14.1 19.6 15.2 14.5 14.5 13.9 16.6 14.8
 20.5 21.  12.7 14.5 19.  12.7 13.5 18.9 20.  21.  20.9 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 16.8 14.4 14.4 19.4 22.8 20.5 25.  23.4 18.9 33.4
 22.2 31.6 23.3 19.  19.9 16.  22.2 25.  33.4 23.5 19.4 21.  17.4 20.9
 24.2 20.6 22.8 23.4 24.8 21.4 20.  20.5 21.2 20.3 28.  23.9 24.8 24.8
 21.4 28.4 21.  24.4 23.6 23.6 22.6 22.9 22.9 24.8 20.6 28.4 21.4 38.7
 38.7 25.  27.5 26.5 18.6 19.6 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.4 20.4 19.3 22.  20.3 24.4  7.  21.5  7.
  7.  16.2 17.1 14.3 23.  19.6 23.  18.4 15.6 18.4 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.1 17.8 14.6 17.8 13.4 23.8 15.3 15.3 19.4
 19.4 15.6 13.1 23.9 27.5 23.3 27.  37.6 37.6 50.  22.7 25.  50.  23.8
 23.8 22.3 15.2 19.1 23.1 20.5 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 34.9 31.1 29.1 42.3
 33.3 3

In [14]:
# Score the fitted Bagging regressor on the data it was trained on.
# The label is "NMSE", matching how `reg.score` is reported for the
# single-tree regressor; "Error rate" is the label the classification cells
# use and does not describe a continuous target.
# NOTE(review): assumes Bagging.score reports the same metric as the wrapped
# regressor's score -- confirm against the Bagging implementation.
score = reg.score(X, y)
print("NMSE: %.2f" % score)

Error rate: 0.08


In [15]:
# Cross-validate a 10-estimator Bagging ensemble of regression trees: refit
# on each training split and print the held-out score as the fold's loss.
reg = Bagging(DecisionTreeRegressor(), n_estimators=10)
kf = KFold(5)
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]
    reg = reg.fit(X_train, y_train)
    score = reg.score(X_test, y_test)
    print("Fold %d, loss = %.4f" % (fold, score))

Fold 1, loss = 0.4482
Fold 2, loss = 0.2036
Fold 3, loss = 0.2428
Fold 4, loss = 0.7586
Fold 5, loss = 0.1443
