Written by: Mbongiseni Dlamini
Date: 24/04/20

# Trees

## 1) Creating and Fitting Decision trees: an example from the book

In [1]:
from sklearn.tree import DecisionTreeClassifier #get class
cancer = load_breast_cancer() #load data
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, stratify=cancer.target, random_state=42) #split into train and test data
tree = DecisionTreeClassifier(random_state=0) #instantiate
tree.fit(X_train, y_train) #train model
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

NameError: name 'load_breast_cancer' is not defined

As we cannot run the code, here are the results repported in the book:
Accuracy on training set: 1.000
Accuracy on test set: 0.937

Now, let's do some pre-pruning by limiting the maximum depth of the tree:

In [None]:
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

result:
Accuracy on training set: 0.988
Accuracy on test set: 0.951

training accuracy decreased but test accuracy decreased, which is good.

### 2) After fitting the tree, we can analyse it.

In [None]:
from sklearn.tree import export_graphviz
export_graphviz(tree, out_file="tree.dot", class_names=["malignant", "benign"],
feature_names=cancer.feature_names, impurity=False, filled=True)

let's see our tree

In [None]:
import graphviz
with open("tree.dot") as f:
dot_graph = f.read()
graphviz.Source(dot_graph)


Feature importance:Which feature is important in the tree's decision making?

In [None]:
print("Feature importances:\n{}".format(tree.feature_importances_))

In [None]:
def plot_feature_importances_cancer(model):
n_features = cancer.data.shape[1]
plt.barh(range(n_features), model.feature_importances_, align='center')
plt.yticks(np.arange(n_features), cancer.feature_names)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plot_feature_importances_cancer(tree)

# Random forests: an example

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
random_state=42)
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))

In [None]:
plot_feature_importances_cancer(forest)

In general, it’s a good rule of thumb to use
the default values: max_features=sqrt(n_features) for classification and max_fea
tures=log2(n_features) for regression.

# Gradient boosting: an example

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_test, y_train, y_test = train_test_split(
cancer.data, cancer.target, random_state=0)
gbrt = GradientBoostingClassifier(random_state=0, learning_rate=0.01, max_depth = 1)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))

In [None]:
plot_feature_importances_cancer(gbrt)