<font color = green >

# Decision Trees
</font>

In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import tree
import graphviz
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
X, y, labels, features = cancer.data, cancer.target, cancer.target_names, cancer.feature_names
print ('labels:',labels)
print ('features:', features)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

labels: ['malignant' 'benign']
features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


<font color = green>

##  DecisionTree 

</font>



In [10]:
# YOUR_CODE.  create the classifier and train it 
# START_CODE
best_score = 0
for depth in np.arange(1,11,2):
    clf = DecisionTreeClassifier(
        criterion='entropy',
        random_state=10,        
        max_depth=depth,    
    ).fit(X_train, y_train)
    if clf.score(X_test, y_test) > best_score:
        best_score = clf.score(X_test, y_test)
        best_depth = depth
        train_accuracy = clf.score(X_train, y_train)
    print(f'Depth = {depth}')
    print("Train accuracy= {:.3%}".format(clf.score(X_train, y_train)))
    print("Test accuracy= {:.3%}".format(clf.score(X_test, y_test)))
    print('---------------')
# END_CODE 

print('\nResult:')
print('Best depth parameter = {}'.format(best_depth))
print("Train accuracy= {:.3%}".format(train_accuracy))
print("Test accuracy= {:.3%}".format(best_score))
# graph_viz = tree.export_graphviz(clf, out_file=None,feature_names=features, class_names=labels, filled = True)
# graph = graphviz.Source(graph_viz)
# graph.view(cleanup =True) # cleanup (bool) – Delete the source file after rendering.

Depth = 1
Train accuracy= 92.254%
Test accuracy= 88.112%
---------------
Depth = 3
Train accuracy= 97.653%
Test accuracy= 95.105%
---------------
Depth = 5
Train accuracy= 99.296%
Test accuracy= 96.503%
---------------
Depth = 7
Train accuracy= 100.000%
Test accuracy= 95.105%
---------------
Depth = 9
Train accuracy= 100.000%
Test accuracy= 95.105%
---------------

Result:
Best depth parameter = 5
Train accuracy= 99.296%
Test accuracy= 96.503%


<font color = green>

##  RandomForest

</font>



In [11]:
# YOUR_CODE.  create the classifier and train it 
# START_CODE
best_test_score = 0
for i in np.arange(1,11):   
    for j in np.arange(2,11,2): 
        clf = RandomForestClassifier(
            random_state=10,
            n_estimators=j,
            max_features=i,
        ).fit(X_train, y_train)
        if clf.score(X_test, y_test) > best_test_score:
            best_train_score = clf.score(X_train, y_train)
            best_test_score = clf.score(X_test, y_test)
            best_n_estimator = j
            best_max_features = i

print('The best result with parameters:')
print(f'n_estimator = {best_n_estimator}, max_features = {best_max_features}')
print('Train accuracy = {:.3%}'.format(best_train_score))
print('Test accuracy = {:.3%}'.format(best_test_score))

The best result with parameters:
n_estimator = 8, max_features = 5
Train accuracy = 99.531%
Test accuracy = 98.601%


<font color = green>

##  Gradient Boosting Decision Trees (GBDT)

</font>



In [12]:
# YOUR_CODE.  create the classifier and train it 
# START_CODE
test_score = 0
for learn_rate in np.arange(0.01, 0.1, 0.01):
    for depth in np.arange(1,11):  
        clf = GradientBoostingClassifier(
            learning_rate=learn_rate,
            max_depth=depth
        ).fit(X_train, y_train)
        if clf.score(X_test, y_test) > test_score:
            train_score = clf.score(X_train, y_train)
            test_score = clf.score(X_test, y_test)
            learning_rate = learn_rate
            best_depth = depth
            best_clf = clf
        
# END_CODE 
print('The best result with parameters:')
print(f'lerning_rate = {learning_rate}, max_depth = {best_depth}')
print("train accuracy= {:.3%}".format(train_score))
print("test accuracy= {:.3%}".format(test_score))

The best result with parameters:
lerning_rate = 0.060000000000000005, max_depth = 2
train accuracy= 99.765%
test accuracy= 97.203%


<font color = green>

##  XGBoost

</font>



In [13]:
# YOUR_CODE.  create the classifier and train it 
# START_CODE 
clf = XGBClassifier(use_label_encoder=False, eval_metric= 'logloss')
clf.fit(X_train, y_train)
# END_CODE 

print("train accuracy= {:.3%}".format(clf.score (X_train, y_train)))
print("test accuracy= {:.3%}".format(clf.score (X_test, y_test)))

train accuracy= 100.000%
test accuracy= 98.601%
