In [1]:
# Sample Decision Tree Classifier
from sklearn import datasets
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

In [2]:
# load the iris datasets
dataset = datasets.load_iris()

In [3]:
from sklearn.tree import export_graphviz
#import graphviz

In [4]:
def run_gridsearch(X, y, clf, param_grid, cv=5):
    """Run a grid search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    cf -- scikit-learn Decision Tree
    param_grid -- [dict] parameter settings to test
    cv -- fold of cross-validation, default 5

    Returns
    -------
    top_params -- [dict] from report()
    """
    grid_search = GridSearchCV(clf,
                               param_grid=param_grid,
                               cv=cv)
    start = time()
    grid_search.fit(X, y)

    print(("\nGridSearchCV took {:.2f} "
           "seconds for {:d} candidate "
           "parameter settings.").format(time() - start,
                len(grid_search.grid_scores_)))

    top_params = report(grid_search.grid_scores_, 3)
    return  top_params

In [5]:
def report(grid_scores, n_top=3):
    """Report top n_top parameters settings, default n_top=3.

    Args
    ----
    grid_scores -- output from grid or random search
    n_top -- how many to report, of top models

    Returns
    -------
    top_params -- [dict] top parameter settings found in
                  search
    """
    top_scores = sorted(grid_scores,
                        key=itemgetter(1),
                        reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print(("Mean validation score: "
               "{0:.3f} (std: {1:.3f})").format(
               score.mean_validation_score,
               np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

    return top_scores[0].parameters

In [6]:
def encode_target(df, target_column):
    """Add column to df with integers for the target.

    Args
    ----
    df -- pandas Data Frame.
    target_column -- column to map to int, producing new
                     Target column.

    Returns
    -------
    df -- modified Data Frame.
    targets -- list of target names.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

In [7]:
import pandas as pd
import numpy as np

In [8]:
import os

In [9]:
print("\n-- get data:")
df = pd.read_csv('/home/shenbaga/data-files/iris.data', \
                names = ['sepal_length','sepal_width','petal_length','petal_width','class'])


-- get data:


In [10]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [11]:
features = ['sepal_length','sepal_width','petal_length','petal_width']
df, targets = encode_target(df, "class")
y = df["Target"]
X = df[features]

In [12]:
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import  cross_val_score

In [13]:
from time import time

In [14]:
from operator import itemgetter
from scipy.stats import randint

In [15]:
print("-- Grid Parameter Search via 10-fold CV")

# set of parameters to test
param_grid = {"criterion": ["gini", "entropy"],
              "min_samples_split": [2, 10, 20],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }

dt = DecisionTreeClassifier()
ts_gs = run_gridsearch(X, y, dt, param_grid, cv=10)

-- Grid Parameter Search via 10-fold CV

GridSearchCV took 17.47 seconds for 288 candidate parameter settings.
Model with rank: 1
Mean validation score: 0.967 (std: 0.033)
Parameters: {'min_samples_split': 10, 'max_leaf_nodes': 5, 'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}

Model with rank: 2
Mean validation score: 0.967 (std: 0.033)
Parameters: {'min_samples_split': 20, 'max_leaf_nodes': 5, 'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 1}

Model with rank: 3
Mean validation score: 0.967 (std: 0.033)
Parameters: {'min_samples_split': 10, 'max_leaf_nodes': 5, 'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 1}



In [16]:
def run_randomsearch(X, y, clf, para_dist, cv=5, n_iter_search=20):
    """Run a random search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    cf -- scikit-learn Decision Tree
    param_dist -- [dict] list, distributions of parameters
                  to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.

    Returns
    -------
    top_params -- [dict] from report()
    """
    random_search = RandomizedSearchCV(clf,
                        param_distributions=param_dist,
                        n_iter=n_iter_search)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter "
           "settings.").format((time() - start),
                               n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return  top_params

In [17]:
print("-- 10-fold cross-validation "
      "[using setup from previous post]")
dt_old = DecisionTreeClassifier(min_samples_split=20,
                                random_state=99)
dt_old.fit(X, y)
scores = cross_val_score(dt_old, X, y, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()))

-- 10-fold cross-validation [using setup from previous post]
mean: 0.960 (std: 0.033)


In [18]:
type(scores)

numpy.ndarray

In [19]:
scores

array([ 1.        ,  0.93333333,  1.        ,  0.93333333,  0.93333333,
        0.93333333,  0.93333333,  0.93333333,  1.        ,  1.        ])

In [20]:
type(dt_old)

sklearn.tree.tree.DecisionTreeClassifier

In [24]:
dt_old.score

<bound method DecisionTreeClassifier.score of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')>

In [25]:
dt_old.score(X,y)

0.97999999999999998

In [37]:
print("\n-- Best Parameters:")
for k, v in ts_gs.items():
    print("parameter: {:<20s} setting: {}".format(k, v))


-- Best Parameters:
parameter: min_samples_split    setting: 10
parameter: max_leaf_nodes       setting: 5
parameter: criterion            setting: gini
parameter: max_depth            setting: None
parameter: min_samples_leaf     setting: 1


In [40]:
# test the retuned best parameters
print("\n\n-- Testing best parameters [Grid]...")
dt_ts_gs = DecisionTreeClassifier(**ts_gs)
scores = cross_val_score(dt_ts_gs, X, y, cv=10)
print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),
                                          scores.std()))



-- Testing best parameters [Grid]...
mean: 0.967 (std: 0.033)


In [38]:
print("-- Random Parameter Search via 10-fold CV")

# dict of parameter list/distributions to sample
param_dist = {"criterion": ["gini", "entropy"],
              "min_samples_split": randint(1, 20),
              "max_depth": randint(1, 20),
              "min_samples_leaf": randint(1, 20),
              "max_leaf_nodes": randint(2, 20)}

dt = DecisionTreeClassifier()
ts_rs = run_randomsearch(X, y, dt, param_dist, cv=10,
                         n_iter_search=288)

-- Random Parameter Search via 10-fold CV

RandomizedSearchCV took 4.11 seconds for 288 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.960 (std: 0.042)
Parameters: {'min_samples_split': 18, 'max_leaf_nodes': 10, 'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 3}

Model with rank: 2
Mean validation score: 0.960 (std: 0.042)
Parameters: {'min_samples_split': 14, 'max_leaf_nodes': 16, 'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 4}

Model with rank: 3
Mean validation score: 0.960 (std: 0.042)
Parameters: {'min_samples_split': 7, 'max_leaf_nodes': 8, 'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 5}



In [9]:
# make predictions
expected = dataset.target
predicted = model.predict(dataset.data)

In [10]:
type(expected), type(predicted)

(numpy.ndarray, numpy.ndarray)

In [11]:
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        50
          1       1.00      1.00      1.00        50
          2       1.00      1.00      1.00        50

avg / total       1.00      1.00      1.00       150



In [12]:
print(metrics.confusion_matrix(expected, predicted))

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]


In [96]:
from sklearn.datasets import load_iris
from sklearn import tree
iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

In [97]:
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [123]:
with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, feature_names=features, class_names=['0','1','2'], out_file=f)

In [124]:
import subprocess
subprocess.call(['dot', '-Tpdf', 'iris.dot', '-o' 'tree3.pdf'])

0

In [127]:
#a method to get the tree as pdf

from sklearn.datasets import load_iris
#from sklearn import tree
#iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, feature_names=features, out_file=f,class_names=iris.target_names)

import subprocess
subprocess.call(['dot', '-Tpdf', 'iris.dot', '-o' 'tree5.pdf'])

0

In [145]:
from sklearn.cross_validation import  cross_val_score

In [146]:
y = train["Target"]
X = train[features]

In [149]:
print("-- 10-fold cross-validation "
      "[using setup from previous post]")
dt_old = DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt_old.fit(X, y)
scores = cross_val_score(dt_old, X, y, cv=10)
print scores.mean()
print scores.std()
#print("mean: {:.3f} (std: {:.3f})".format(scores.mean(),scores.std()),end="\n\n" )

-- 10-fold cross-validation [using setup from previous post]
0.965151515152
0.0589937040422
