<a href="https://colab.research.google.com/github/Richish/hands_on_ml/blob/master/6_decision_tree_ex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 7. Train and fine-tune a Decision Tree for the moons dataset.
1. Generate a moons dataset using make_moons(n_samples=10000, noise=0.4).
2. Split it into a training set and a test set using train_test_split().
3. Use grid search with cross-validation (with the help of the GridSearchCV
class) to find good hyperparameter values for a DecisionTreeClassifier.
Hint: try various values for max_leaf_nodes.
4. Train it on the full training set using these hyperparameters, and measure
your model’s performance on the test set. You should get roughly 85% to 87%
accuracy.

In [6]:
#generating moons
from sklearn.datasets import make_moons

moons_data = make_moons(n_samples=10_000, shuffle=True, noise=0.4, random_state=42)
moons_data

(array([[ 0.9402914 ,  0.12230559],
        [ 0.12454026, -0.42477546],
        [ 0.26198823,  0.50841438],
        ...,
        [-0.24177973,  0.20957199],
        [ 0.90679645,  0.54958215],
        [ 2.08837082, -0.05050728]]), array([1, 0, 0, ..., 1, 0, 1]))

In [10]:
# test train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(moons_data[0], moons_data[1], test_size=0.2, shuffle=True, random_state=42)
len(X_train), X_test, len(y_train), y_test

(8000, array([[ 0.69945888, -0.8734481 ],
        [ 1.7764418 ,  0.13222334],
        [-1.14450821,  0.24446319],
        ...,
        [ 0.66336269,  0.79833307],
        [-0.6493245 ,  1.19920859],
        [-0.09883144,  0.40961263]]), 8000, array([1, 1, 0, ..., 0, 0, 0]))

In [39]:
# basic decision tree classifier - training and test
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=8)
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)


In [40]:
# accuracy on basic implementation
from sklearn import metrics
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.86

In [56]:
# grid search
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score)}

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid={'max_leaf_nodes':[2,3,4,5,6,7,8,16,32,64]}, scoring=scoring, refit='accuracy')
grid.fit(X=X_train, y=y_train)
sorted(grid.cv_results_.keys())
grid.cv_results_['params'], grid.cv_results_['mean_test_accuracy'], grid.cv_results_['mean_test_precision'], grid.cv_results_['mean_test_recall']

([{'max_leaf_nodes': 2},
  {'max_leaf_nodes': 3},
  {'max_leaf_nodes': 4},
  {'max_leaf_nodes': 5},
  {'max_leaf_nodes': 6},
  {'max_leaf_nodes': 7},
  {'max_leaf_nodes': 8},
  {'max_leaf_nodes': 16},
  {'max_leaf_nodes': 32},
  {'max_leaf_nodes': 64}],
 array([0.77    , 0.817125, 0.853   , 0.853   , 0.853   , 0.853   ,
        0.853   , 0.852125, 0.85575 , 0.849625]),
 array([0.75793329, 0.82737975, 0.84261037, 0.84261037, 0.84261037,
        0.84261037, 0.84261037, 0.86348752, 0.85477429, 0.85434341]),
 array([0.79540066, 0.80613504, 0.86941706, 0.86941706, 0.86941706,
        0.86941706, 0.86941706, 0.8375304 , 0.85845691, 0.84450549]))

In [57]:
# training on full training set using best params found in grid search and evaluating on test set
 
best_tree_clf = DecisionTreeClassifier(max_leaf_nodes=5)
best_tree_clf.fit(X_train, y_train)
best_y_pred = best_tree_clf.predict(X_test)
from sklearn import metrics
metrics.accuracy_score(y_true=y_test, y_pred=best_y_pred)



0.863