# Part 1

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.datasets import make_moons
from sklearn.model_selection import GridSearchCV

In [2]:
# Synthetic two-feature "moons" dataset; the fixed seed keeps the sample repeatable.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=6,
)

In [3]:
# Sanity check: dimensions of the feature matrix.
X.shape

(10000, 2)

In [4]:
# Sanity check: dimensions of the label vector.
y.shape

(10000,)

In [5]:
# Tune max_leaf_nodes with a cross-validated grid search.
# NOTE(review): the search is fit on the full (X, y), i.e. before the
# train/test split made in a later cell, so the held-out test rows take part
# in model selection. Consider tuning on the training split only.
parameters = {'max_leaf_nodes':range(2,50)}
# A fixed random_state makes the tree's split selection (and therefore the
# search result) repeatable across runs.
clf = GridSearchCV(DecisionTreeClassifier(random_state=6), parameters, n_jobs=4)
clf.fit(X=X, y=y)
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

0.8555999999999999 {'max_leaf_nodes': 23}


In [6]:
# Hold out 20% of the samples for testing; the seed fixes which rows are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=6,
)

In [7]:
# Fit one tree on the training split with max_leaf_nodes=23.
# random_state=6 makes the fit repeatable and matches the seed used for the
# other seeded estimators and splitters in this notebook.
clf = DecisionTreeClassifier(max_leaf_nodes=23, random_state=6)
clf.fit(X_train, y_train)

In [8]:
# Score the fitted tree on the held-out test split.
y_pred = clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.8515


# Part 2

In [9]:
from sklearn.model_selection import ShuffleSplit
from scipy import stats

In [10]:
# Splitter producing 1000 random splits, each with a 1% train part and a 1% test part.
shuffle = ShuffleSplit(
    n_splits=1000,
    train_size=0.01,
    test_size=0.01,
    random_state=6,
)

In [11]:
# Show the splitter configuration.
print(shuffle)

ShuffleSplit(n_splits=1000, random_state=6, test_size=0.01, train_size=0.01)


In [12]:
# Per-split feature and label subsets, keyed by split index.
X_subsets, y_subsets = {}, {}

In [13]:
# Store the train part of every split; the test indices the splitter also
# yields are not used.
for i, (train_idx, unused_idx) in enumerate(shuffle.split(X_train)):
    X_subsets[i] = X_train[train_idx]
    y_subsets[i] = y_train[train_idx]

In [14]:
# Train one tree per stored subset and record its accuracy on the test split.
# The same estimator object is refit on every iteration.
clf = DecisionTreeClassifier(max_leaf_nodes=23, random_state=6)
scores = []
# Iterate over however many subsets were stored instead of a hard-coded
# count, so this stays in step with the splitter's n_splits.
for subset in range(len(X_subsets)):
    clf.fit(X_subsets[subset], y_subsets[subset])
    y_pred = clf.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))

In [15]:
# Report the spread of the per-subset test accuracies.
highest, lowest = max(scores), min(scores)
print("Max accuracy:", format(highest))
print("Min accuracy:", lowest)

Max accuracy: 0.8505
Min accuracy: 0.672


In [None]:
# Majority-vote ensemble: each tree is trained once on its own subset and
# predicts the whole test set; the most frequent label per test instance is
# the ensemble prediction.
#
# This replaces a version that (a) refit every tree for every single test
# instance and (b) stored the whole result object of stats.mode instead of
# the modal label, so y_pred was not a flat array of labels.
n_subsets = len(X_subsets)
predictions = np.empty((n_subsets, X_test.shape[0]), dtype=y_test.dtype)
for subset in range(n_subsets):
    clf.fit(X_subsets[subset], y_subsets[subset])
    predictions[subset] = clf.predict(X_test)  # one row of votes per tree

# Vote across trees (axis 0). np.ravel flattens the result whether or not the
# installed SciPy keeps the reduced axis in the returned mode array.
y_pred = np.ravel(stats.mode(predictions, axis=0).mode)

print("Accuracy on test set:", metrics.accuracy_score(y_test, y_pred))