In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:]
y = iris.target

tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X,y)

In [3]:
from sklearn.tree import export_graphviz

export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [19]:
tree_clf.predict_proba([[5,1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [20]:
tree_clf.classes_

array([0, 1, 2])

In [None]:
# Exercise

In [21]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X,y = make_moons(n_samples=10000,noise=0.4)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)


In [66]:
from sklearn.model_selection import GridSearchCV

tree_clf = DecisionTreeClassifier()
param_grid = [
    {'max_leaf_nodes':[2,4,8,10,20,30],'min_samples_leaf':[4,6,8],'max_depth':[4,8,10]},
    
]


grid_search = GridSearchCV(tree_clf,param_grid,cv=10,
                           scoring = 'neg_mean_squared_error',
                           return_train_score= True)

grid_search.fit(X_train,y_train)


In [67]:
grid_search.best_params_

{'max_depth': 4, 'max_leaf_nodes': 4, 'min_samples_leaf': 4}

In [68]:
grid_search.best_score_

-0.14675

In [74]:
from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(grid_search, X_train, y_train,scoring="f1", cv=5)
train_score = scores

In [78]:
train_score.mean()

0.8543281054177516

In [76]:
scores = cross_val_score(grid_search, X_val, y_val,scoring="f1", cv=5)
test_score = scores

In [77]:
test_score.mean()

0.8454046529884588

In [79]:
# Exercise

In [80]:
from sklearn.model_selection import ShuffleSplit

In [99]:
rs = ShuffleSplit(n_splits=1000,test_size=100,random_state=72)
rs.get_n_splits(X_train)

1000

In [103]:
sub_indices = np.array([train_index for _, train_index in rs.split(X_train)])

In [107]:
from sklearn.metrics import accuracy_score
from scipy.stats import mode

accuracies = []
predictions = []

for subset_indices in sub_indices:
    X_subset = X_train[subset_indices]
    y_subset = y_train[subset_indices]
    
    # Initialize the Decision Tree with the best hyperparameters
    tree_clf = DecisionTreeClassifier(
        max_depth=grid_search.best_params_['max_depth'],
        min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
        max_leaf_nodes=grid_search.best_params_['max_leaf_nodes']
    )
    
    # Train the tree on the current subset
    tree_clf.fit(X_subset, y_subset)
    
    # Predict on the test set
    y_pred = tree_clf.predict(X_val)
    predictions.append(y_pred)
    
    # Evaluate the accuracy on the test set
    accuracy = accuracy_score(y_val, y_pred)
    accuracies.append(accuracy)



mean_accuracy = np.mean(accuracies)
print(f'Mean accuracy over 1,000 Decision Trees: {mean_accuracy:.4f}')
predictions = np.array(predictions)
majority_vote_predictions = mode(predictions, axis=0)[0].flatten()

accuracy = accuracy_score(y_val, majority_vote_predictions)
print(f'Majority vote accuracy: {accuracy:.4f}')

Mean accuracy over 1,000 Decision Trees: 0.8170
Majority vote accuracy: 0.8540
