# 7.2: Decision Trees

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier

## Warm-Up Task

* Load the 18 entries of shirt sizes into a DataFrame and min-max normalize it

In [6]:
# Warm-up: load the shirt-size data, min-max normalize the features, and
# classify a single unseen instance with a 3-nearest-neighbor classifier.
df = pd.read_csv('shirt_sizes_long.csv')

# Features are every column except the class label (axis=1 selects columns).
X_train = df.drop("t-shirt size", axis=1)
print('training dataset:\n', X_train)

y_train = df["t-shirt size"]
print('\ntraining y-vector:\n', y_train)

# A single test instance, given in the same column order as X_train.
X_test = [[161, 63]]
print('\ntest data:\n', X_test)


# Learn each feature's min/max from the training data only, then apply that
# same mapping to both the training and the test instances.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_normalized = scaler.transform(X_train)
print('Normalized training data:\n', X_train_normalized)
# The scaler was fitted on a DataFrame, so give the test instance the same
# column names before transforming; a bare list carries no feature names,
# which newer scikit-learn versions warn about.
X_test_frame = pd.DataFrame(X_test, columns=X_train.columns)
X_test_normalized = scaler.transform(X_test_frame)
print('\nNormalized test data:\n', X_test_normalized)


knn_clf = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn_clf.fit(X_train_normalized, y_train)


y_predicted = knn_clf.predict(X_test_normalized)
print('y predicted:', y_predicted)
# kneighbors() returns the neighbor distances together with their row indices.
print('nearest neighbors:', knn_clf.kneighbors(X_test_normalized))

training dataset:
     height(cm)  weight(kg)
0          158          58
1          158          59
2          158          63
3          160          59
4          160          60
5          163          60
6          163          61
7          160          64
8          163          64
9          165          61
10         165          62
11         165          65
12         168          62
13         168          63
14         168          66
15         170          63
16         170          64
17         170          68

training y-vector:
 0     M
1     M
2     M
3     M
4     M
5     M
6     M
7     L
8     L
9     L
10    L
11    L
12    L
13    L
14    L
15    L
16    L
17    L
Name: t-shirt size, dtype: object

test data:
 [[161, 63]]
Normalized training data:
 [[0.         0.        ]
 [0.         0.1       ]
 [0.         0.5       ]
 [0.16666667 0.1       ]
 [0.16666667 0.2       ]
 [0.41666667 0.2       ]
 [0.41666667 0.3       ]
 [0.16666667 0.6       ]
 [0.41666667 0.6 

## Classifier Evaluation

* In our previous demo, we had 1 instance in our "test set"
    * If our classifier predicted this instance's class correctly, accuracy 100%
    * If our classifier predicted this instance's class incorrectly, accuracy 0%
* Notes
    * We should use a large test set to get a better picture
    * Accuracy doesn't tell the whole story...
        * E.g. 100 samples... 99 M, 1 L
        * Classifier only predicts M
        * We have 99% accuracy woohoo!!!
        * **ACCURACY ONLY MAKES SENSE WHEN YOUR CLASS LABELS ARE NEAR EVENLY DISTRIBUTED**
* Given a dataset, we need a way to "divide" our dataset into a training set and a test set
    * A few ways to do this...
        1. Hold out method
        1. Random subsampling
        1. Cross validation
        1. Bootstrap method

## Holdout Method

* "hold out" a certain number or percentage of instances in a dataset for testing
    * Train on the remaining instances
    * Typically choose a standard split or percentage
        * E.G.: With a 2:1 split you have 1/3 data for testing and 2/3 data for training
        * E.G.: 25% held out for testing, so remaining 75% is used for training
            * Default for sklearn's `train_test_split()`

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Separate the feature columns from the class label.
label_column = "t-shirt size"
X = df.drop(columns=label_column)
y = df[label_column]

# Hold out the default 25% of the rows for testing. A fixed random_state makes
# the split reproducible, and stratify=y keeps the class proportions the same
# in the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, stratify=y
)

# NOTE(review): kNN is fitted on the raw (unscaled) features here, unlike the
# warm-up cell, which min-max normalizes first.
knn_clf = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn_clf.fit(X_train, y_train)
y_predicted = knn_clf.predict(X_test)
print(y_predicted)
print(y_test.tolist())
# Holdout accuracy, computed two ways: from the predictions, and directly by
# the classifier's own score() method.
print(accuracy_score(y_test, y_predicted))
print(knn_clf.score(X_test, y_test))

['L' 'L' 'M' 'M' 'L']
['L', 'M', 'L', 'M', 'L']
0.6
0.6


In [83]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the same holdout split and score it on the held-out rows.
# random_state is fixed because the tree shuffles the features at every split;
# when several splits are equally good, the one chosen (and so the predictions)
# can otherwise differ from run to run.
tree_clf = DecisionTreeClassifier(random_state=0)
tree_clf.fit(X_train, y_train)
y_predicted = tree_clf.predict(X_test)
print(y_predicted)
print(list(y_test))
# score() reports the accuracy on the given test data and labels.
print(tree_clf.score(X_test, y_test))

['L' 'M' 'L' 'M' 'L']
['L', 'M', 'L', 'M', 'L']
1.0


## Random Subsampling

Perform the holdout method $k$ times, each time with a different random train/test split (this $k$ is unrelated to the $k$ in kNN). The reported accuracy is the mean accuracy over the $k$ runs.

## Cross Validation

* With random subsampling, we are not guaranteed that each instance ends up in a test set at least once
* With cross validation, we are more intentional about our "partitions"
* Algorithm:
    * Divide the dataset into $k$ folds (yet another different $k$)
    * For each fold:
        * Hold out the fold and test on it
        * Train on the remaining folds (folds - fold)
* With this algorithm, each instance is tested exactly 1 time
* Accuracy = total predicted correctly / total predicted

In [86]:
from sklearn.model_selection import cross_val_score, cross_val_predict

# Evaluate the kNN and the decision tree with 5-fold cross validation.
N_FOLDS = 5
for clf in (knn_clf, tree_clf):
    print(type(clf))

    # Quick approach: one accuracy value per fold.
    accuracies = cross_val_score(clf, X, y, cv=N_FOLDS)
    print(accuracies)

    # Preferred approach: collect the prediction each instance received while
    # its fold was held out, then score once over all instances
    # (accuracy = total predicted correctly / total predicted).
    y_predicted = cross_val_predict(clf, X, y, cv=N_FOLDS)
    print(y_predicted)
    accuracy = accuracy_score(y, y_predicted)
    print(accuracy)

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
[0.75       0.5        1.         0.66666667 0.66666667]
['M' 'M' 'M' 'M' 'M' 'L' 'L' 'M' 'L' 'M' 'M' 'L' 'L' 'L' 'L' 'L' 'L' 'L']
0.7222222222222222
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
[0.5        0.5        1.         1.         0.66666667]
['M' 'M' 'L' 'M' 'M' 'M' 'L' 'M' 'M' 'M' 'L' 'L' 'L' 'L' 'L' 'L' 'L' 'L']
0.7222222222222222
