# Decision Trees in scikit-learn
Using the `DecisionTreeClassifier` in scikit-learn.  

In [None]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
# Load the apples-vs-pears sample data and preview its first five rows.
apears = pd.read_csv(filepath_or_buffer='ApplesPears.csv')
apears.head(n=5)

scikit-learn can handle a categorical class label, but its decision trees cannot handle categorical features.  
So we drop the `Taste` feature. 

In [None]:
# pop() removes the 'Class' column from the frame and hands it back,
# so the labels end up in y and no longer appear among the features.
y = apears.pop('Class').values
# 'Taste' is a categorical feature, which the tree cannot consume: discard it.
del apears['Taste']
# Whatever columns remain are the predictors.
ap_features = apears.columns.tolist()
X = apears.values
X[0]

In [None]:
# Check the container type of X, which was produced with DataFrame.values.
type(X)

In [None]:
# Display the class labels that were popped from the 'Class' column.
y

In [None]:
# Show the feature columns left in the frame after 'Class' and 'Taste' were removed.
apears

In [None]:
# Entropy-based tree. random_state pins the tie-breaking between equally good
# splits, so the fitted tree is identical on every run.
ftree = DecisionTreeClassifier(criterion='entropy', random_state=0)
# Train on the NumPy array X (the same data as `apears`). A model fitted on a
# DataFrame warns "X does not have valid feature names" when it is later asked
# to predict on a plain array such as [X[2]].
ap_tree = ftree.fit(X, y)

In [None]:
# Print the built-in documentation for the classifier object.
help(ftree)

In [None]:
# predict() takes a 2-D (n_samples, n_features) input, so row 2 is passed
# as a one-row slice rather than as a bare 1-D vector.
ap_tree.predict(X[2:3])

In [None]:
# Draw the fitted apple/pear tree. The trailing ';' suppresses the verbose
# list of annotation objects that plot_tree returns.
tree.plot_tree(
    ap_tree,
    feature_names=ap_features,
    class_names=['Apple', 'Pear'],
    fontsize=12,
    filled=True,
    rounded=True,
);

In [None]:
# Make the task harder by discarding the 'H/W' feature, then rebuild the
# feature-name list and the feature matrix from the columns that remain.
del apears['H/W']
ap_features = apears.columns.tolist()
X = apears.values

In [None]:
# Train a *separate* classifier that copies the settings of `ftree`.
# Calling ftree.fit() again would refit the existing object in place, and since
# fit() returns the estimator itself, `ap_tree` and `ap2_tree` would end up
# being the very same (reduced-feature) model.
ap2_tree = DecisionTreeClassifier(**ftree.get_params()).fit(X, y)

In [None]:
# Plot the tree trained without 'H/W' on a 6x6 inch canvas.
fig, ax = plt.subplots(figsize=(6, 6))
tree.plot_tree(
    ap2_tree,
    feature_names=ap_features,
    class_names=['Apple', 'Pear'],
    fontsize=12,
    filled=True,
    rounded=True,
);  # ';' suppresses the verbose return value of plot_tree

---
## Penguins Data
For more information on the Penguins dataset see:
https://allisonhorst.github.io/palmerpenguins/ 

Changing the `min_samples_leaf` and `min_impurity_decrease` attributes will change the *bushiness* of the tree. 
Two key methods:
1. `fit` method will train the tree from the data.
2. `predict` method will produce class predictions for an array of test data. 

In [None]:
# Load the penguins data and select the four measurement columns as predictors.
penguins_all = pd.read_csv('penguins_af.csv')
f_names = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]
X = penguins_all.loc[:, f_names].values
y = penguins_all['species']
# np.unique returns the distinct species labels in sorted order.
species_names = list(np.unique(y))
species_names

In [None]:
# (number of rows, number of selected feature columns) of the penguin feature matrix.
X.shape

In [None]:
# A deliberately constrained tree: every leaf must hold at least 30 samples and
# a split is only made if it reduces impurity by at least 0.1.
ptree = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_leaf=30,
    min_impurity_decrease=0.1,
)
ptree.fit(X, y)

## ptree # Have a look at the model attributes

In [None]:
# Render the penguin tree, labelling nodes with the feature and species names.
fig, ax = plt.subplots(figsize=(6, 6))
tree.plot_tree(
    ptree,
    feature_names=f_names,
    class_names=species_names,
    fontsize=9,
    filled=True,
    rounded=True,
);  # ';' suppresses the verbose return value of plot_tree

Run a test example.

In [None]:
# Pick one row, list each feature value next to its name, then classify the row.
tn = 15
for value, name in zip(X[tn], f_names):
    print(value, name)
# predict() expects a 2-D input, hence the single row wrapped in a list.
y_pred = ptree.predict([X[tn]])
print('Predicted class label:', y_pred[0])

***
   ## Athlete Data

In [None]:
import pandas as pd
# Load the athlete data, using the 'Athlete' column as the row index.
athlete = pd.read_csv(
    'AthleteSelection.csv',
    index_col='Athlete',
)
athlete.head(n=5)

In [None]:
# Remove the 'Selected' column from the frame and keep it as the target array;
# the columns left behind form the feature matrix.
selected = athlete.pop('Selected')
y = selected.values
X = athlete.values

In [None]:
# Gini-based tree for the athlete data. fit() returns the estimator itself, so
# construction and training are chained. random_state fixes the tie-breaking
# between equally good splits so the same tree is produced on every run.
atree = DecisionTreeClassifier(criterion='gini', random_state=0).fit(X, y)

In [None]:
# Plot the athlete-selection tree.
fig, ax = plt.subplots(figsize=(6, 6))
tree.plot_tree(
    atree,
    feature_names=['Speed', 'Agility'],
    # plot_tree matches class_names positionally against the classifier's sorted
    # classes_, so take the labels from the model instead of hard-coding an
    # order that can silently mislabel the leaves.
    class_names=[str(label) for label in atree.classes_],
    filled=True,
    rounded=True,
    ax=ax,
);  # ';' suppresses the verbose return value of plot_tree

## Restaurant Data
The predictive features are categorical (rather than numeric).

In [None]:
import pandas as pd
# Load the restaurant data, using the 'No' column as the row index.
restaurant = pd.read_csv(
    'restaurant.csv',
    index_col='No',
)
restaurant.head(n=5)

## Aside: Dealing with category data
Convert to numeric - two options:  
1. `get_dummies` method for pandas.
2. `OneHotEncoder` class in sklearn. 

In [None]:
# A tiny all-categorical frame used to demonstrate the encoders below.
toy_columns = {
    'Pet': ['cat', 'dog', 'cat', 'ferret'],
    'Transport': ['bike', 'car', 'car', 'bike'],
    'Area': ['urban', 'urban', 'rural', 'urban'],
}
df = pd.DataFrame(toy_columns)
df

### Pandas `get_dummies`
The Pandas `get_dummies` method is the easiest way to do One-Hot encoding.  
But if you want to apply the encoding to a test file later, this gets awkward. 

In [None]:
# One indicator column for every category value of every column.
pd.get_dummies(df)

In [None]:
# drop_first=True omits the first level of each feature, leaving k-1 indicator
# columns for a feature with k categories.
pd.get_dummies(df,drop_first=True)

### Using `OneHotEncoder` to convert category features to numbers

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# sparse_output=False makes the encoder return a dense NumPy array.
onehot_encoder = OneHotEncoder(sparse_output=False)
# Learn the categories of each column, then encode the same frame.
onehot_encoder.fit(df)
dfOH = onehot_encoder.transform(df)
dfOH

In [None]:
# Names of the generated one-hot columns.
onehot_encoder.get_feature_names_out()

In [None]:
# The category values the encoder learned for each input column during fit.
onehot_encoder.categories_

### `LabelEncoder` also converts category features to numbers
This is more compact.  
But it is not exactly what we want as the numbers are misleading.  
Ferrets are not more like dogs than cats. (Well maybe they are!)

In [None]:
# LabelEncoder handles a single column at a time, so it is applied column by
# column; the one encoder object is re-fitted on each column in turn.
label_encoder = LabelEncoder()
labelE = df.apply(lambda column: label_encoder.fit_transform(column))
labelE

---
# Restaurant Data 
## Using OneHotEncoding
`OneHotEncoder` class has two key methods:   
1. `fit` to 'learn' the transform from the data,
2. `transform` to apply the OneHot transform to the data, the transform can be applied to other (e.g. test) datasets.


In [None]:
# Reload the restaurant data and separate the 'WillWait?' target from the
# (still categorical) feature columns.
restaurant = pd.read_csv('restaurant.csv', index_col='No')
will_wait = restaurant.pop('WillWait?')
y = will_wait.values
X = restaurant.values
# Peek at the first three feature rows.
X[:3]

In [None]:
# Dense one-hot encoder; drop='first' could be added to omit one level per feature.
onehot_encoder = OneHotEncoder(sparse_output=False)
# fit() learns the categories and returns the encoder itself, so `restOH`
# is simply the fitted encoder under a second name.
onehot_encoder.fit(restaurant)
restOH = onehot_encoder
# Apply the learned encoding to the training frame.
restOH_data = restOH.transform(restaurant)

In [None]:
# Feature names before one-hot encoding.
restaurant.columns

In [None]:
# Names of the one-hot columns, derived from the original column names.
restOH.get_feature_names_out(restaurant.columns)

In [None]:
# Train an entropy tree on the one-hot encoded restaurant data. With binary
# indicator features many candidate splits tie, so random_state is fixed to get
# the same tree on every run.
rtree = DecisionTreeClassifier(criterion='entropy', random_state=0)
rtreeOH = rtree.fit(restOH_data, y)
fig, ax = plt.subplots(figsize=(9, 9))
tree.plot_tree(
    rtreeOH,
    feature_names=list(restOH.get_feature_names_out(restaurant.columns)),
    # plot_tree matches class_names positionally against the classifier's sorted
    # classes_, so take the labels from the model rather than hard-coding an
    # order that can silently swap the leaf labels.
    class_names=[str(label) for label in rtreeOH.classes_],
    fontsize=10,
    filled=True,
    rounded=True,
    ax=ax,
);  # ';' suppresses the verbose return value of plot_tree

---
<h1><span style="color:red">Bonus Material</span></h1>



## Encoding Restaurant data using `get_dummies`

In [None]:
# Two dummy encodings of the same frame: rest1 keeps every category level,
# rest2 drops the first level of each feature.
rest1, rest2 = (
    pd.get_dummies(restaurant, drop_first=drop_flag)
    for drop_flag in (False, True)
)

In [None]:
# Preview the encoding that keeps every category level.
rest1.head()

In [None]:
# Preview the encoding with the first level of each feature dropped.
rest2.head()

In [None]:
# Plain NumPy array of the full dummy-encoded features.
X = rest1.values

In [None]:
# random_state fixes the tie-breaking between equally good splits, which are
# common with binary dummy features, so the tree is reproducible.
rtree1 = DecisionTreeClassifier(criterion='entropy', random_state=0)
# fit() can be called directly on the DataFrame; passing the equivalent NumPy
# array (rest1.values) trains the same tree. Train once only, because a second
# fit() call simply discards the first model.
rtree1.fit(rest1, y)

In [None]:
# Plot the tree trained on the full dummy encoding.
fig, ax = plt.subplots(figsize=(8, 8))
tree.plot_tree(
    rtree1,
    feature_names=list(rest1.columns),
    # plot_tree matches class_names positionally against the classifier's sorted
    # classes_, so take the labels from the model rather than hard-coding an
    # order that can silently swap the leaf labels.
    class_names=[str(label) for label in rtree1.classes_],
    fontsize=9,
    filled=True,
    rounded=True,
    ax=ax,
);  # ';' suppresses the verbose return value of plot_tree

In [None]:
# Same model as before, but trained on the drop_first encoding (one fewer
# indicator column per feature). random_state fixes the tie-breaking between
# equally good splits so the tree is reproducible.
rtree2 = DecisionTreeClassifier(criterion='entropy', random_state=0)
X2 = rest2.values
rtree2.fit(X2, y)
fig, ax = plt.subplots(figsize=(8, 8))
tree.plot_tree(
    rtree2,
    feature_names=list(rest2.columns),
    # plot_tree matches class_names positionally against the classifier's sorted
    # classes_, so take the labels from the model rather than hard-coding an
    # order that can silently swap the leaf labels.
    class_names=[str(label) for label in rtree2.classes_],
    fontsize=9,
    filled=True,
    rounded=True,
    ax=ax,
);  # ';' suppresses the verbose return value of plot_tree