# Machine Learning Practice
## Module 11: Decision Trees
## Tree learning demonstration

Andrew H. Fagg (andrewhfagg@gmail.com)

Important Notes:
- This skeleton deviates somewhat from what is presented in the video
- New: I have added some new cells where we learn/test using the entire data set.  This allows us to see what is possible if we have enough data.
- UPDATED: cross_val_predict() overfits the data really easily.  Use cv=40 to cv=100 to have enough data in the training set to produce generalizable models
- NOTE: In the video, I suggest that proba[:,0] corresponds to the positive class probabilities.  This is not true.  proba[:,1] are the positive class probabilities.  

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from IPython import display

##############################
# Default figure parameters, applied together in a single update
plt.rcParams.update({
    'figure.figsize': (8,4),
    'font.size': 12,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'figure.constrained_layout.use': True,
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})



In [None]:
# Mount Google Drive so that files under /content/drive can be read.
# The object that provides mount() must be imported under the name `drive`,
# because that is the name used on the next line.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# From book
# Pipeline component: select subsets of attributes
class DataFrameSelector(BaseEstimator, TransformerMixin):
    '''
    Pipeline component that extracts a subset of attributes.

    :param attribs: Label(s) used to index the incoming data in transform()
    '''
    def __init__(self, attribs):
        self.attribs = attribs

    def fit(self, x, y=None):
        '''Nothing is learned from the data; the selector itself is returned.'''
        return self

    def transform(self, X):
        '''Index X by the configured attributes and return the result's .values'''
        selected = X[self.attribs]
        return selected.values

# Pipeline component: New transformer class: drop all rows that contain invalid values
class DataSampleDropper(BaseEstimator, TransformerMixin):
    '''
    Pipeline component that discards every row containing a missing value.
    '''
    def __init__(self):
        # This transformer has no configuration
        pass

    def fit(self, x, y=None):
        '''Nothing is learned from the data; the dropper itself is returned.'''
        return self

    def transform(self, X):
        '''Return the result of X.dropna() with rows dropped on any missing entry.'''
        cleaned = X.dropna(how = 'any')
        return cleaned

# Pipeline component: Compute derivatives
class ComputeDerivative(BaseEstimator, TransformerMixin):
    '''
    Pipeline component that adds finite-difference derivative columns.

    For every column named in attribs, a new column called prefix + name is
    added to a copy of the input.  Row i of the new column holds
    (value[i+1] - value[i]) / dt; the last row, which has no successor,
    holds 0.

    :param attribs: Iterable of column names to differentiate
    :param dt: Time between consecutive rows (divisor of the differences)
    :param prefix: String placed in front of a column name to form the
                   name of its derivative column
    '''
    def __init__(self, attribs, dt=1.0, prefix='d_'):
        self.attribs = attribs
        self.dt = dt
        self.prefix = prefix

    def fit(self, x, y=None):
        '''Nothing is learned from the data; the transformer itself is returned.'''
        return self

    def transform(self, X):
        '''
        Return a copy of X with one derivative column added per attribute.

        :param X: Table that contains every column listed in attribs
        :return: Copy of X with the additional prefix + name columns
        '''
        Xout = X.copy()
        for field in self.attribs:
            # Extract the values for this field
            values = Xout[field].values
            # Compute the difference between subsequent values
            diff = values[1:] - values[0:-1]
            # Bring the length to be the same as the original data.
            # np.append returns a new array, so the result must be kept.
            # An empty table needs no padding.
            if len(values) > 0:
                diff = np.append(diff, 0)
            # Name of the new field
            name = self.prefix + field
            # Scale by the time step.  Assigning the array itself stores the
            # values by position, independent of the row index of X.
            Xout[name] = diff / self.dt
        return Xout

In [None]:
## Support for identifying kinematic variable columns
def get_kinematic_properties(data):
    '''
    Find the kinematic fields: the names in data that end in _x, _y or _z.

    :param data: Object for which list(data) yields the field names
    :return: List of the matching names, in their original order
    '''
    kinematic_suffix = re.compile("_[xyz]$")
    return list(filter(kinematic_suffix.search, list(data)))

def position_fields_to_position_and_velocity_fields(fields, prefix='d_'):
    '''
    Build the combined list of position and velocity column names.

    :param fields: List of position column names
    :param prefix: String placed in front of each position name to form the
                   corresponding velocity name
    :return: New list: the position names followed by the velocity names
    '''
    velocity_fields = []
    for position_field in fields:
        velocity_fields.append(prefix + position_field)
    return fields + velocity_fields


## Load and organize data

In [None]:
# Note: you may need to change this path to get to the data
# The commented-out line is an alternative local path; the active line points
# into the Google Drive folder mounted at /content/drive
#fname = '/home/fagg/datasets/baby1/subject_k2_w10.csv'
fname = '/content/drive/MyDrive/MLP_2022/datasets/baby1/subject_k2_w10.csv'

# Load the data: read the CSV file into a pandas table
infant_data = pd.read_csv(fname)

In [None]:
# Extract time / actions as arrays taken directly from the loaded table
# NOTE: both names are assigned again later from the preprocessed table
time = infant_data['time'].values
action = infant_data['sippc_action'].values

In [None]:
# Names of the kinematic variables (column names ending in _x, _y or _z)
fieldsKin = get_kinematic_properties(infant_data)
# The same names followed by their velocity counterparts (default prefix 'd_')
fieldsKinVel = position_fields_to_position_and_velocity_fields(fieldsKin)

## Create Pipelines

In [None]:
# Shared preprocessing: add a derivative column for every kinematic field
# (differences are divided by dt=.02), then drop every row that contains an
# invalid value
prepipe = Pipeline([
    ('derivative', ComputeDerivative(fieldsKin, dt=.02)),
    ('dropper', DataSampleDropper())
])

# Position pipe: selects the kinematic position columns
pipe_pos = Pipeline([('selector', DataFrameSelector(fieldsKin))])

# Position + velocity selector: positions plus their derivative columns
pipe_pos_vel = Pipeline([('selector', DataFrameSelector(fieldsKinVel))])

# Robot action: selects the column used to build the class labels
attribs_label = ['sippc_action']
pipe_label = Pipeline([('selector', DataFrameSelector(attribs_label))])

# Time: selects the time stamp column
attribs_time = ['time']
pipe_time = Pipeline([('selector', DataFrameSelector(attribs_time))])


In [None]:
# Use the pipelines: run the shared preprocessing on the loaded table
infant_data2 = prepipe.fit_transform(infant_data)

# Selection: pull each group of columns out of the preprocessed table
inputs_pos = pipe_pos.transform(infant_data2)
inputs_pos_vel = pipe_pos_vel.transform(infant_data2)
# The single label column is flattened to a 1-D array
action = pipe_label.transform(infant_data2).reshape((-1,))
# Time is kept in the shape returned by the selector (not flattened)
time = pipe_time.transform(infant_data2)

## Create Class Labels

In [None]:
# True for every sample in which some action is active (action code > 0)
label_motion = action > 0

# Action onset (any action)
# An onset is marked at sample i when action[i] is 0 and action[i+1] lies in
# the range of interest (here 1..8).  The comparison yields one element fewer
# than action, so a 0 is appended for the final sample, which has no successor.

label_assistance_onset_any = (action[0:-1] == 0) & (action[1:] > 0) & (action[1:] <= 8)
label_assistance_onset_any = np.append(label_assistance_onset_any, 0)

# Action onset: power steering (next action code in 1..4)
label_assistance_onset_ps = (action[0:-1] == 0) & (action[1:] > 0) & (action[1:] <= 4)
label_assistance_onset_ps = np.append(label_assistance_onset_ps, 0)

# Action onset: gesture (next action code in 5..8)
label_assistance_onset_g = (action[0:-1] == 0) & (action[1:] >= 5) & (action[1:] <= 8)
label_assistance_onset_g = np.append(label_assistance_onset_g, 0)


## Filter Data Set
Our goal: remove samples that are immediately around positive onset_g events

Motivation:
- onset_g happens at one sample.  However, the kinematics will be similar for the samples surrounding this event
- Fix 1: We will just remove the samples immediately following the event
- Fix 2: We will turn the 10 samples prior to the event into positive examples


In [None]:
# Boolean to tell us whether to keep the samples (1 = keep, 0 = remove)
keep = np.ones(shape=label_assistance_onset_g.shape)
# The event locations are found once, before any labels are changed below
events = np.where(label_assistance_onset_g)
for e in events[0]:
    # Remove the 25 samples immediately after the trigger event
    keep[e+1:e+26] = 0
    # Switch negative class labels before the event to positive.
    # The start is clamped at 0: for an event within the first 10 samples,
    # a negative start would be counted from the end of the array and the
    # slice would select nothing.
    label_assistance_onset_g[max(e-10, 0):e] = 1

# Indices of the original data set to keep
indices = np.where(keep > 0)[0]


## Tree classifier

In [None]:
# Only keep a subset of the available data: the rows listed in indices
# Inputs: position + velocity features
ins = inputs_pos_vel[indices,:]
# Outputs: gesture-onset labels
outs = label_assistance_onset_g[indices]
# Time stamps of the same rows (x axis of the plots over time)
time_modified = time[indices]

## Build Classifier

In [None]:
classifier = DecisionTreeClassifier(max_leaf_nodes=????, criterion='log_loss') 

# Class weights can better balance the tree
#, class_weight='balanced')

## Raw training experiment
Use all data for training / evaluation

In [None]:
# Train the one decision tree and evaluate with the same data
classifier.fit(ins, outs)
# Class probabilities for every training sample; column 1 holds the
# positive class probabilities
prob_a = classifier.predict_proba(ins)

In [None]:
# Default decision boundary at .5 (column 1 holds the positive class probabilities)
pred_main = prob_a[:,1] >= 0.5

# Compute and display the corresponding confusion matrix.
# The predictions computed just above are the ones to evaluate; the name
# `pred` is not assigned until the cross-validation section.
confusion = confusion_matrix(outs, pred_main)
confusion

In [None]:
# Loss goes down as complexity of the tree goes up
#  NOTE: this is not the case if we are overfitting
log_loss(outs, prob_a[:,1])

In [None]:
# Plot probabilities and true label over time
plt.figure()
plt.plot(time_modified, ???, 'r')
plt.plot(time_modified, ???, 'b')
plt.plot(time_modified, outs*.1-.2, 'k')
plt.ylabel('probability')
plt.xlabel('time (s)')

plt.xlim((50,70))
plt.legend(['other', 'movement'])

In [None]:
# Element #1 is the positive prob
fpr, tpr, thresholds = roc_curve(outs, prob_a[:,1])

In [None]:
fig, ax = plt.subplots()
ax.plot(????)
ax.plot(????)
ax.plot(????)

ax.invert_xaxis()
ax.set_xlabel('threshold')
ax.set_ylabel('fraction')
ax.set_xlim([1,0])
ax.legend(['TPR', 'FPR', 'distance'])

In [None]:
# ROC Curve
fig = plt.figure(figsize=(5,4.5))
plt.plot(????)
plt.plot([0,1], [0,1], 'r--')
plt.xlabel('FPR')
plt.ylabel('TPR')
#fig.axes[0].set_aspect('equal', 'box')
auc(fpr, tpr)

## Cross-validation experiment
The video will take you through this section.  

Remember that proba[:,1] are the true class probabilities!

In [None]:
# Data are badly imbalanced.  Need a lot of cv folds to see a positive result
proba = cross_val_predict(???)

# Including n_jobs allows parallel computation

In [None]:
# Positive label threshold default is 0.5
# Display the corresponding confusion matrix
pred = proba[:,1] >= 

confusion = 
confusion

In [None]:
# Plot probabilities and true label over time
plt.figure()
plt.plot(time_modified, ???, 'r')
plt.plot(time_modified, ???, 'b')
plt.plot(time_modified, outs*.1-.2, 'k')
plt.ylabel('probability')
plt.xlabel('time (s)')

plt.xlim((50,70))
plt.legend(['other', 'movement'])

In [None]:
# Evaluate.  Use proba[:,1]
log_loss(???)

In [None]:
# Element #1 is the positive prob
fpr, tpr, thresholds = roc_curve(????)

In [None]:
# TPR / FPR curves
fig, ax = plt.subplots()
ax.plot(thresholds, ???, color='b')
ax.plot(thresholds, ???, color='r')
ax.plot(thresholds, ???, color='g')
ax.invert_xaxis()
ax.set_xlabel('threshold')
ax.set_ylabel('fraction')
ax.set_xlim([1,0])
ax.legend(['TPR', 'FPR', 'distance'])

In [None]:
# ROC curve
fig = plt.figure(figsize=(5,4.5))
plt.plot(????)
plt.plot([0,1], [0,1], 'r--')
plt.xlabel('FPR')
plt.ylabel('TPR')
#fig.axes[0].set_aspect('equal', 'box')
auc(fpr, tpr)

## Render learned tree

In [None]:
export_graphviz(classifier, ???) 

In [None]:
!dot -Tpng -o tree_model.png tree_model.dot

In [None]:
!pwd

In [None]:
display.Image("tree_model.png")