# Machine Learning Practice
## Module 12: Ensembles: Subspaces and Boosting: Baby Example

Andrew H. Fagg (andrewhfagg@gmail.com)

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve, auc
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier

# Default figure parameters, applied one key at a time from a single table
_figure_defaults = {
    'figure.figsize': (5,5),
    'font.size': 10,
    'legend.fontsize': 10,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'figure.constrained_layout.use': True,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
}
for _key, _value in _figure_defaults.items():
    plt.rcParams[_key] = _value

In [None]:
# Mount Google Drive.
# The google.colab package only exists inside Google Colab; when the notebook
# is run elsewhere (e.g. with a local data path) skip the mount instead of
# failing on the import.
try:
    from google.colab import drive
except ImportError:
    drive = None
else:
    drive.mount('/content/drive')

In [None]:
# From book
# Pipeline component: select subsets of attributes
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks out the columns listed in ``attribs``.

    ``transform`` indexes its input with ``attribs`` and hands back the
    ``.values`` of the result.
    """

    def __init__(self, attribs):
        self.attribs = attribs

    def fit(self, x, y=None):
        """Nothing is learned; return this transformer."""
        return self

    def transform(self, X):
        """Return ``X[self.attribs].values``."""
        selected = X[self.attribs]
        return selected.values

# Pipeline component: New transformer class: drop all rows that contain invalid values
class DataSampleDropper(BaseEstimator, TransformerMixin):
    """Pipeline step that discards every row holding a missing value."""

    def __init__(self):
        """No configuration is needed."""

    def fit(self, x, y=None):
        """Nothing is learned; return this transformer."""
        return self

    def transform(self, X):
        """Return ``X`` without the rows for which any entry is NA."""
        cleaned = X.dropna(how='any')
        return cleaned

# Pipeline component: Compute derivatives
class ComputeDerivative(BaseEstimator, TransformerMixin):
    """Pipeline step that adds a forward-difference derivative column for
    each field in ``attribs``.

    Parameters
    ----------
    attribs : list of column names to differentiate
    dt : time step that each difference is divided by
    prefix : string prepended to a field name to form the new column name
    """

    def __init__(self, attribs, dt=1.0, prefix='d_'):
        self.attribs = attribs
        self.dt = dt
        self.prefix = prefix

    def fit(self, x, y=None):
        """Nothing is learned; return this transformer."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one new column per field in ``attribs``.

        Row ``i`` of the new column holds ``(value[i+1] - value[i]) / dt``.
        The final row has no successor, so its derivative is set to 0 and the
        new column has the same length as the input.
        """
        Xout = X.copy()
        for field in self.attribs:
            # Extract the values for this field
            values = Xout[field].values
            n_samples = len(values)

            # Same length as the original data; the last entry stays 0.
            # (Previously the result of np.append() was discarded, which left
            # the column one sample short and produced a NaN in the last row.)
            deriv = np.zeros(n_samples, dtype=float)
            if n_samples > 1:
                # Difference between subsequent values, scaled by the time step
                deriv[:-1] = (values[1:] - values[0:-1]) / self.dt

            # Name of the new field
            name = self.prefix + field

            # Assign the raw array so that rows are matched by position, not
            # by index label (a Series would be aligned on X's index).
            Xout[name] = deriv
        return Xout

In [None]:
## Support for identifying kinematic variable columns
def get_kinematic_properties(data):
    '''
    Return, in their original order, the names found by iterating over
    data (its column names, for a DataFrame) that end in _x, _y or _z
    '''
    # Kinematic fields are recognised purely by their axis suffix
    axis_suffix = re.compile("_[xyz]$")

    kinematic = []
    for column in list(data):
        if axis_suffix.search(column):
            kinematic.append(column)
    return kinematic

def position_fields_to_position_and_velocity_fields(fields, prefix='d_'):
    '''
    Build the combined column list: the given position columns first,
    followed by one velocity column name (prefix + position name) for
    each of them. The input list is not modified.
    '''
    velocity_fields = []
    for name in fields:
        velocity_fields.append(prefix + name)
    return fields + velocity_fields

In [None]:
def plot_probs(outs, proba):
    '''
    Report and plot the quality of probabilistic binary predictions.

    Prints the confusion matrix (threshold 0.5), the log loss and the AUC,
    and draws two figures: TPR / FPR / (TPR - FPR) as a function of the
    decision threshold, and the ROC curve.

    :param outs: true binary labels (0/False = negative, 1/True = positive)
    :param proba: two-column array of class probabilities as produced by
                  predict_proba: column 0 = negative class, column 1 =
                  positive class
    '''
    # predict_proba orders its columns by sorted class label, so the
    # probability of the positive class (label 1) is column 1. This is the
    # score that confusion_matrix / roc_curve must be compared against;
    # using column 0 inverts the predictions and the ROC curve.
    score_pos = proba[:,1]

    pred = score_pos >= 0.5
    confusion = confusion_matrix(outs, pred)
    print("Confusion:", confusion)

    # Evaluate
    print("log loss: ", log_loss(outs, proba))

    # TPR/FPR plot
    fpr, tpr, thresholds = roc_curve(outs, score_pos)
    fig, ax = plt.subplots()
    ax.plot(thresholds, tpr, color='b')
    ax.plot(thresholds, fpr, color='r')
    ax.plot(thresholds, tpr - fpr, color='g')
    # Thresholds are returned in decreasing order; flip the axis to match
    ax.invert_xaxis()
    ax.set_xlabel('threshold')
    ax.set_ylabel('fraction')
    ax.legend(['TPR', 'FPR', 'distance'])

    # ROC plot
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, color='b')
    # Chance-level diagonal
    ax.plot([0,1], [0,1], 'r--')
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.set_aspect('equal', 'box')
    print("AUC:", auc(fpr, tpr))
    
def plot_time_series(time_modified, outs, proba, xlim=(50,70)):
    '''
    Plot both predicted class probabilities and the true label against time.

    :param time_modified: time stamp of each sample (x axis)
    :param outs: true binary labels; drawn scaled and shifted below zero
                 (outs*.1-.2) so they do not overlap the probabilities
    :param proba: two-column array of class probabilities as produced by
                  predict_proba: column 0 = negative class, column 1 =
                  positive class
    :param xlim: (min, max) time window to display; None shows the full
                 range chosen by matplotlib
    '''
    plt.figure()
    plt.plot(time_modified, proba[:,0], 'r')
    plt.plot(time_modified, proba[:,1], 'g')
    plt.plot(time_modified, outs*.1-.2, 'k')
    plt.ylabel('probability')
    plt.xlabel('time (s)')

    if xlim is not None:
        plt.xlim(xlim)
    # Legend entries follow plotting order: column 0 is the negative class,
    # column 1 is the positive class
    plt.legend(['no movement', 'movement'])

## Load and organize data

In [None]:
# Note: you may need to change this path to get to the data
# Local copy of the data set (active)
fname = '/home/fagg/datasets/baby1/subject_k2_w10.csv'
# Alternative location when the data is read from a mounted Google Drive
#fname = '/content/drive/MyDrive/MLP_2021/datasets/baby1/subject_k2_w10.csv'
# Read the whole CSV file into a DataFrame
infant_data = pd.read_csv(fname)

In [None]:
# Raw time stamps and robot-action codes, one entry per row of the data set
time, action = (infant_data[column].values
                for column in ('time', 'sippc_action'))

In [None]:
# Position columns (names ending in _x/_y/_z) ...
fieldsKin = get_kinematic_properties(data=infant_data)
# ... and the same list extended with the matching velocity column names
fieldsKinVel = position_fields_to_position_and_velocity_fields(fields=fieldsKin)

## Create Pipelines

In [None]:
# Pre-processing: add derivative columns for the kinematic fields, then
# discard rows that contain missing values
prepipe = Pipeline(steps=[
    ('derivative', ComputeDerivative(attribs=fieldsKin, dt=.02)),
    ('dropper', DataSampleDropper()),
])

# Position pipe
pipe_pos = Pipeline(steps=[('selector', DataFrameSelector(attribs=fieldsKin))])

# Position + velocity selector
pipe_pos_vel = Pipeline(steps=[('selector', DataFrameSelector(attribs=fieldsKinVel))])

# Robot action
attribs_label = ['sippc_action']
pipe_label = Pipeline(steps=[('selector', DataFrameSelector(attribs=attribs_label))])

# Time
attribs_time = ['time']
pipe_time = Pipeline(steps=[('selector', DataFrameSelector(attribs=attribs_time))])


In [None]:
# Pre-process once; every selector below reads from the same cleaned frame
infant_data2 = prepipe.fit_transform(infant_data)

# Selection
inputs_pos = pipe_pos.transform(infant_data2)
inputs_pos_vel = pipe_pos_vel.transform(infant_data2)
# The label selector yields one column; flatten it to a 1-D vector
action = pipe_label.transform(infant_data2).reshape(-1)
time = pipe_time.transform(infant_data2)

## Create Class Labels

In [None]:
# Any non-zero action code
label_motion = action > 0

# An onset at sample i means: no action at i, and an action code within the
# class-specific range at i+1
_no_action_now = action[0:-1] == 0
_next_action = action[1:]

# Action onset (any action)
label_assistance_onset_any = _no_action_now & (_next_action > 0) & (_next_action <= 8)
# Pad with a trailing 0 so the label vector is as long as action
label_assistance_onset_any = np.append(label_assistance_onset_any, 0)

# Action onset: power steering
label_assistance_onset_ps = _no_action_now & (_next_action > 0) & (_next_action <= 4)
label_assistance_onset_ps = np.append(label_assistance_onset_ps, 0)

# Action onset: gesture
label_assistance_onset_g = _no_action_now & (_next_action >= 5) & (_next_action <= 8)
label_assistance_onset_g = np.append(label_assistance_onset_g, 0)


## Filter Data Set
Our goal: remove the samples that immediately follow each positive onset_g event, and relabel the samples just before each event as positive.

In [None]:
# Boolean to tell us whether to keep the samples
keep = np.ones(shape=label_assistance_onset_g.shape)
# Sample indices of the onset events (computed once, before any relabelling)
events = np.where(label_assistance_onset_g)
for e in events[0]:
    # Remove samples immediately after the trigger event
    keep[e+1:e+26] = 0
    # Switch negative class labels before the event to positive.
    # The start is clamped at 0: for an event within the first 10 samples a
    # negative start index would be counted from the end of the array and
    # select nothing, silently skipping the relabelling.
    label_assistance_onset_g[max(e-10, 0):e] = 1

# Indices of the original data set to keep
indices = np.where(keep > 0)[0]


## Tree classifier

In [None]:
# Only keep a subset of the available data
ins = inputs_pos_vel[indices,:]
outs = label_assistance_onset_g[indices]
time_modified = time[indices]

In [None]:
# Single decision tree, limited to at most 10 leaf nodes
classifier = DecisionTreeClassifier(max_leaf_nodes=10)


In [None]:
# Out-of-fold class probabilities from 10-fold cross-validation
proba = cross_val_predict(estimator=classifier, X=ins, y=outs,
                          cv=10, method='predict_proba')

In [None]:
# Confusion matrix, log loss, threshold curves and ROC for the single tree
plot_probs(outs=outs, proba=proba)

In [None]:
# Distribution of the first probability column ...
plt.figure()
plt.hist(x=proba[:,0], alpha=0.5)
# ... followed by the probabilities over time
plot_time_series(time_modified=time_modified, outs=outs, proba=proba)

## Bagging Classifier
### Subspaces

In [None]:
# Start with small number of leaf nodes and estimators
# NOTE(review): the hyper-parameter values below are starting points chosen to
# make the cell runnable; tune them as needed.
classifier = DecisionTreeClassifier(max_leaf_nodes=10)
# Random subspaces: every tree is trained on all samples (no bootstrap) but
# only on a random half of the features, drawn without replacement
bagging_classifier = BaggingClassifier(classifier, n_estimators=10,
                                       max_samples=1.0, bootstrap=False,
                                       max_features=0.5, bootstrap_features=False,
                                       n_jobs=-1)

# Out-of-fold probabilities for the ensemble; the plots that follow read proba
proba = cross_val_predict(bagging_classifier, ins,  outs, cv=10,
                          method='predict_proba')

In [None]:
# Evaluate whatever predictions are currently held in proba
plot_probs(outs=outs, proba=proba)

In [None]:
# Predicted probabilities and true labels over time
plot_time_series(time_modified=time_modified, outs=outs, proba=proba)

## Random Forests

In [None]:
# Sample size for each is the same as the training set!
# NOTE(review): the hyper-parameter values are starting points chosen to make
# the notebook runnable; tune them as needed.
forest_classifier = RandomForestClassifier(n_estimators=100, max_leaf_nodes=10,
                                           n_jobs=-1)
# cross_val_predict only fits clones of the estimator, so fit this instance on
# the full data set to make feature_importances_ available
forest_classifier.fit(ins, outs)

In [None]:
# Out-of-fold class probabilities for the random forest, then evaluate them
proba = cross_val_predict(estimator=forest_classifier, X=ins, y=outs,
                          cv=10, method='predict_proba')
plot_probs(outs=outs, proba=proba)

In [None]:
# Predicted probabilities and true labels over time
plot_time_series(time_modified=time_modified, outs=outs, proba=proba)

## Feature Importance

In [None]:
# Histogram of the per-feature importances reported by the forest
# (the attribute is only present on an estimator that has been fitted)
fig, ax = plt.subplots()
ax.hist(forest_classifier.feature_importances_)
ax.set_xlabel('Importance')
ax.set_ylabel('Count')

## Boosting

In [None]:

# Base learner: a small decision tree
classifier = DecisionTreeClassifier(max_leaf_nodes=10)
# Boosted ensemble of the base learner.
# NOTE(review): n_estimators and learning_rate are written out at the
# library's default values as a starting point; tune them as needed.
ada_classifier = AdaBoostClassifier(classifier, n_estimators=50,
                                    learning_rate=1.0)

In [None]:
# Out-of-fold class probabilities for the boosted ensemble (folds run in
# parallel on all available cores), then evaluate them
proba = cross_val_predict(estimator=ada_classifier, X=ins, y=outs,
                          cv=10, method='predict_proba', n_jobs=-1)
plot_probs(outs=outs, proba=proba)

In [None]:
# Predicted probabilities and true labels over time
plot_time_series(time_modified=time_modified, outs=outs, proba=proba)