# Exercise 4

In [1]:
%reload_ext autoreload
import numpy as np
import pandas as pd
import os

from tqdm import tqdm

from sklearn.model_selection import LeaveOneOut, KFold, TimeSeriesSplit

from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB


In [2]:
# Load the pre-computed window features from the CSV file.
Folder_window = os.path.join('data', 'windowFrames.csv')
windowFrame = pd.read_csv(Folder_window)

# Index the rows by their (parsed) timestamp.
windowFrame = windowFrame.set_index(['time'])
windowFrame.index = pd.to_datetime(windowFrame.index)

# Remove testing frames.
# A boolean mask is used instead of drop(<index labels>): the time index is not
# guaranteed to be unique, and dropping by label would also delete every
# non-testing row that happens to share a timestamp with a testing row.
# .copy() makes the result an independent frame, so that columns can be added
# to it later without a SettingWithCopyWarning.
windowFrame = windowFrame[windowFrame['label'] != 'testing'].copy()

In [3]:
# Show the column names available after loading and filtering.
windowFrame.columns

Index(['alpha_min', 'alpha_max', 'alpha_median', 'alpha_std', 'alphaG_min',
       'alphaG_max', 'alphaG_median', 'alphaG_std', 'beta_min', 'beta_max',
       'beta_median', 'beta_std', 'betaG_min', 'betaG_max', 'betaG_median',
       'betaG_std', 'count_min', 'count_max', 'count_median', 'count_std',
       'gamma_min', 'gamma_max', 'gamma_median', 'gamma_std', 'gammaG_min',
       'gammaG_max', 'gammaG_median', 'gammaG_std', 'lux_min', 'lux_max',
       'lux_median', 'lux_std', 'label', 'subject'],
      dtype='object')

**Extra:** The classifiers below are fed NumPy arrays rather than DataFrames, so the frame is converted first.

In [4]:
# Full frame as a NumPy array (object dtype, because it mixes numbers and strings).
np_windowFrame = windowFrame.to_numpy()

# Select the features and the target by column NAME rather than by position.
# Positional slicing ([:, :-2] / [:, -2]) only works while 'label' and
# 'subject' are the last two columns, so it breaks as soon as further feature
# columns are appended to windowFrame and this cell is run again.
data = windowFrame.drop(['label', 'subject'], axis=1).to_numpy() # all data without label and subject columns
label = windowFrame['label'].to_numpy() # just the column label with the labels

## a. Build test-train-split and classifiers

In [5]:
# Fixed seed for the classifiers that use randomness internally, so that a
# "Restart & Run All" reproduces the same scores and confusion matrices.
RANDOM_STATE = 42

# (estimator, display name) pairs that are evaluated by split_clf.
clfs = [
    (GaussianNB(), 'Gaussian Nayve Bayes'),
    (KNeighborsClassifier(n_neighbors=10), 'Nearest Neighbors'),
    (DecisionTreeClassifier(max_depth=15, random_state=RANDOM_STATE), "Decision Tree"),
    (RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                            random_state=RANDOM_STATE), "Random Forest")
]

In [6]:
def split_clf(splitter, num_splits, df, lb): #df = dataframe, lb = label array
    """Cross-validate every classifier in the global ``clfs`` list and print a report.

    For each train/test split produced by ``splitter``, every classifier is
    re-fitted on the training rows and evaluated on the held-out rows. The
    per-split accuracies are averaged and the confusion matrices are summed
    over all splits.

    Parameters
    ----------
    splitter : object with a ``split(X)`` method yielding ``(train_idx, test_idx)``
        For example ``LeaveOneOut()``, ``KFold(...)`` or ``TimeSeriesSplit(...)``.
    num_splits : int
        Number of splits ``splitter`` yields for ``df``; sizes the score table
        and the progress bar.
    df : numpy.ndarray
        Feature matrix, one row per sample (indexed with integer index arrays).
    lb : numpy.ndarray
        Class labels aligned with the rows of ``df``.

    Returns
    -------
    None
        The report (name, mean accuracy, summed confusion matrix) is printed
        for each classifier.
    """
    class_names = ['sitting', 'standing', 'walking']
    scores = np.zeros((len(clfs), num_splits))
    # One independent accumulator per classifier (no aliased inner lists).
    total_cm = np.zeros((len(clfs), len(class_names), len(class_names)), dtype=np.int64) #cm = confusion matrix
    for split_idx, (train_idx, test_idx) in tqdm(enumerate(splitter.split(df)), total=num_splits):
        train_data, test_data = df[train_idx], df[test_idx]
        train_label, test_label = lb[train_idx], lb[test_idx]
        for clf_idx, (clf, name) in enumerate(clfs):
            clf.fit(train_data, train_label)
            # Predict once and reuse the result for both metrics.
            predictions = clf.predict(test_data)
            # `labels` must be passed by keyword: it is keyword-only in
            # current scikit-learn releases. Fixing the label order keeps
            # the matrix 3x3 even when a split lacks one of the classes.
            total_cm[clf_idx] += confusion_matrix(test_label, predictions, labels=class_names)
            # Accuracy = fraction of correctly predicted test samples.
            scores[clf_idx, split_idx] = np.mean(predictions == test_label)
    mean_accuracy = np.mean(scores, axis=1)
    for (clf, name), mean_acc, cm in zip(clfs, mean_accuracy, total_cm):
        print(name) # for each classifier
        print('Mean accuracy:', mean_acc) # mean accuracy
        print('Confusion Matrix\n [Sitting, Standing, Walking]\n ', cm, '\n') #confusion_matrix

## b. Leave One Out

In [7]:
# Leave-one-out holds out one sample per split, so the number of splits
# passed to split_clf is the number of samples.
split_clf(LeaveOneOut(), len(label), data, label)

100%|██████████| 1126/1126 [00:47<00:00, 23.80it/s]

Gaussian Nayve Bayes
Mean accuracy: 0.9351687388987566
Confusion Matrix
 [Sitting, Standing, Walking]
  [[370   5  37]
 [  6 340   0]
 [ 25   0 343]] 

Nearest Neighbors
Mean accuracy: 0.9538188277087034
Confusion Matrix
 [Sitting, Standing, Walking]
  [[371  12  29]
 [  0 346   0]
 [  3   8 357]] 

Decision Tree
Mean accuracy: 0.9591474245115453
Confusion Matrix
 [Sitting, Standing, Walking]
  [[390   2  20]
 [  3 343   0]
 [ 21   0 347]] 

Random Forest
Mean accuracy: 0.9591474245115453
Confusion Matrix
 [Sitting, Standing, Walking]
  [[380   7  25]
 [  1 344   1]
 [  9   3 356]] 






## c. Other train_control
- ###  K-Fold cross-validation

Leave-One-Out is the special case of K-Fold with `n_splits = len(label)` (here `len(label) = 1126`).

In [8]:
# 10-fold cross-validation on the original feature set.
n_splits = 10
kfold_splitter = KFold(n_splits=n_splits)
split_clf(kfold_splitter, n_splits, data, label)

100%|██████████| 10/10 [00:00<00:00, 18.75it/s]

Gaussian Nayve Bayes
Mean accuracy: 0.9130214917825539
Confusion Matrix
 [Sitting, Standing, Walking]
  [[360   5  47]
 [ 14 332   0]
 [ 32   0 336]] 

Nearest Neighbors
Mean accuracy: 0.9228192161820481
Confusion Matrix
 [Sitting, Standing, Walking]
  [[350  26  36]
 [  1 345   0]
 [ 11  13 344]] 

Decision Tree
Mean accuracy: 0.9209307838179519
Confusion Matrix
 [Sitting, Standing, Walking]
  [[367   2  43]
 [  5 341   0]
 [ 39   0 329]] 

Random Forest
Mean accuracy: 0.9272360935524653
Confusion Matrix
 [Sitting, Standing, Walking]
  [[357  17  38]
 [  2 344   0]
 [ 22   3 343]] 






- ### Time Series Split

In [9]:
# Time-series split (10 splits) on the original feature set.
n_splits = 10
time_series_splitter = TimeSeriesSplit(n_splits=n_splits)
split_clf(time_series_splitter, n_splits, data, label)

100%|██████████| 10/10 [00:00<00:00, 25.76it/s]

Gaussian Nayve Bayes
Mean accuracy: 0.8186274509803922
Confusion Matrix
 [Sitting, Standing, Walking]
  [[301   5   0]
 [ 70 276   0]
 [110   0 258]] 

Nearest Neighbors
Mean accuracy: 0.7156862745098038
Confusion Matrix
 [Sitting, Standing, Walking]
  [[287  19   0]
 [118 228   0]
 [132  21 215]] 

Decision Tree
Mean accuracy: 0.7833333333333332
Confusion Matrix
 [Sitting, Standing, Walking]
  [[281  25   0]
 [ 63 283   0]
 [133   0 235]] 

Random Forest
Mean accuracy: 0.8617647058823529
Confusion Matrix
 [Sitting, Standing, Walking]
  [[302   4   0]
 [ 12 334   0]
 [125   0 243]] 






## d. Add more features manually
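For every window and every axis (alpha, beta, gamma), take the absolute difference between the median with gravity and the median without gravity, `g_<axis>_median = |<axis>G_median - <axis>_median|`. The new feature `orientation` encodes which axis has the largest difference:

if `g_alpha_median >= g_beta_median` and `g_alpha_median >= g_gamma_median` then `orientation = 0`

else if `g_beta_median >= g_alpha_median` and `g_beta_median >= g_gamma_median` then `orientation = 1`

else `orientation = 2`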

In [10]:
# New column -> (median with gravity, median without gravity)
gravity_pairs = {
    'g_alpha_median': ('alphaG_median', 'alpha_median'),
    'g_beta_median': ('betaG_median', 'beta_median'),
    'g_gamma_median': ('gammaG_median', 'gamma_median'),
}
# Absolute difference between the two medians, one new column per axis.
for new_column, (with_gravity, without_gravity) in gravity_pairs.items():
    windowFrame[new_column] = (windowFrame[with_gravity] - windowFrame[without_gravity]).abs()
windowFrame.head()

Unnamed: 0_level_0,alpha_min,alpha_max,alpha_median,alpha_std,alphaG_min,alphaG_max,alphaG_median,alphaG_std,beta_min,beta_max,...,gammaG_std,lux_min,lux_max,lux_median,lux_std,label,subject,g_alpha_median,g_beta_median,g_gamma_median
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-02 11:27:33+00:00,3632.0,6643.0,4685.0,1283.491128,6400.0,13121.0,8035.0,2571.444517,-1976.0,124.0,...,2976.416046,15666.0,22000.0,20000.0,1915.086322,sitting,Enes,3350.0,1528.0,7260.0
2019-10-02 11:27:34+00:00,-5608.0,2909.0,-1406.0,2209.106239,3650.0,10604.0,7315.0,1903.687849,-7044.0,4782.0,...,2333.917672,0.0,22000.0,13000.0,8669.028018,sitting,Enes,8721.0,2637.0,2047.0
2019-10-02 11:27:35+00:00,-5616.0,-433.0,-3931.5,1670.491396,2209.0,3739.0,3195.5,385.667241,-203.0,4128.0,...,177.769395,0.0,0.0,0.0,0.0,sitting,Enes,7127.0,4312.0,4229.0
2019-10-02 11:27:36+00:00,-999.0,672.0,119.5,392.131221,2203.0,3901.0,3345.5,451.056246,-1244.0,410.0,...,281.100675,0.0,0.0,0.0,0.0,sitting,Enes,3226.0,2769.5,8800.0
2019-10-02 11:27:37+00:00,-319.0,494.0,32.0,187.219151,2467.0,3420.0,3008.0,233.135713,-601.0,604.0,...,375.65121,0.0,0.0,0.0,0.0,sitting,Enes,2976.0,1604.5,9204.0


In [11]:
# Per-axis gravity differences computed in the previous cell.
g_alpha = windowFrame['g_alpha_median']
g_beta = windowFrame['g_beta_median']
g_gamma = windowFrame['g_gamma_median']

# One boolean mask per axis: True where that axis is at least as large as both others.
conditions = [
    (g_alpha >= g_beta) & (g_alpha >= g_gamma),
    (g_beta >= g_alpha) & (g_beta >= g_gamma),
    (g_gamma >= g_alpha) & (g_gamma >= g_beta),
]
choices = [0, 1, 2] # 0 = device side-ways, 1 = device upwards, 2 = device lying
# np.select uses the first condition that matches, so ties go to the lower
# code; rows that match no condition get the default value 1.
windowFrame['orientation'] = np.select(conditions, choices, default=1)
windowFrame.head()

Unnamed: 0_level_0,alpha_min,alpha_max,alpha_median,alpha_std,alphaG_min,alphaG_max,alphaG_median,alphaG_std,beta_min,beta_max,...,lux_min,lux_max,lux_median,lux_std,label,subject,g_alpha_median,g_beta_median,g_gamma_median,orientation
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-02 11:27:33+00:00,3632.0,6643.0,4685.0,1283.491128,6400.0,13121.0,8035.0,2571.444517,-1976.0,124.0,...,15666.0,22000.0,20000.0,1915.086322,sitting,Enes,3350.0,1528.0,7260.0,2
2019-10-02 11:27:34+00:00,-5608.0,2909.0,-1406.0,2209.106239,3650.0,10604.0,7315.0,1903.687849,-7044.0,4782.0,...,0.0,22000.0,13000.0,8669.028018,sitting,Enes,8721.0,2637.0,2047.0,0
2019-10-02 11:27:35+00:00,-5616.0,-433.0,-3931.5,1670.491396,2209.0,3739.0,3195.5,385.667241,-203.0,4128.0,...,0.0,0.0,0.0,0.0,sitting,Enes,7127.0,4312.0,4229.0,0
2019-10-02 11:27:36+00:00,-999.0,672.0,119.5,392.131221,2203.0,3901.0,3345.5,451.056246,-1244.0,410.0,...,0.0,0.0,0.0,0.0,sitting,Enes,3226.0,2769.5,8800.0,2
2019-10-02 11:27:37+00:00,-319.0,494.0,32.0,187.219151,2467.0,3420.0,3008.0,233.135713,-601.0,604.0,...,0.0,0.0,0.0,0.0,sitting,Enes,2976.0,1604.5,9204.0,2


In [12]:
# Feature matrix: every column except the target and the subject name.
feature_frame = windowFrame.drop(['label', 'subject'], axis=1)
data_new = feature_frame.to_numpy()
# Target vector: the activity label of each window.
label_new = windowFrame['label'].to_numpy()

### Compare to previous exercise

- Leave One Out with new feature orientation

In [13]:
# Leave-one-out on the extended feature set. The number of splits is taken
# from label_new, the array that is actually evaluated, instead of from the
# label array of the earlier experiment.
split_clf(LeaveOneOut(), len(label_new), data_new, label_new)

100%|██████████| 1126/1126 [00:45<00:00, 24.53it/s]

Gaussian Nayve Bayes
Mean accuracy: 0.9369449378330373
Confusion Matrix
 [Sitting, Standing, Walking]
  [[368   5  39]
 [  6 340   0]
 [ 21   0 347]] 

Nearest Neighbors
Mean accuracy: 0.9564831261101243
Confusion Matrix
 [Sitting, Standing, Walking]
  [[374  11  27]
 [  0 346   0]
 [  3   8 357]] 

Decision Tree
Mean accuracy: 0.9609236234458259
Confusion Matrix
 [Sitting, Standing, Walking]
  [[388   2  22]
 [  3 343   0]
 [ 17   0 351]] 

Random Forest
Mean accuracy: 0.9582593250444049
Confusion Matrix
 [Sitting, Standing, Walking]
  [[377   5  30]
 [  0 346   0]
 [ 11   1 356]] 






- K-Fold with new feature orientation

In [14]:
# 10-fold cross-validation on the extended feature set.
n_splits = 10
kfold_splitter = KFold(n_splits=n_splits)
split_clf(kfold_splitter, n_splits, data_new, label_new)

100%|██████████| 10/10 [00:00<00:00, 20.05it/s]

Gaussian Nayve Bayes
Mean accuracy: 0.917501580278129
Confusion Matrix
 [Sitting, Standing, Walking]
  [[358   5  49]
 [ 14 332   0]
 [ 25   0 343]] 

Nearest Neighbors
Mean accuracy: 0.9228271175726928
Confusion Matrix
 [Sitting, Standing, Walking]
  [[349  26  37]
 [  1 345   0]
 [ 10  13 345]] 

Decision Tree
Mean accuracy: 0.9333833754740833
Confusion Matrix
 [Sitting, Standing, Walking]
  [[376   2  34]
 [  8 338   0]
 [ 31   0 337]] 

Random Forest
Mean accuracy: 0.9360856510745892
Confusion Matrix
 [Sitting, Standing, Walking]
  [[368   5  39]
 [  2 343   1]
 [ 23   2 343]] 






- Time Series Split with new feature orientation

In [15]:
# Time-series split (10 splits) on the extended feature set.
n_splits = 10
time_series_splitter = TimeSeriesSplit(n_splits=n_splits)
split_clf(time_series_splitter, n_splits, data_new, label_new)

100%|██████████| 10/10 [00:00<00:00, 26.65it/s]

Gaussian Nayve Bayes
Mean accuracy: 0.8343137254901961
Confusion Matrix
 [Sitting, Standing, Walking]
  [[301   5   0]
 [ 66 280   0]
 [ 98   0 270]] 

Nearest Neighbors
Mean accuracy: 0.7176470588235293
Confusion Matrix
 [Sitting, Standing, Walking]
  [[286  20   0]
 [118 228   0]
 [131  19 218]] 

Decision Tree
Mean accuracy: 0.7970588235294118
Confusion Matrix
 [Sitting, Standing, Walking]
  [[281  25   0]
 [ 53 293   0]
 [129   0 239]] 

Random Forest
Mean accuracy: 0.7872549019607844
Confusion Matrix
 [Sitting, Standing, Walking]
  [[298   8   0]
 [ 75 271   0]
 [132   2 234]] 






## e. Recursive Feature Elimination

In [16]:
# Feature names in the same column order as data_new.
column_names = windowFrame.drop(['label', 'subject'], axis=1).columns
# Seeded so that the selected feature set is reproducible between runs.
clf = DecisionTreeClassifier(max_depth=15, random_state=42)
# n_features_to_select must be passed by keyword: it is keyword-only in
# current scikit-learn releases. One feature is eliminated per iteration
# (step=1) until 10 remain.
rfe = RFE(clf, n_features_to_select=10, step=1)
rfe.fit(data_new, label_new)
# Keep the names of the features flagged as selected in rfe.support_.
columns = [column_name for support, column_name in zip(rfe.support_, column_names) if support]
columns

['alpha_std',
 'alphaG_min',
 'alphaG_std',
 'count_max',
 'gamma_std',
 'gammaG_median',
 'gammaG_std',
 'lux_min',
 'lux_max',
 'lux_median']

# Exercise 5
## a. Export Classifier

In [17]:
from sklearn_porter import Porter

In [18]:
# Train the tree that gets exported on the full extended data set
# (seeded so that the exported model is reproducible).
clf = DecisionTreeClassifier(max_depth=15, random_state=42)
clf.fit(data_new, label_new)

# Transpile the fitted tree to JavaScript source code.
porter = Porter(clf, language='js')
export = porter.export(embed_data=True)

# Make sure the target directory exists, and let the context manager close
# the file even if writing fails.
export_path = "U5-clf_js/decision_tree.js"
os.makedirs(os.path.dirname(export_path), exist_ok=True)
with open(export_path, "w") as f:
    f.write(export)