In [107]:
import pandas as pd
import numpy as np
from numpy import genfromtxt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from collections import Counter

In [2]:
data_dir = 'bbdc_2019_Bewegungsdaten/'

In [3]:
def file2mat(filename: str) -> np.ndarray:
    mat = genfromtxt(f'{data_dir}{filename}', delimiter=',')
    return mat

In [4]:
train = pd.read_csv('splits/train.csv')

In [5]:
valid = pd.read_csv('splits/valid.csv')

In [110]:
Counter(train['Label'].values).most_common()

[('v-cut-left-Rfirst', 240),
 ('run', 240),
 ('curve-left-spin-Rfirst', 240),
 ('v-cut-right-Rfirst', 240),
 ('curve-right-spin-Rfirst', 240),
 ('stand-to-sit', 240),
 ('curve-right-spin-Lfirst', 240),
 ('curve-left-spin-Lfirst', 240),
 ('walk', 240),
 ('sit', 240),
 ('v-cut-left-Lfirst', 240),
 ('jump-two-leg', 240),
 ('stand', 240),
 ('sit-to-stand', 240),
 ('v-cut-right-Lfirst', 239),
 ('curve-left-step', 239),
 ('jump-one-leg', 239),
 ('lateral-shuffle-left', 239),
 ('curve-right-step', 237),
 ('lateral-shuffle-right', 237),
 ('stair-down', 234),
 ('stair-up', 234)]

In [6]:
def feature_engineer(mat):
    avg = mat.mean(axis=0).reshape(1, -1)
    min = mat.min(axis=0).reshape(1, -1)
    max = mat.max(axis=0).reshape(1, -1)
    std = mat.std(axis=0).reshape(1, -1)
    return np.concatenate((max, avg, min, std), axis=1).flatten()

In [7]:
train['features'] = train['Datafile'].apply(lambda x: feature_engineer(file2mat(x)))

In [8]:
valid['features'] = valid['Datafile'].apply(lambda x: feature_engineer(file2mat(x)))

In [9]:
train.shape

(5258, 4)

In [111]:
train_features = np.concatenate(train['features'].values).reshape(-1, 19*4)
# train_features /= train_features.max(axis=0)
valid_features = np.concatenate(valid['features'].values).reshape(-1, 19*4)
# valid_features /= valid_features.max(axis=0)

In [102]:
lr = LogisticRegression(solver='liblinear')
lr.fit(np.concatenate(train_features, train['Label'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [103]:
preds_lr = lr.predict(valid_features)
accuracy_score(preds_lr, valid['Label'])

0.45962732919254656

In [109]:
print(classification_report(valid['Label'], preds_lr))

                         precision    recall  f1-score   support

 curve-left-spin-Lfirst       0.46      0.82      0.59        40
 curve-left-spin-Rfirst       0.09      0.03      0.04        80
        curve-left-step       0.00      0.00      0.00        60
curve-right-spin-Lfirst       0.23      0.41      0.29        61
curve-right-spin-Rfirst       0.45      0.53      0.49        60
       curve-right-step       0.58      0.33      0.42        58
           jump-one-leg       0.23      0.80      0.36        40
           jump-two-leg       0.56      0.90      0.69        40
   lateral-shuffle-left       0.31      0.42      0.36        43
  lateral-shuffle-right       0.28      0.17      0.22        40
                    run       0.56      0.15      0.24        60
                    sit       1.00      0.96      0.98        49
           sit-to-stand       0.86      0.98      0.91        49
             stair-down       0.47      0.93      0.62        44
               stair-up 

In [112]:
# pca = PCA()
# pca.fit(np.concatenate((train_features, valid_features), axis=0))

In [113]:
# pca.explained_variance_ratio_ > 1e-3

In [114]:
# train_features = pca.transform(train_features)[:, :10]
# valid_features = pca.transform(valid_features)[:, :10]

In [121]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=4, n_jobs=3)
rfc.fit(train_features, train['Label'])
accuracy_score(rfc.predict(train_features), train['Label'])

0.6494864967668315

In [122]:
preds_rf = rfc.predict(valid_features)
accuracy_score(preds_rf, valid['Label'])

0.4738243123336291

In [96]:
knn = KNeighborsClassifier(n_neighbors=7, n_jobs=3)
knn.fit(train_features, train['Label'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=3, n_neighbors=7, p=2,
           weights='uniform')

In [98]:
preds_knn = knn.predict(valid_features)
accuracy_score(preds_knn, valid['Label'])

0.2688553682342502

In [100]:
gbm = GradientBoostingClassifier(n_estimators=70, max_depth=3)
gbm.fit(train_features, train['Label'])

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=70,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [101]:
preds_gbm = gbm.predict(valid_features)
accuracy_score(preds_gbm, valid['Label'])

0.2803904170363798

## Ensemble

In [19]:
preds = rfc.predict_proba(np.concatenate(valid['features'].values).reshape(-1, 19*4))
preds += gbm.predict_proba(np.concatenate(valid['features'].values).reshape(-1, 19*4))
preds += knn.predict_proba(np.concatenate(valid['features'].values).reshape(-1, 19*4))
# preds += lr.predict_proba(np.concatenate(valid['features'].values).reshape(-1, 19*4))
preds /= 3

In [20]:
preds = np.argmax(preds, axis=1)

In [21]:
preds = [gbm.classes_[x] for x in preds]

In [22]:
accuracy_score(preds, valid['Label'])

0.5536823425022183

In [111]:
valid['Subject'].unique()

array(['Subject02', 'Subject07'], dtype=object)

In [24]:
test = pd.read_csv('bbdc_2019_Bewegungsdaten/challenge.csv')

In [25]:
test['features'] = test['Datafile'].apply(lambda x: feature_engineer(file2mat(x)))

### Investigate discrepancy between train and challenge

In [23]:
np.concatenate(test['features'].values).reshape(-1, 19*3).min(axis=0)

NameError: name 'test' is not defined

In [86]:
np.concatenate(valid['features'].values).reshape(-1, 19*3).min(axis=0)

array([3.28880000e+04, 3.29410000e+04, 3.29080000e+04, 3.29230000e+04,
       3.33370000e+04, 3.22400000e+04, 3.32800000e+04, 2.81600000e+04,
       1.56590000e+04, 2.95360000e+04, 2.95760000e+04, 2.92520000e+04,
       1.78560000e+04, 2.92840000e+04, 2.91260000e+04, 2.91440000e+04,
       2.56040000e+04, 3.15640000e+04, 3.08960000e+04, 3.27275851e+04,
       3.27383598e+04, 3.26433038e+04, 3.27788802e+04, 3.27435188e+04,
       3.18995869e+04, 3.32361236e+04, 2.81206828e+04, 1.55705654e+04,
       2.80662839e+04, 2.68956402e+04, 2.74722177e+04, 1.77837679e+04,
       2.79114388e+04, 2.77576210e+04, 2.68127574e+04, 2.41710691e+04,
       2.75200116e+04, 2.80450006e+04, 2.21700000e+03, 1.20000000e+01,
       1.52440000e+04, 4.60200000e+03, 7.00000000e+00, 1.16660000e+04,
       1.64200000e+04, 8.96100000e+03, 1.50020000e+04, 7.80000000e+01,
       7.50000000e+01, 7.70000000e+01, 1.63040000e+04, 9.70000000e+01,
       9.50000000e+01, 9.50000000e+01, 6.10000000e+01, 6.00000000e+01,
      

In [87]:
np.concatenate(train['features'].values).reshape(-1, 19*3).min(axis=0)

array([32860.        , 32891.        , 32908.        , 32907.        ,
           0.        ,     0.        , 33280.        , 28160.        ,
       15659.        , 29536.        , 29576.        , 29252.        ,
       14432.        , 29117.        , 29126.        , 29120.        ,
       25604.        , 29377.        , 29324.        , 32724.70952125,
       32650.11540648, 32643.30379169, 32561.41713371,     0.        ,
           0.        , 33236.1236102 , 28120.68279922, 15570.56537619,
       28066.28388928, 26895.6401766 , 28169.94053745, 14314.69622528,
       27854.22940818, 24513.13003452, 22573.18094321, 24171.06910246,
       24966.33063701, 24562.41609421,     0.        ,     0.        ,
           0.        ,     0.        ,     0.        ,     0.        ,
        8781.        ,  8600.        , 15002.        ,    79.        ,
          75.        ,    77.        , 12480.        ,    96.        ,
          93.        ,    93.        ,    59.        ,    58.        ,
      

### Make Submission

In [96]:
preds = rfc.predict_proba(np.concatenate(test['features'].values).reshape(-1, 19*3))
preds += gbm.predict_proba(np.concatenate(test['features'].values).reshape(-1, 19*3))
preds += knn.predict_proba(np.concatenate(test['features'].values).reshape(-1, 19*3))
# preds /= 3
preds = np.argmax(preds, axis=1)
preds = [gbm.classes_[x] for x in preds]


In [97]:
test['Label'] = preds
new_test = test.drop('features', axis=1, inplace=True)

In [75]:
new_test.to_csv('submission1.csv', index=False)

In [87]:
test.index

RangeIndex(start=0, stop=1738, step=1)

In [88]:
rfc.classes_

array(['curve-left-spin-Lfirst', 'curve-left-spin-Rfirst',
       'curve-left-step', 'curve-right-spin-Lfirst',
       'curve-right-spin-Rfirst', 'curve-right-step', 'jump-one-leg',
       'jump-two-leg', 'lateral-shuffle-left', 'lateral-shuffle-right',
       'run', 'sit', 'sit-to-stand', 'stair-down', 'stair-up', 'stand',
       'stand-to-sit', 'v-cut-left-Lfirst', 'v-cut-left-Rfirst',
       'v-cut-right-Lfirst', 'v-cut-right-Rfirst', 'walk'], dtype=object)

In [89]:
gbm.classes_

array(['curve-left-spin-Lfirst', 'curve-left-spin-Rfirst',
       'curve-left-step', 'curve-right-spin-Lfirst',
       'curve-right-spin-Rfirst', 'curve-right-step', 'jump-one-leg',
       'jump-two-leg', 'lateral-shuffle-left', 'lateral-shuffle-right',
       'run', 'sit', 'sit-to-stand', 'stair-down', 'stair-up', 'stand',
       'stand-to-sit', 'v-cut-left-Lfirst', 'v-cut-left-Rfirst',
       'v-cut-right-Lfirst', 'v-cut-right-Rfirst', 'walk'], dtype=object)

In [90]:
knn.classes_

array(['curve-left-spin-Lfirst', 'curve-left-spin-Rfirst',
       'curve-left-step', 'curve-right-spin-Lfirst',
       'curve-right-spin-Rfirst', 'curve-right-step', 'jump-one-leg',
       'jump-two-leg', 'lateral-shuffle-left', 'lateral-shuffle-right',
       'run', 'sit', 'sit-to-stand', 'stair-down', 'stair-up', 'stand',
       'stand-to-sit', 'v-cut-left-Lfirst', 'v-cut-left-Rfirst',
       'v-cut-right-Lfirst', 'v-cut-right-Rfirst', 'walk'], dtype=object)

In [92]:
Counter(preds)

Counter({'run': 51,
         'curve-right-spin-Lfirst': 58,
         'stair-up': 50,
         'stair-down': 57,
         'v-cut-right-Lfirst': 53,
         'curve-left-step': 67,
         'curve-right-spin-Rfirst': 57,
         'sit-to-stand': 62,
         'stand': 62,
         'jump-two-leg': 56,
         'lateral-shuffle-left': 49,
         'jump-one-leg': 54,
         'sit': 46,
         'curve-left-spin-Rfirst': 70,
         'v-cut-right-Rfirst': 64,
         'v-cut-left-Rfirst': 58,
         'curve-left-spin-Lfirst': 64,
         'lateral-shuffle-right': 55,
         'stand-to-sit': 46,
         'v-cut-left-Lfirst': 70,
         'curve-right-step': 66,
         'walk': 62})

In [60]:
test['Subject'].unique()

array(['Subject01', 'Subject10', 'Subject14', 'Subject15'], dtype=object)

In [61]:
set(valid['Subject'].unique()).difference(train['Subject'].unique())

{'Subject11', 'Subject16', 'Subject19'}

In [62]:
set(train['Subject'].unique()).difference(valid['Subject'].unique())

{'Subject02',
 'Subject03',
 'Subject04',
 'Subject05',
 'Subject06',
 'Subject07',
 'Subject08',
 'Subject09',
 'Subject12',
 'Subject13',
 'Subject17',
 'Subject18'}