In [1]:
import os
import numpy as np

import prepare_data

from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier 
from sklearn.decomposition import PCA

In [2]:
# CSV with the pre-computed per-window statistics (the disabled extraction
# cell further down uses window_len=60, matching the "60s" in the file name).
data_path = os.path.join("statistic_features", "stat_features_60s.csv")

# Recording identifiers used throughout the notebook; the cross-validation
# folds are built by splitting this list, so order matters.
patient_list = (
    "002 003 005 007 08a 08b 09a 09b 10a 011 013 014 15a 15b 016 "
    "017 018 019 020 021 022 023 025 026 027 028 029 030 031 032 "
    "033 034 035 036 037 038 040 042 043 044 045 047 048 049 051"
).split()

# Baseline feature columns: one standard-deviation column per axis.
statistics_list = ["std_" + axis for axis in "xyz"]

In [3]:
# Patient-wise 5-fold splitter. Shuffling uses a fixed seed so every
# experiment below is evaluated on exactly the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=5)

# Last expression of the cell: shows the number of splitting iterations.
kf.get_n_splits(patient_list)

5

In [4]:
# Print the held-out patients of every fold, one list per line.
for train_index, test_index in kf.split(patient_list):
    # kf.split yields positions into patient_list; translate them to ids.
    test_patient_list = [patient_list[pos] for pos in test_index]
    print(test_patient_list)

['007', '08a', '09a', '025', '028', '029', '031', '044', '048']
['002', '005', '08b', '021', '026', '027', '032', '034', '049']
['003', '013', '014', '15b', '020', '022', '035', '036', '045']
['09b', '15a', '017', '019', '023', '037', '042', '047', '051']
['10a', '011', '016', '018', '030', '033', '038', '040', '043']


In [9]:
# Forwarded as `n_others_windows` to the prepare_data loaders in every
# experiment below; the RNN section sizes its input as n_others_windows + 1
# time steps. Presumably the number of neighbouring windows attached to each
# sample - confirm in prepare_data. The "Results" notes list the values tried.
n_others_windows = 30

In [12]:
# Disabled one-off step: presumably regenerates the statistics CSV at
# `data_path` from the raw recordings - confirm in prepare_data before use.
# NOTE(review): if re-enabled, "\d" inside the path literal is an invalid
# escape sequence and the backslash is Windows-only; prefer
# os.path.join("ICHI14_dataset", "data").
#prepare_data.save_statistic_features(patient_list, 
#                                   sorce_path="ICHI14_dataset\data",
#                                   save_path=data_path,
#                                   window_len=60)

## 1. Logistic regression
### 1.1 Statistic feature - std

In [10]:
%%time
# Patient-wise 5-fold cross-validation of logistic regression.
# No `statistics_list` is passed, so the loader's default feature columns are
# used (std, per the section heading - confirm in prepare_data).
load_kwargs = dict(data_path=data_path, n_others_windows=n_others_windows)
accuracy_list = []

for train_index, test_index in kf.split(patient_list):
    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[pos] for pos in train_index]
    test_patient_list = [patient_list[pos] for pos in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows(train_patient_list, **load_kwargs)
    X_test, y_test = prepare_data.load_stat_features_others_windows(test_patient_list, **load_kwargs)

    # fit() returns the estimator itself, so construction and training chain.
    model1 = LogisticRegression().fit(X_train, y_train)

    # Train accuracy is printed only as an over-fitting indicator.
    accuracy_train = metrics.accuracy_score(y_train, model1.predict(X_train))
    print("\nAccuracy on train set: ", accuracy_train)

    # Test accuracy of this fold feeds the cross-validated mean.
    y_predict = model1.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy on test set: ", accuracy)
    accuracy_list.append(accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))


Accuracy on train set:  0.7747249615231147
Accuracy on test set:  0.6871151113216485

Accuracy on train set:  0.7635520056381042
Accuracy on test set:  0.7334318277754327

Accuracy on train set:  0.7588885738588035
Accuracy on test set:  0.7685230024213076

Accuracy on train set:  0.7456644079476283
Accuracy on test set:  0.8115375775683751

Accuracy on train set:  0.7570093457943925
Accuracy on test set:  0.7722016651248844

Mean accuracy = 0.7545618368423297
Wall time: 10.4 s


#### 1.1 Results:

8 windows: acc = 0.7292

12 windows: acc = 0.7395

22 windows: acc = 0.7525

30 windows: acc = 0.7541 -  best

32 windows: acc = 0.7545  - best

40 windows: acc = 0.7530

50 windows: acc = 0.7500

60 windows: acc = 0.7454


### 1.2 Statistic feature - std, age, gender

In [14]:
# Baseline std columns extended with the two demographic columns.
statistics_list_age = [*statistics_list, "age", "gender"]

In [15]:
%%time
# Patient-wise 5-fold cross-validation of logistic regression on the
# `statistics_list_age` columns (std plus age and gender).
load_kwargs = dict(
    data_path=data_path,
    statistics_list=statistics_list_age,
    n_others_windows=n_others_windows,
)
accuracy_list = []

for train_index, test_index in kf.split(patient_list):
    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[pos] for pos in train_index]
    test_patient_list = [patient_list[pos] for pos in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows(train_patient_list, **load_kwargs)
    X_test, y_test = prepare_data.load_stat_features_others_windows(test_patient_list, **load_kwargs)

    # fit() returns the estimator itself, so construction and training chain.
    model1 = LogisticRegression().fit(X_train, y_train)

    # Train accuracy is printed only as an over-fitting indicator.
    accuracy_train = metrics.accuracy_score(y_train, model1.predict(X_train))
    print("\nAccuracy on train set: ", accuracy_train)

    # Test accuracy of this fold feeds the cross-validated mean.
    y_predict = model1.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy on test set: ", accuracy)
    accuracy_list.append(accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))


Accuracy on train set:  0.7741719264927666
Accuracy on test set:  0.6822820037105751

Accuracy on train set:  0.764766779777995
Accuracy on test set:  0.7303231151615576

Accuracy on train set:  0.7540983606557377
Accuracy on test set:  0.7789099526066351

Accuracy on train set:  0.7502531788004951
Accuracy on test set:  0.7950911956766494

Accuracy on train set:  0.755856412561092
Accuracy on test set:  0.7720888083371092

Mean accuracy = 0.7517390150985053
Wall time: 23.6 s


#### 1.2 Results:

8 windows: acc = 0.7301

12 windows: acc = 0.7406

22 windows: acc = 0.7517

30 windows: acc = 0.7555 - best

32 windows: acc = 0.7558 - best

40 windows: acc = 0.7541

50 windows: acc = 0.74998

60 windows: acc = 0.7471

#### ±0.05-0.1% accuracy compared to std only


### 1.3 Statistic feature - ptp

In [16]:
# Peak-to-peak feature columns, one per axis.
statistics_list_ptp = ["ptp_" + axis for axis in "xyz"]

In [18]:
%%time
# Patient-wise 5-fold cross-validation of logistic regression on the
# peak-to-peak columns (`statistics_list_ptp`).
load_kwargs = dict(
    data_path=data_path,
    statistics_list=statistics_list_ptp,
    n_others_windows=n_others_windows,
)
accuracy_list = []

for train_index, test_index in kf.split(patient_list):
    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[pos] for pos in train_index]
    test_patient_list = [patient_list[pos] for pos in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows(train_patient_list, **load_kwargs)
    X_test, y_test = prepare_data.load_stat_features_others_windows(test_patient_list, **load_kwargs)

    # fit() returns the estimator itself, so construction and training chain.
    model1 = LogisticRegression().fit(X_train, y_train)

    # Train accuracy is printed only as an over-fitting indicator.
    accuracy_train = metrics.accuracy_score(y_train, model1.predict(X_train))
    print("\nAccuracy on train set: ", accuracy_train)

    # Test accuracy of this fold feeds the cross-validated mean.
    y_predict = model1.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy on test set: ", accuracy)
    accuracy_list.append(accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))


Accuracy on train set:  0.7699826844662906
Accuracy on test set:  0.6873840445269017

Accuracy on train set:  0.763674009317306
Accuracy on test set:  0.7203811101905551

Accuracy on train set:  0.7518199499861072
Accuracy on test set:  0.7827014218009478

Accuracy on train set:  0.743051648475301
Accuracy on test set:  0.802521954514749

Accuracy on train set:  0.753553171170159
Accuracy on test set:  0.7621205256003625

Mean accuracy = 0.7510218113267031
Wall time: 14.5 s


#### 1.3 Results: 

8 windows: acc = 0.7346

12 windows: acc = 0.7438

22 windows: acc = 0.7510

30 windows: acc = 0.7521 - best

32 windows: acc = 0.7514

40 windows: acc = 0.7495

50 windows: acc = 0.7483

60 windows: acc = 0.7459

Slightly worse than std.

### 1.4 Statistic feature - std, ptp

In [19]:
# Standard-deviation columns followed by peak-to-peak columns, x/y/z each.
statistics_list_std_ptp = [
    stat + "_" + axis
    for stat in ("std", "ptp")
    for axis in "xyz"
]

In [21]:
%%time
# Patient-wise 5-fold cross-validation of logistic regression on the combined
# std + peak-to-peak columns (`statistics_list_std_ptp`).
load_kwargs = dict(
    data_path=data_path,
    statistics_list=statistics_list_std_ptp,
    n_others_windows=n_others_windows,
)
accuracy_list = []

for train_index, test_index in kf.split(patient_list):
    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[pos] for pos in train_index]
    test_patient_list = [patient_list[pos] for pos in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows(train_patient_list, **load_kwargs)
    X_test, y_test = prepare_data.load_stat_features_others_windows(test_patient_list, **load_kwargs)

    # fit() returns the estimator itself, so construction and training chain.
    model1 = LogisticRegression().fit(X_train, y_train)

    # Train accuracy is printed only as an over-fitting indicator.
    accuracy_train = metrics.accuracy_score(y_train, model1.predict(X_train))
    print("\nAccuracy on train set: ", accuracy_train)

    # Test accuracy of this fold feeds the cross-validated mean.
    y_predict = model1.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy on test set: ", accuracy)
    accuracy_list.append(accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))


Accuracy on train set:  0.7723286600011171
Accuracy on test set:  0.6975881261595547

Accuracy on train set:  0.763156381204348
Accuracy on test set:  0.7197597348798674

Accuracy on train set:  0.7562656293414838
Accuracy on test set:  0.7746445497630332

Accuracy on train set:  0.7485090581748621
Accuracy on test set:  0.7993695113713127

Accuracy on train set:  0.7598449525307567
Accuracy on test set:  0.7541912097870412

Mean accuracy = 0.7491106263921619
Wall time: 25.5 s


#### 1.4 Results: 

8 windows: acc = 0.7323

12 windows: acc = 0.7412

22 windows: acc = 0.7491

30 windows: acc = 0.7497

32 windows: acc = 0.7507  - best

40 windows: acc = 0.7471

50 windows: acc = 

60 windows: acc = 


## 2. SVM
### 2.1 Statistic features - std

In [49]:
%%time
# Patient-wise 5-fold cross-validation of a linear-kernel SVM (C=0.5).
# No `statistics_list` is passed, so the loader's default feature columns are
# used (std, per the section heading - confirm in prepare_data).
# The recorded run of this cell took about 1 h 10 min.
load_kwargs = dict(data_path=data_path, n_others_windows=n_others_windows)
accuracy_list = []

for train_index, test_index in kf.split(patient_list):
    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[pos] for pos in train_index]
    test_patient_list = [patient_list[pos] for pos in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows(train_patient_list, **load_kwargs)
    X_test, y_test = prepare_data.load_stat_features_others_windows(test_patient_list, **load_kwargs)

    # fit() returns the estimator itself, so construction and training chain.
    model2 = SVC(C=0.5, kernel="linear").fit(X_train, y_train)

    # Train accuracy is printed only as an over-fitting indicator.
    accuracy_train = metrics.accuracy_score(y_train, model2.predict(X_train))
    print("\nAccuracy on train set: ", accuracy_train)

    # Test accuracy of this fold feeds the cross-validated mean.
    y_predict = model2.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy on test set: ", accuracy)
    accuracy_list.append(accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))


Accuracy on train set:  0.7707067839909169
Accuracy on test set:  0.6908018867924528

Accuracy on train set:  0.7642552196034856
Accuracy on test set:  0.7268713204373423

Accuracy on train set:  0.7550686169311571
Accuracy on test set:  0.7606075216972035

Accuracy on train set:  0.7439094132448816
Accuracy on test set:  0.806134126802472

Accuracy on train set:  0.7530405984126077
Accuracy on test set:  0.7747581759557808

Mean accuracy = 0.7518346063370502
Wall time: 1h 9min 59s


#### Results:

8 windows: acc = 0.7262

12 windows: acc = 0.7237

22 windows: acc = 0.7506

30 windows: acc = 0.7518

32 windows: acc = 

40 windows: acc = 

50 windows: acc = 

60 windows: acc = 


## 3. Gradient Boosting Classifier
### 3.1 Statistic features - std

In [41]:
%%time
# Patient-wise 5-fold cross-validation of a gradient boosting classifier
# (50 trees, depth 4). No `statistics_list` is passed, so the loader's default
# feature columns are used (std, per the section heading - confirm in
# prepare_data).
accuracy_list = []
for train_index, test_index in kf.split(patient_list):
    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[i] for i in train_index]
    test_patient_list = [patient_list[i] for i in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows(train_patient_list,
                                                                      data_path=data_path,
                                                                      n_others_windows=n_others_windows)
    X_test, y_test = prepare_data.load_stat_features_others_windows(test_patient_list,
                                                                    data_path=data_path,
                                                                    n_others_windows=n_others_windows)

    # Gradient boosting is stochastic (per-tree seeds, feature permutation at
    # each split). Fix the seed - same value as the KFold splitter - so the
    # accuracies in the results notes can be reproduced on a re-run.
    model3 = GradientBoostingClassifier(n_estimators=50, max_depth=4, random_state=5)
    model3.fit(X_train, y_train)

    # Train accuracy is printed only as an over-fitting indicator.
    y_predict = model3.predict(X_train)
    accuracy_train = metrics.accuracy_score(y_train, y_predict)
    print("\nAccuracy on train set: ", accuracy_train)

    # Test accuracy of this fold feeds the cross-validated mean.
    y_predict = model3.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_predict)
    accuracy_list.append(accuracy)
    print("Accuracy on test set: ", accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))


Accuracy on train set:  0.7860701965723047
Accuracy on test set:  0.6694684234438891

Accuracy on train set:  0.773764579929002
Accuracy on test set:  0.7259048393655958

Accuracy on train set:  0.7659493326069191
Accuracy on test set:  0.760092807424594

Accuracy on train set:  0.7603396933936253
Accuracy on test set:  0.7905539615978813

Accuracy on train set:  0.767028247343208
Accuracy on test set:  0.7531083481349912

Mean accuracy = 0.7398256759933903
Wall time: 22.2 s


#### 30 windows, max_depth=4; n_estimators tried: 30, 35, 45, 50, 55, 60, 100, 80 (the accuracies for 30 windows below are listed in this order)
#### best n_estimators = 50
#### Results:

8 windows: acc = 0.7305

12 windows: acc = 0.7399

22 windows: acc = 0.7429

30 windows: acc = 0.7424, 0.7437, 0.7446, 0.7449 (n_est=50), 0.7445, 0.7443, 0.7433, 0.7437

32 windows: acc = 0.7452, time ~3 min   - best

40 windows: acc = 0.7430, time ~3-5 min

50 windows: acc = 

60 windows: acc = 


## 4. Simple RNN
### 4.1 Statistic features - std

In [6]:

from keras.layers import Dense, Flatten, Dropout, LSTM, Bidirectional
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2

from keras.layers import LSTM, Bidirectional

from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard


Using TensorFlow backend.


In [7]:
np.random.seed(5)

In [10]:
%%time
# Patient-wise 5-fold cross-validation of a small bidirectional LSTM.
# Each fold trains a fresh network with early stopping on a validation split
# of the training data, then records accuracy on the held-out patients.
# No `statistics_list` is passed, so the loader's default feature columns are
# used (std, per the section heading - confirm in prepare_data).

# Imported here so this cell works on its own; needed for clear_session().
from keras import backend as K

accuracy_list = []
for train_index, test_index in kf.split(patient_list):
    # A new model is built every fold. Without this call the previous folds'
    # models stay alive in the backend, so memory use grows fold after fold.
    K.clear_session()

    # Translate fold positions into patient identifiers.
    train_patient_list = [patient_list[i] for i in train_index]
    test_patient_list = [patient_list[i] for i in test_index]

    X_train, y_train = prepare_data.load_stat_features_others_windows_rnn(train_patient_list,
                                                                          data_path=data_path,
                                                                          n_others_windows=n_others_windows)
    X_test, y_test = prepare_data.load_stat_features_others_windows_rnn(test_patient_list,
                                                                        data_path=data_path,
                                                                        n_others_windows=n_others_windows)

    # Input is declared as n_others_windows + 1 time steps of 3 features.
    RNN = Sequential()
    RNN.add(Bidirectional(LSTM(10, dropout=0.2, recurrent_dropout=0.2, input_shape=(n_others_windows + 1, 3) )))
    # Unidirectional alternative that was tried:
    #RNN.add(LSTM(10, dropout=0.2, recurrent_dropout=0.2, input_shape=(n_others_windows + 1, 3) ))
    RNN.add(Dense(1, activation="sigmoid", kernel_initializer="glorot_uniform", kernel_regularizer=l2(0.01)))
    RNN.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Stop once validation accuracy has not improved for 3 consecutive epochs.
    # NOTE(review): the weights evaluated below are those of the last epoch run,
    # not necessarily the best one; consider restore_best_weights=True if the
    # installed Keras version supports it.
    callbacks = [EarlyStopping(monitor='val_acc', patience=3)]
    RNN.fit(X_train, y_train,
            shuffle=True,
            batch_size=32,
            epochs=12,
            validation_split=0.15,  # validation comes from training data, never from the test fold
            verbose=1,
            callbacks=callbacks
           )

    # evaluate() returns [loss, accuracy] because metrics=["accuracy"] above.
    scores = RNN.evaluate(X_test, y_test, verbose=0)
    accuracy = scores[1]
    print("\nAccuracy on test set: ", accuracy, "\n")
    accuracy_list.append(accuracy)

print("\nMean accuracy =", np.mean(accuracy_list))

Train on 14972 samples, validate on 2643 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12

Accuracy on test set:  0.6617924528301887 

Train on 14534 samples, validate on 2565 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12

Accuracy on test set:  0.7268713204373423 

Train on 15050 samples, validate on 2657 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12

Accuracy on test set:  0.7553037608486017 

Train on 14863 samples, validate on 2623 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12

Accuracy on test set:  0.8084229801006191 

Train on 14886 samples, validate on 2627 samples
Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12

Accuracy on test set:  0.755412252555515 


Mean accuracy = 0.7415605533544534
Wall time: 42min 34s


0.7437 - 16 win, bidirect 10 

0.7407- 16 win, bidirect es=1

0.7429 bidirect 16 es=2

0.74343 22w bidirect 10 es=2

0.74754 22w bidirect 10 6ep - best results

0.74156 30w bidirect 10  42 min

0.7392

0.6733   0.6774(7ep, 12lstm) 0.6742(8ep, 12lstm)
0.7265
0.7568
16 w

#### epochs = 6
#### Results:

8 windows: acc = 0.7361, time = 11 min; 0.7369 (9 epochs), 21 min; 0.7350 (4 ep), 0.7331 (7 epochs), 0.7334(10ep, 5lstm)
0.7333(3ep, 5lstm, no dropout), 0.7331(4ep, 5lstm, no dropout)

12 windows: acc = 0.7417, 0.7384(3ep, 5lstm, no dropout), 0.7393

16 windows: 0.7366(8ep, 12 lstm)

22 windows: acc = 0.7475, time = 16 min; 0.7445, 21 min

30 windows: acc = 0.7460 (9 epoch)

32 windows: acc = 

40 windows: acc = 

50 windows: acc = 

60 windows: acc = 


#### PTP, epochs = 9
#### Results:

8 windows: acc = 0.7340
 
12 windows: acc = 

22 windows: acc =

30 windows: acc = 

32 windows: acc = 

40 windows: acc = 

50 windows: acc = 

60 windows: acc = 
