# Autoencoder

In [1]:
from keras.models import Model
from keras.layers import Dense, Input
import numpy as np
import mne
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import h5py

Using TensorFlow backend.


In [7]:
batch_size = 256
epoch_num = 7

channels_num = 61
encoding_dim_vars = [50,40,35,30,25,20,15]

all_eeg_dir = "resting_state/"
all_eeg_names = [x for x in os.listdir(all_eeg_dir) 
                 if x[-5:] == ".vhdr"]
train_eeg_names = all_eeg_names[:3]
print("Dataset contains {} EEG.".format(len(all_eeg_names)))

data = np.zeros((0,channels_num))

for eeg_name in train_eeg_names:
    eeg_data = mne.io.read_raw_brainvision(all_eeg_dir + eeg_name, preload=True).get_data().T
    data = np.append(data, eeg_data, axis = 0)

Dataset contains 32 EEG.
Extracting parameters from resting_state/gorin_310117_rest_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 609999  =      0.000 ...   609.999 secs...
Extracting parameters from resting_state/miloslavov_22_05_pre_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 603399  =      0.000 ...   603.399 secs...
Extracting parameters from resting_state/gorin_rest_eeg_post_31011200.vhdr...
Setting channel info structure...
Reading 0 ... 603699  =      0.000 ...   603.699 secs...


In [3]:
data.shape

(1817100, 61)

### Data Preprocessing and Train-Test-Split 

In [4]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

In [5]:
X_train, X_test, _, _ = train_test_split(scaled_data, scaled_data, test_size=0.1, random_state = 123)

print(X_train.shape)
print(X_test.shape)

(1635390, 61)
(181710, 61)


### Autoencoder with one hidden layer

In [2]:
def create_ae(encoding_dim = 30):
    input_data = Input(shape=(channels_num,))
    encoded = Dense(encoding_dim, activation='relu')(input_data)

    input_encoded = Input(shape=(encoding_dim,))
    decoded = Dense(channels_num, activation='sigmoid')(input_encoded)

    encoder = Model(input_data, encoded, name="encoder")
    decoder = Model(input_encoded, decoded, name="decoder")

    autoencoder = Model(input_data, decoder(encoder(input_data)), name="autoencoder")

    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    #autoencoder.summary()
    return encoder, decoder, autoencoder

In [7]:
score_res = []

for enc_dim in encoding_dim_vars:
    encoder, decoder, autoencoder = create_ae(enc_dim)
    hist = autoencoder.fit(X_train, X_train,
                           epochs=epoch_num,
                           batch_size = batch_size,
                           verbose=1, #Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.
                           validation_data=(X_test, X_test))
    cur_score = (1 + hist.history['val_loss'][-1]) * enc_dim/channels_num
    score_res.append(cur_score)
    print("Score for {} encoding dims: {}".format(enc_dim, cur_score))

Train on 1635390 samples, validate on 181710 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Score for 50 encoding dims: 1.2954696334815263
Train on 1635390 samples, validate on 181710 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Score for 40 encoding dims: 1.036373419705608
Train on 1635390 samples, validate on 181710 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Score for 35 encoding dims: 0.906815053485011
Train on 1635390 samples, validate on 181710 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Score for 30 encoding dims: 0.777283570259299
Train on 1635390 samples, validate on 181710 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Score for 25 encoding dims: 0.6479065991363896
Train on 1635390 samples, validate on 181710 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
Score for 20 encoding dims: 0.

In [8]:
score_res

[1.2954696334815263,
 1.036373419705608,
 0.90681505348501101,
 0.77728357025929895,
 0.64790659913638959,
 0.51820092454094935,
 0.38866106105353382]

#### Вывод:
Для метрики оценивания наиболее оптимальным вариантом является автоэнкодер с 15 скрытыми нейронами(так как на оценку очень сильно влияет уменьшение размерности). В то же время автоэнкодеры с большим количеством скрытых нейронов показывают лучшее MSE. 

Обучим модель с 15 скрытыми нейронами на большем количестве ЭЭГ.

In [8]:
for eeg_name in all_eeg_names[3:8]:
    eeg_data = mne.io.read_raw_brainvision(all_eeg_dir + eeg_name, preload=True).get_data().T
    data = np.append(data, eeg_data, axis = 0)

Extracting parameters from resting_state/zavrin_open_eyes_eeg_15021500.vhdr...
Setting channel info structure...
Reading 0 ... 301999  =      0.000 ...   301.999 secs...
Extracting parameters from resting_state/gorbacheva_03021300_rest_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 625249  =      0.000 ...   625.249 secs...
Extracting parameters from resting_state/glebko_2103_pre_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 643749  =      0.000 ...   643.749 secs...
Extracting parameters from resting_state/zavrin_15021500_eyesclosed_post_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 319299  =      0.000 ...   319.299 secs...
Extracting parameters from resting_state/2103_kozunova_post_eeg.vhdr...
Setting channel info structure...
Reading 0 ... 603199  =      0.000 ...   603.199 secs...


In [9]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

X_train, X_test, _, _ = train_test_split(scaled_data, scaled_data, test_size=0.1, random_state = 123)

print(X_train.shape)
print(X_test.shape)

(3879540, 61)
(431060, 61)


In [10]:
encoder, decoder, autoencoder = create_ae(15)

autoencoder.fit(X_train, X_train,
                epochs=epoch_num,
                batch_size = batch_size,
                verbose=1, #Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.
                validation_data=(X_test, X_test))

Train on 3879540 samples, validate on 431060 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x130294b38>

In [11]:
encoder.save('encoder.p')

In [12]:
decoder.save('decoder.p')

Possible experiments/improvments:

* encoding dim
* optimizer function
* loss function
* more training  
* batch size