In [2]:
import numpy as np
import pandas as pd
import gensim
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences


from word2vec import classification_tools

Using TensorFlow backend.


In [3]:
# Seed value for the stochastic steps of this analysis.
# NOTE(review): verify that it is actually passed to every step that needs it.
RANDOM_STATE = 42

# File handed to gensim's Word2Vec loader further down.
# NOTE(review): both paths below are absolute and machine-specific; the
# notebook will not run elsewhere without editing them.
model_path = '/home/pierre/riken/word2vec/prot_vec_model.model'
# Tab-separated table read with pandas; the `sequences` and `is_allergenic`
# columns are used downstream.
data_path = '/home/pierre/riken/data/riken_data/complete_from_xlsx.tsv'

As input, we want each sequence represented as a list of token indexes.

# Data import and preprocessing

In [4]:
# Read the tab-separated table, attach each sequence's length, and keep only
# the rows whose sequence is at least 50 characters long.
df = (
    pd.read_csv(data_path, sep='\t')
    .assign(seq_len=lambda frame: frame['sequences'].apply(len))
    .loc[lambda frame: frame['seq_len'] >= 50, :]
)

# Raw sequences (features) and allergenicity labels (target) as numpy arrays.
X = df['sequences'].values
y = df['is_allergenic'].values

# Presumably splits every sequence into 3-character tokens — confirm against
# classification_tools.ProteinTokenizer.
tokenizer = classification_tools.ProteinTokenizer(token_size=3)
X_tokens = tokenizer.transform(X)

# Per-sequence lengths of the retained rows (variable name kept as-is).
lenghts = df['seq_len'].values

In [30]:
# Load the saved Word2Vec model from disk.
# NOTE: gensim's loader unpickles the file, so only load models from a trusted source.
model = gensim.models.Word2Vec.load(model_path)



In [31]:
# Build (1) integer index sequences for every tokenized protein and (2) the
# matching embedding matrix whose row i holds the Word2Vec vector of the token
# with index i.
#
# Index 0 is reserved for out-of-vocabulary tokens and keeps an all-zero
# vector. `pad_sequences` also pads with 0 by default, so padding and unknown
# tokens share that row.
UNKNOWN_TOKEN = '_UNKNOWN'
MAX_SEQ_LEN = 300

labels_idx = {UNKNOWN_TOKEN: 0}
embeddings = []

for sentence in X_tokens:
    sentence_embed = []
    for token in sentence:
        if token not in model.wv:
            token = UNKNOWN_TOKEN
        # First sighting of a token assigns it the next free index.
        sentence_embed.append(labels_idx.setdefault(token, len(labels_idx)))
    embeddings.append(sentence_embed)

# Take the vector width from the model instead of hard-coding it, and fill the
# matrix row by row using the stored index. This does not depend on dict
# iteration order, leaves `labels_idx` intact (including the unknown entry),
# and also works when no token is in the vocabulary.
matrix_embeddings = np.zeros((len(labels_idx), model.vector_size))
for token, idx in labels_idx.items():
    if token != UNKNOWN_TOKEN:
        matrix_embeddings[idx] = model.wv[token]

# Keras defaults: shorter sequences are left-padded with 0, longer ones are
# truncated from the front.
embeddings = pad_sequences(embeddings, maxlen=MAX_SEQ_LEN)

In [32]:
# Hold out 30% of the samples for evaluation.
# - random_state makes the split reproducible across kernel restarts.
# - stratify keeps the class ratio identical in both splits, which matters
#   because the two classes are imbalanced.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    embeddings, y,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Model Architecture

In [41]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import Adam
import keras.metrics

from keras.callbacks import TensorBoard, ModelCheckpoint

In [42]:
# Binary classifier: frozen pre-trained embeddings -> 2 stacked LSTMs -> sigmoid.
# Both embedding dimensions are read from the weight matrix, so the layer
# always matches the matrix even if the Word2Vec vector size changes.
vocab_size, embedding_dim = matrix_embeddings.shape

lstm_nn = Sequential()
# Embedding weights come from Word2Vec and are not updated during training.
lstm_nn.add(Embedding(vocab_size, output_dim=embedding_dim,
                      weights=[matrix_embeddings], trainable=False))
# First LSTM returns the full sequence so that the second LSTM can consume it.
lstm_nn.add(LSTM(10, return_sequences=True))
lstm_nn.add(Dropout(0.5))
# Second LSTM returns only its last output.
lstm_nn.add(LSTM(10))
# Single sigmoid unit: probability of the positive class.
lstm_nn.add(Dense(1, activation='sigmoid'))

lstm_nn.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [43]:
# Print the layer-by-layer overview of the network (output shapes and parameter counts).
lstm_nn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 100)         817400    
_________________________________________________________________
lstm_13 (LSTM)               (None, None, 10)          4440      
_________________________________________________________________
dropout_7 (Dropout)          (None, None, 10)          0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 10)                840       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 822,691
Trainable params: 5,291
Non-trainable params: 817,400
_________________________________________________________________


In [44]:
# TensorBoard event files go to ./logs4.
tb = TensorBoard(log_dir='./logs4')
# The checkpoint gets its own file path (previously it pointed at the
# TensorBoard directory) and is registered in `callbacks` below (previously it
# was created but never used, so nothing was saved).
ckpt = ModelCheckpoint(filepath='./lstm_nn_checkpoint.h5')

# NOTE(review): the held-out test split doubles as validation data here, so the
# validation curves are not independent of the final evaluation.
lstm_nn.fit(Xtrain, ytrain,
            batch_size=128,
            epochs=49,
            validation_data=(Xtest, ytest),
            callbacks=[tb, ckpt])

Train on 8729 samples, validate on 3741 samples
Epoch 1/49
Epoch 2/49
Epoch 3/49
Epoch 4/49
Epoch 5/49
Epoch 6/49
Epoch 7/49
Epoch 8/49
Epoch 9/49
Epoch 10/49
Epoch 11/49
Epoch 12/49
Epoch 13/49
Epoch 14/49
Epoch 15/49
Epoch 16/49
Epoch 17/49
Epoch 18/49
Epoch 19/49
Epoch 20/49
Epoch 21/49
Epoch 22/49
Epoch 23/49
Epoch 24/49
Epoch 25/49
Epoch 26/49
Epoch 27/49
Epoch 28/49
Epoch 29/49
Epoch 30/49
Epoch 31/49
Epoch 32/49
Epoch 33/49
Epoch 34/49
Epoch 35/49
Epoch 36/49
Epoch 37/49
Epoch 38/49
Epoch 39/49
Epoch 40/49
Epoch 41/49
Epoch 42/49
Epoch 43/49
Epoch 44/49
Epoch 45/49
Epoch 46/49
Epoch 47/49
Epoch 48/49
Epoch 49/49


<keras.callbacks.History at 0x7fb708ec7d30>

In [45]:
!ls logs

ls: cannot access 'logs': No such file or directory


In [47]:
# With a sigmoid output layer, `predict` already returns the positive-class
# probabilities; `predict_proba` is deprecated and removed in recent Keras versions.
ypred = lstm_nn.predict(Xtest)

In [51]:
from sklearn.metrics import classification_report, roc_auc_score

# Turn predicted probabilities into hard labels at the 0.5 threshold and
# report per-class precision / recall / F1.
predicted_labels = ypred >= 0.5
report = classification_report(ytest, predicted_labels)
print(report)

# Threshold-independent ranking quality, computed on the raw probabilities.
auc_score = roc_auc_score(ytest, ypred)
print('ROC AUC SCORE: ', auc_score)

             precision    recall  f1-score   support

      False       0.89      0.98      0.94      3148
       True       0.81      0.37      0.51       593

avg / total       0.88      0.89      0.87      3741

ROC AUC SCORE:  0.7809005316151372
