# Import data

In [29]:
import pandas as pd

df_nonB = pd.read_csv('./data/genomes_nonB.csv')
df_nonB.shape

(103476, 6)

In [30]:
df_nonB.head()

Unnamed: 0.1,Unnamed: 0,names,subtypes,seqs,len_seqs,recomb
0,0,AY529677,C,atgagagtgatggggatacagaggaattgtcaacagtggtggatat...,2613,0
1,1,HQ595761,C,atgagagtgacggggatacggaagaattgtcaacaatggtggatat...,2535,0
2,2,HQ595746,C,atgagagtgatggggataacgaggaattgtcaacaatggtggatat...,2517,0
3,3,HM623585,C,atgagagtgagggggacatggaggaattatccacaatggtggatat...,2541,0
4,4,HM623566,C,atgagagtgaaggggatgcagaggaattgtccactatggtggatat...,2538,0


# Convert genomes to trigrams

In [31]:
sequences = df_nonB['seqs'].values

In [32]:
from nltk import ngrams
import numpy as np

corpus = []
for seq in sequences:
    in_tri = np.array([''.join(i) for i in ngrams(seq, 3)])
    corpus.append(in_tri)

In [33]:
sentences = [' '.join(x) for x in corpus]

y = df_nonB['recomb'].values

# Train test split

In [34]:
from sklearn.model_selection import train_test_split

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences,
                                                y, test_size=0.25,
                                                random_state=100, stratify=y)

# Tokenize the ngrams

In [35]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

# Padding to max 3000

In [36]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 3000

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0, :])

[54 23 40 ...  0  0  0]


# Models

In [37]:
import keras

METRICS = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.AUC(name='auc')]

In [38]:
import tensorflow as tf

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc', 
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True)

## Baseline model

In [39]:
from keras.models import Sequential
from keras import layers

embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[METRICS])

model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 3000, 50)          83250     
_________________________________________________________________
flatten_3 (Flatten)          (None, 150000)            0         
_________________________________________________________________
dense_13 (Dense)             (None, 10)                1500010   
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 11        
Total params: 1,583,271
Trainable params: 1,583,271
Non-trainable params: 0
_________________________________________________________________


In [40]:
history = model.fit(X_train, y_train,
                    epochs=30,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    callbacks = [early_stopping],
                    batch_size=256)

loss, accuracy, auc = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
print("Training AUC: {:.4f}".format(auc))
loss, accuracy, auc = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
print("Testing AUC:  {:.4f}".format(auc))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 77607 samples, validate on 25869 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping
Training Accuracy: 0.9945
Training AUC: 0.9994
Testing Accuracy:  0.9795
Testing AUC:  0.9968


## Add a conv1d

In [47]:
embedding_dim = 50

METRICS = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.AUC(name='auc')]



model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 3000, 50)          83250     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 2996, 128)         32128     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 10)                1290      
_________________________________________________________________
dense_22 (Dense)             (None, 1)                 11        
Total params: 116,679
Trainable params: 116,679
Non-trainable params: 0
_________________________________________________________________


In [48]:
history = model.fit(X_train, y_train,
                    epochs=30,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    callbacks = [early_stopping],
                    batch_size=256)

loss, accuracy, auc = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
print("Training AUC: {:.4f}".format(auc))
loss, accuracy, auc = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
print("Testing AUC:  {:.4f}".format(auc))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 77607 samples, validate on 25869 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Restoring model weights from the end of the best epoch.
Epoch 00015: early stopping
Training Accuracy: 0.9944
Training AUC: 0.9996
Testing Accuracy:  0.9894
Testing AUC:  0.9987


In [49]:
from sklearn.metrics import classification_report

y_pred = model.predict_classes(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     15034
           1       0.99      0.99      0.99     10835

    accuracy                           0.99     25869
   macro avg       0.99      0.99      0.99     25869
weighted avg       0.99      0.99      0.99     25869

