<a href="https://colab.research.google.com/github/PetraB42/DSpracticum2020/blob/main/E_coli_promoters_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
!pip install biopython



In [None]:
import urllib.request
from pathlib import Path
from Bio import SeqIO
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

## Converting data from FASTA to text files

In [None]:
classes = ['nonpromoters', 'promoters']
sets = ['train', 'valid']

# Fetch one FASTA file per (class, split) pair into the working directory,
# keeping the remote file name as the local file name.
for class_name in classes:
    for split in sets:
        fasta_name = f"e_coli_{class_name}_{split}.fa"
        urllib.request.urlretrieve(
            "https://raw.githubusercontent.com/simecek/dspracticum2020/master/lecture_08/assignment/e_coli_promoters/" + fasta_name,
            fasta_name,
        )

In [None]:
# Create the data/<split>/<class> directory tree; existing directories are left as they are.
for class_name in classes:
    for split in sets:
        target_dir = Path("data") / split / class_name
        target_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Write every FASTA record to its own text file, data/<split>/<class>/<record id>.txt.
for c in classes:
    for s in sets:
        with open(f"e_coli_{c}_{s}.fa", "r") as fr:
            for record in SeqIO.parse(fr, "fasta"):
                # Named `record_id` so the builtin `id()` is not shadowed in the
                # notebook's global namespace.
                record_id = record.id
                with open(f"data/{s}/{c}/{record_id}.txt", "w") as fw:
                    # Characters of the sequence are joined with single spaces,
                    # so each one becomes a separate whitespace-delimited token.
                    fw.write(" ".join(str(record.seq)))


## Reading data

In [None]:
batch_size = 128

# Labelled text dataset read from data/train/; `class_names` fixes which
# sub-directory maps to which label (position in `classes`).
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='data/train/',
    class_names=classes,
    batch_size=batch_size,
)

Found 6791 files belonging to 2 classes.


In [None]:
# Labelled text dataset read from data/valid/, with the same class order and
# batch size as the training set.
raw_valid_ds = tf.keras.preprocessing.text_dataset_from_directory(
    directory='data/valid/',
    class_names=classes,
    batch_size=batch_size,
)

Found 750 files belonging to 2 classes.


In [None]:
# Maps whitespace-separated tokens to integer ids.
vectorize_layer = TextVectorization(output_mode='int')

# Text-only view of the training set (labels dropped). It is a lazy map, so
# keeping the name available costs nothing.
train_text = raw_train_ds.map(lambda x, y: x)

# The vocabulary is fixed and known up front, so it is set directly instead of
# being learned with adapt(): set_vocabulary() replaces whatever adapt() would
# have computed, which made the adapt() pass over the training data wasted work.
# The argument is passed positionally because the keyword name of this
# parameter differs between TensorFlow releases.
vectorize_layer.set_vocabulary(np.asarray(['a', 'c', 't', 'g', 'n']))

In [None]:
def vectorize_text(text, label):
    """Turn a batch of raw text into integer token ids; pass the label through.

    A trailing axis is added to `text`, the result is run through
    `vectorize_layer`, and 2 is subtracted from every id.
    NOTE(review): the offset of 2 presumably removes the layer's reserved
    leading indices so that vocabulary tokens start at 0 -- confirm.

    Returns:
        A `(token_ids - 2, label)` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids - 2, label

# Apply the same vectorization to both splits.
train_ds, valid_ds = (
    split_ds.map(vectorize_text) for split_ds in (raw_train_ds, raw_valid_ds)
)

## Model training

In [None]:
# one-hot encoding
onehot_layer = keras.layers.Lambda(lambda x: tf.one_hot(tf.cast(x,'int64'), 4))

In [None]:
# Recurrent classifier: learned token embedding -> two stacked LSTMs -> one
# sigmoid unit giving the probability of the positive class.
#
# Alternative tried earlier, kept for reference (noted by the author as giving
# the best accuracy during fitting):
#   Embedding -> LSTM(128, return_sequences=True)
#             -> Bidirectional(GRU(32)) -> Dense(1, activation="sigmoid")
# The stacked-LSTM layout below was noted as giving the best final accuracy;
# that note was recorded with the previous 2-unit tanh output layer.

# Size of the embedding table = number of distinct token ids fed to the model.
# The vectorizer is given a 5-token vocabulary ('a', 'c', 't', 'g', 'n'); this
# value is unrelated to the batch size.
vocab_size = 5

model_lstm = tf.keras.Sequential([
    keras.layers.Embedding(vocab_size, 150, input_length=150),
    keras.layers.LSTM(128, return_sequences=True),
    keras.layers.LSTM(128, return_sequences=False),
    # binary_crossentropy expects one probability in [0, 1] per example, so the
    # head is a single sigmoid unit (tanh outputs lie in [-1, 1] and two units
    # do not match the scalar 0/1 labels).
    keras.layers.Dense(1, activation="sigmoid")])

model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
epochs = 10

# Train the recurrent model, evaluating on the validation split after each epoch.
history = model_lstm.fit(train_ds, validation_data=valid_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Convolutional classifier: one-hot input, three Conv1D/BatchNorm/MaxPool
# stages with 32, 16 and 4 filters, then dropout, global average pooling and a
# single sigmoid output unit.
cnn_layers = [onehot_layer]
for n_filters in (32, 16, 4):
    cnn_layers += [
        keras.layers.Conv1D(n_filters, kernel_size=6, data_format='channels_last', activation='relu'),
        keras.layers.BatchNormalization(),
        keras.layers.MaxPooling1D(),
    ]
cnn_layers += [
    keras.layers.Dropout(0.3),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(1, activation="sigmoid"),
]

model_cnn = tf.keras.Sequential(cnn_layers)

model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
epochs = 10

# Train the convolutional model, evaluating on the validation split after each epoch.
history = model_cnn.fit(train_ds, validation_data=valid_ds, epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
