In [None]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds

from tensorflow.keras import Model
from tensorflow.keras import layers

import matplotlib.pyplot as plt

In [None]:
# Lookup table: nucleotide letter -> digit character used as its class index.
translation = {
    'A': '0',
    'T': '1',
    'G': '2',
    'C': '3',
}

def preprocessData(seq):
    """Turn a nucleotide string tensor into a flat one-hot vector.

    Meant to be used as a dataset map function. Every base listed in
    `translation` is replaced by its digit, the string is split into single
    bytes, the bytes are parsed as int32 indices and those are one-hot
    encoded with depth 4.

    Args:
        seq: string tensor with the sequence (assumed to contain only the
            letters A, T, G and C -- TODO confirm against the dataset).

    Returns:
        A 1-D uint8 tensor: the one-hot encoding flattened with `(-1,)`.
    """
    digits = seq
    for base, number in translation.items():
        digits = tf.strings.regex_replace(digits, base, number)

    indices = tf.strings.to_number(
        tf.strings.bytes_split(digits),
        out_type=tf.dtypes.int32,
    )
    encoded = tf.one_hot(indices, 4, dtype=tf.uint8)

    return tf.reshape(encoded, (-1,))
   
def preprocessTarget(tensor):
    """One-hot encode a class label over 10 classes as a flat uint8 vector.

    Args:
        tensor: numeric label tensor; it is cast to uint8 before encoding.

    Returns:
        A uint8 tensor holding the depth-10 one-hot code, flattened with
        `(-1,)`.
    """
    label = tf.cast(tensor, tf.uint8)
    # Emitting uint8 directly yields the same 0/1 values as encoding in the
    # default float dtype and casting afterwards.
    encoded = tf.one_hot(label, 10, dtype=tf.uint8)
    return tf.reshape(encoded, (-1,))

# Load a slice of the genomics_ood dataset and build the input pipelines.
train_samples = 100000
test_samples = 1000
# Number of individual samples held in the shuffle buffer of the training set.
shuffle_buffer_size = 10000

train_ds, test_ds = tfds.load(
    'genomics_ood',
    split=[f'train[:{train_samples}]', f'test[:{test_samples}]'],
    as_supervised=True,
)


def _encode_example(data, target):
    """Map one (sequence, label) pair to its one-hot encoded form."""
    return preprocessData(data), preprocessTarget(target)


# The encoding is deterministic, so cache it after the first pass; otherwise
# the string processing is repeated for every sample in every epoch.
train_ds = train_ds.map(_encode_example).cache()
test_ds = test_ds.map(_encode_example).cache()

# Shuffle individual samples *before* batching so that each epoch sees
# differently composed batches. Shuffling after batching would only permute
# batches whose contents never change.
train_ds = train_ds.shuffle(buffer_size=shuffle_buffer_size).batch(1000).prefetch(1)
# The test loss is averaged over all test batches, so no shuffling is needed.
test_ds = test_ds.batch(100).prefetch(1)

In [None]:
class Model(tf.keras.Model):
    """Fully connected classifier: two sigmoid hidden layers and a softmax output.

    The base class is written as `tf.keras.Model` instead of the imported name
    `Model`. This class reuses that name, so `class Model(Model)` would inherit
    from the previous definition of itself every time the cell is re-run.
    """

    def __init__(self):
        super().__init__()
        # Two hidden layers with 256 sigmoid units each.
        self.hidden_layer_1 = layers.Dense(units=256,
                                           activation=tf.keras.activations.sigmoid)

        self.hidden_layer_2 = layers.Dense(units=256,
                                           activation=tf.keras.activations.sigmoid)

        # Output layer: 10 softmax units, created without a bias term.
        self.output_layer = layers.Dense(units=10,
                                         activation=tf.keras.activations.softmax,
                                         use_bias=False)

    def call(self, x):
        """Forward pass: both hidden layers in order, then the output layer."""
        x = self.hidden_layer_1(x)
        x = self.hidden_layer_2(x)
        return self.output_layer(x)
    
def train_step(model, input, target, loss_function, optimizer):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: callable model exposing `trainable_variables`.
        input: batch of inputs passed to `model`.
        target: targets handed to `loss_function` along with the prediction.
        loss_function: callable invoked as `loss_function(target, prediction)`.
        optimizer: object whose `apply_gradients` receives the
            (gradient, variable) pairs.

    Returns:
        The loss of this batch, computed before the update is applied.
    """
    # Only the forward pass and the loss need to be recorded on the tape.
    with tf.GradientTape() as tape:
        batch_loss = loss_function(target, model(input))

    # Read the variables after the forward pass has run.
    weights = model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))
    return batch_loss

def test(model, test_data, loss_function):
    # test over complete test data

    test_loss_aggregator = []

    for (input, target) in test_data:
        prediction = model(input)
        sample_test_loss = loss_function(target, prediction)
        test_loss_aggregator.append(sample_test_loss.numpy())

    test_loss = np.mean(test_loss_aggregator)

    return test_loss

In [None]:
# Set up the model, loss and optimizer, then train and record the losses.

tf.keras.backend.clear_session()

### Hyperparameters
num_epochs = 250
learning_rate = 0.01
running_average_factor = 0.95

# Initialize the model.
model = Model()
# Loss: categorical cross entropy. Check out 'tf.keras.losses'.
cce = tf.keras.losses.CategoricalCrossentropy()
# Optimizer: stochastic gradient descent with the learning rate set above.
# Check out 'tf.keras.optimizers'.
optimizer = tf.keras.optimizers.SGD(learning_rate)

# Loss history for later visualization. Entry 0 belongs to the untrained
# model, every further entry to one epoch. Values are stored as plain Python
# floats so both lists hold a single type and do not keep tensors around.
train_losses = []
test_losses = []

print('Start testing')

# Evaluate once on the test data before training begins.
test_loss = test(model, test_ds, cce)
test_losses.append(float(test_loss))

print('Finished test testing')

# Evaluate once on the training data before training begins.
train_loss = test(model, train_ds, cce)
train_losses.append(float(train_loss))

print('Finished train testing')

#---------------------------------------------------------------------------
# training of the network
print('Start training')

for epoch in range(num_epochs):
    print(f'Epoch: {epoch} ')

    # Training: the value recorded per epoch is an exponential running
    # average of the batch losses, restarted from 0 at every epoch.
    running_average = 0
    for (batch_input, batch_target) in train_ds:
        train_loss = train_step(model, batch_input, batch_target, cce, optimizer)
        running_average = (running_average_factor * running_average
                           + (1 - running_average_factor) * train_loss)
    train_losses.append(float(running_average))

    # Testing
    test_loss = test(model, test_ds, cce)
    test_losses.append(float(test_loss))

In [None]:
# Visualize the training and test loss in one plot.
# Both lists get one entry per epoch, preceded by one entry for the
# evaluation done before training, so the x-axis counts epochs.

fig, ax = plt.subplots()
line1, = ax.plot(train_losses)
line2, = ax.plot(test_losses)
ax.set_title("Training and test loss")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend((line1, line2), ("training", "test"))
plt.show()

In [None]:
def preprocess_2(data):
    """Eagerly encode (sequence, label) pairs and wrap them in a dataset.

    Each sequence is encoded with `preprocessData` (flat one-hot, uint8) and
    each label is cast to uint8. Unlike `preprocessTarget`, the label is NOT
    one-hot encoded here.

    The encoded items are collected in Python lists and stacked once at the
    end. Concatenating onto a growing tensor inside the loop would copy all
    previous rows on every iteration.

    Args:
        data: iterable of (sequence, label) pairs. All encoded sequences must
            have the same length so that they can be stacked.

    Returns:
        A `tf.data.Dataset` yielding (encoded_sequence, label) pairs.

    Raises:
        ValueError: if `data` yields no pairs.
    """
    sequences = []
    labels = []

    for seq, label in data:
        sequences.append(preprocessData(seq))
        labels.append(tf.cast(label, tf.uint8))

    if not sequences:
        raise ValueError("preprocess_2 needs at least one (sequence, label) pair")

    # Stacking adds the leading sample axis that the dataset is sliced along.
    return tf.data.Dataset.from_tensor_slices(
        (tf.stack(sequences), tf.stack(labels))
    )