In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds

# 1. Data set

In [2]:
def onehotify(input, label):
    """Encode a nucleotide string and its class label as one-hot tensors.

    The characters A, C, G and T are replaced by the digit characters
    1, 2, 3 and 0 respectively, the string is split into single bytes, each
    byte is parsed as a number and one-hot encoded with depth 4. The
    per-position encodings are then flattened into a single 1-D tensor.

    Args:
        input: String tensor holding the sequence. Presumably it contains only
            the characters A/C/G/T -- TODO confirm against the dataset.
        label: Integer class index; it is one-hot encoded with depth 10.

    Returns:
        A tuple ``(sequence_onehot, label_onehot)``.
    """
    # Nucleotide -> digit character, so the string can be parsed numerically.
    base_to_digit = (('A', '1'), ('C', '2'), ('G', '3'), ('T', '0'))

    digits = input
    for base, digit in base_to_digit:
        digits = tf.strings.regex_replace(digits, base, digit)

    # One numeric class index per position of the sequence.
    positions = tf.strings.to_number(tf.strings.bytes_split(digits))
    indices = tf.cast(positions, tf.uint8)

    # Flatten the (positions, 4) encoding into one long feature vector.
    sequence_onehot = tf.reshape(tf.one_hot(indices, 4), (-1,))
    label_onehot = tf.one_hot(label, 10)
    return sequence_onehot, label_onehot


In [78]:
# Load a subset of `genomics_ood`: the first 10% of the training split and the
# first 1% of the test split (per the original notes: 100,000 training and
# 1,000 test examples).
train_dataset = tfds.load('genomics_ood', split='train[:10%]', as_supervised=True, shuffle_files=True)
test_dataset = tfds.load('genomics_ood', split='test[:1%]', as_supervised=True, shuffle_files=True)

# `shuffle_files=True` only permutes the input shards. Shuffle the individual
# training examples as well so every epoch sees differently composed batches.
# This is done on the raw strings (before one-hot encoding) to keep the
# shuffle buffer small.
SHUFFLE_BUFFER_SIZE = 100000
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE)

# one-hot encoding (run in parallel; element order is preserved by default)
train_dataset = train_dataset.map(onehotify, num_parallel_calls=tf.data.AUTOTUNE)
test_dataset = test_dataset.map(onehotify, num_parallel_calls=tf.data.AUTOTUNE)

# batching
# Each training batch holds 5000 examples, i.e. one optimizer update per 5000
# examples; every epoch iterates over the whole training subset.
train_dataset = train_dataset.batch(5000).prefetch(tf.data.AUTOTUNE)
# Evaluating in batches of 100 avoids one eager model call per test example.
# Metrics are averaged over batches by `test`.
test_dataset = test_dataset.batch(100).prefetch(tf.data.AUTOTUNE)

# 2. Model

In [77]:
# Fully connected classifier: two sigmoid hidden layers of 256 units followed
# by a 10-way softmax output. The input is a flat vector of length 1000
# (per the original note: the length of a one-hot encoded gene sequence).
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1000,)),
    tf.keras.layers.Dense(256, activation='sigmoid'),
    tf.keras.layers.Dense(256, activation='sigmoid'),
    tf.keras.layers.Dense(10, activation='softmax'),
])

# 3. Training

In [69]:
def train_step(model, input, target, loss_function, optimizer):
    """Run one optimization step on a single batch and return its loss.

    Args:
        model: Keras model; called on the batch and updated in place through
            its ``trainable_variables``.
        input: Batch of model inputs.
        target: Batch of targets, passed to ``loss_function`` as the first
            argument.
        loss_function: Callable ``loss_function(target, prediction)`` returning
            the loss to differentiate.
        optimizer: Optimizer whose ``apply_gradients`` is used for the update.

    Returns:
        The loss computed for this batch (before the weight update).
    """
    # Only the forward pass and the loss have to be recorded. Computing the
    # gradients after leaving the context keeps the backward pass itself from
    # being traced by the tape.
    with tf.GradientTape() as tape:
        # training=True makes layers with train/inference differences
        # (e.g. dropout, batch norm) behave correctly if they are added later.
        prediction = model(input, training=True)
        loss = loss_function(target, prediction)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    return loss


def test(model, test_data, loss_function):
    test_accuracy_aggregator = []
    test_loss_aggregator = []

    for (input, target) in test_data:
        prediction = model(input)
        sample_test_loss = loss_function(target, prediction)
        # simple winner-take-all binary accuracy for each sample
        sample_test_accuracy =  np.argmax(target, axis=1) == np.argmax(prediction, axis=1)
        test_loss_aggregator.append(sample_test_loss.numpy())
        test_accuracy_aggregator.append(np.mean(sample_test_accuracy))

    test_loss = np.mean(test_loss_aggregator)
    test_accuracy = np.mean(test_accuracy_aggregator)

    return test_loss, test_accuracy


In [79]:
# NOTE: `model` is created in an earlier cell and is not rebuilt here, so
# re-running this cell continues training the existing weights. Re-run the
# model cell first to start from a fresh initialization.
tf.keras.backend.clear_session()

### Hyperparameters
num_epochs = 20
learning_rate = 0.1
running_average_factor = 0.95

# design of network (model is prev. defined)
loss_f = tf.keras.losses.CategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate)

# Initialize lists for later visualization. Index 0 holds the value measured
# before training; index k holds the value after epoch k.
train_losses = []
test_losses = []
test_accuracies = []

# testing once before we begin
test_loss, test_accuracy = test(model, test_dataset, loss_f)
test_losses.append(test_loss)
test_accuracies.append(test_accuracy)

# check how model performs on train data once before we begin
train_loss = test(model, train_dataset, loss_f)[0]
train_losses.append(train_loss)

# We train for num_epochs epochs.
for epoch in range(num_epochs):
    print(f'Epoch: __ {epoch}')

    # training (and checking in with training)
    running_average = 0.0
    num_steps = 0
    # Loop variables are deliberately not called `input`: at cell level that
    # would shadow the builtin for the rest of the notebook.
    for batch_input, batch_target in train_dataset:
        # float() keeps `train_losses` a list of plain numbers, not tensors.
        train_loss = float(train_step(model, batch_input, batch_target, loss_f, optimizer))
        running_average = running_average_factor * running_average + (1 - running_average_factor) * train_loss
        num_steps += 1

    if num_steps > 0:
        # The average starts at 0, so after only a few steps it is biased
        # towards 0. The weights of the seen losses sum to 1 - factor**steps;
        # dividing by that sum removes the bias.
        running_average /= 1 - running_average_factor ** num_steps
    else:
        running_average = float('nan')  # no training batches were seen
    train_losses.append(running_average)

    # testing
    test_loss, test_accuracy = test(model, test_dataset, loss_f)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)

Epoch: __ 0
Epoch: __ 1
Epoch: __ 2
Epoch: __ 3
Epoch: __ 4
Epoch: __ 5
Epoch: __ 6
Epoch: __ 7
Epoch: __ 8
Epoch: __ 9
Epoch: __ 10
Epoch: __ 11
Epoch: __ 12
Epoch: __ 13
Epoch: __ 14
Epoch: __ 15
Epoch: __ 16
Epoch: __ 17
Epoch: __ 18
Epoch: __ 19


In [80]:
# Test accuracy measured once before training (first entry), then after each epoch.
test_accuracies

[0.1,
 0.124,
 0.153,
 0.178,
 0.189,
 0.197,
 0.194,
 0.198,
 0.198,
 0.199,
 0.211,
 0.229,
 0.255,
 0.271,
 0.287,
 0.295,
 0.309,
 0.313,
 0.314,
 0.321,
 0.324]