In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random
import tensorflow.keras.backend as K
import numpy as np
import matplotlib.pyplot as plt 
from tensorflow.keras import regularizers
from timeit import default_timer as timer
#%matplotlib widget

In [2]:
## check that GPU available
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
### Hyperparams problem
number_of_different_chars=5
sentence_length=5
batch_size=64 #how many episodes for one training step (=parameter update)?

epoch_size=1 #how many training steps to count as 1 epoch

In [4]:
### create dataset: random strings 

#function to create one random string with chars encoded as one-hot vectors. 
#probably there is a more efficient way to do this?
def create_random_string(number_of_different_chars, sentence_length):
    #random_string=[tf.one_hot(tf.random.uniform(shape=(), minval=0, maxval=number_of_different_chars, dtype=tf.int32), number_of_different_chars) for i in range(sentence_length)]
    random_string=[tf.random.uniform(shape=(), minval=0, maxval=number_of_different_chars, dtype=tf.int32) for i in range(sentence_length)]
    print(random_string)
    return tf.convert_to_tensor(random_string)

# https://stackoverflow.com/questions/47318734/on-the-fly-generation-with-dataset-api-tensorflow
dummy_dataset = tf.data.Dataset.from_tensors(0).repeat(batch_size * epoch_size)
dataset = dummy_dataset.map(lambda _: create_random_string(number_of_different_chars, sentence_length))
dataset = dataset.batch(batch_size)

[<tf.Tensor 'random_uniform:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_1:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_2:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_3:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_4:0' shape=() dtype=int32>]


In [5]:
### display some examples from the dataset to check
i=0
for element in dataset:
    print(element[:7])   
    i+=1
    if i == 2:
        break

tf.Tensor(
[[3 2 2 0 4]
 [3 4 2 1 4]
 [2 1 4 2 1]
 [3 4 2 4 3]
 [4 3 0 3 0]
 [3 2 0 1 3]
 [3 1 3 4 2]], shape=(7, 5), dtype=int32)


In [6]:
### Hyperparams neural net
#intermediate_dim=64

In [7]:
### create the neural net (NN1) of agent1 

NN1_input = keras.Input((sentence_length))
e = keras.layers.Embedding(number_of_different_chars, number_of_different_chars)(NN1_input)
NN1_output = keras.layers.Conv1D(number_of_different_chars, (1))(e) #returns the logits!! no softmax

NN1 = keras.Model(NN1_input, NN1_output)
NN1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 5)              25        
_________________________________________________________________
conv1d (Conv1D)              (None, 5, 5)              30        
Total params: 55
Trainable params: 55
Non-trainable params: 0
_________________________________________________________________


In [8]:
### define reward-function, here: (sentence_length - HammingDistance)/sentence_length
def compute_rewards(predicted_string, correct_string):
    #check where predicted_string[i]=correct_string[i]
    character_matches=tf.math.equal(predicted_string, correct_string)
    #convert True, False to 1, 0
    character_matches_as_ints=tf.cast(character_matches, tf.float32)
    #sum to get the reward
    reward=tf.math.reduce_sum(character_matches_as_ints, axis=-1)
    #divide by sentence_length for correct scaling
    return reward/sentence_length

In [9]:
### test reward function
a=tf.constant([[1, 0, 0, 1, 1], [1, 0, 0, 1, 1], [1, 0, 0, 1, 1]], dtype=tf.int32)
b=tf.constant([[0, 1, 0, 1, 0], [1, 0, 0, 1, 1], [1, 0, 0, 1, 0]], dtype=tf.int32)
compute_rewards(a,b)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.4, 1. , 0.8], dtype=float32)>

In [10]:
optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0) #, clipnorm=1

In [11]:
def custom_loss(agent_output, probability_logits, rewards):
    agent_output_one_hot=tf.one_hot(agent_output, number_of_different_chars)
    log_lik = agent_output_one_hot*probability_logits
    return tf.math.reduce_sum(-log_lik*rewards[:, np.newaxis, np.newaxis])

In [12]:
### define a train step (for a single agent)
def train_step(input_batch, NN1):
    with tf.GradientTape() as tape:
        chararacter_probabilites_logits = NN1(input_batch, training=True) #
        
        ## sample from the probabilities to decide what action the agent chose
        # https://stackoverflow.com/questions/39432164/sample-from-a-tensor-in-tensorflow-along-an-axis
        dims = chararacter_probabilites_logits.get_shape().as_list() 
        #(dims = [batch_size, sentence_length, number_of_different_chars])
        N = dims[-1]
        logits = tf.reshape(chararacter_probabilites_logits, [-1, N])
        samples = tf.random.categorical(logits, 1, dtype=tf.int32)
        agent1_output = tf.reshape(samples, dims[:-1])
        

        rewards = compute_rewards(agent1_output, input_batch)  
        ## normalize rewards
        mean = tf.math.reduce_mean(rewards)
        std = tf.math.reduce_std(rewards) if tf.math.reduce_std(rewards) > 0 else 1.
        scaled_rewards = (rewards-mean) / std
        
        loss = custom_loss(agent1_output, chararacter_probabilites_logits, scaled_rewards)

    #retrieve gradients
    grads = tape.gradient(loss, NN1.trainable_weights)
    
    #perform a parameter update
    optimizer.apply_gradients(zip(grads, NN1.trainable_weights))
    
    return rewards

In [13]:
### Training loops
epochs = 10
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    loss_count, epoch_average = 0, 0
    
    for step, input_batch in enumerate(dataset):
        rewards = train_step(input_batch, NN1)
        print("Training reward (for one batch): ", np.mean(rewards))


Start of epoch 0
Training reward (for one batch):  0.15937501

Start of epoch 1
Training reward (for one batch):  0.22500002

Start of epoch 2
Training reward (for one batch):  0.23437501

Start of epoch 3
Training reward (for one batch):  0.33125

Start of epoch 4
Training reward (for one batch):  0.37187505

Start of epoch 5
Training reward (for one batch):  0.421875

Start of epoch 6
Training reward (for one batch):  0.47812504

Start of epoch 7
Training reward (for one batch):  0.665625

Start of epoch 8
Training reward (for one batch):  0.75

Start of epoch 9
Training reward (for one batch):  0.8156251


In [14]:
def compare_input_and_output(input_batch, neural_net, number_of_examples):
    for i in range(number_of_examples):
        print(list(input_batch[i].numpy()), "\t", np.argmax(neural_net(np.array([input_batch[i]])), axis=-1)[0])

In [15]:
compare_input_and_output(input_batch, NN1, 5)

[1, 4, 4, 2, 4] 	 [1 4 4 2 4]
[0, 2, 4, 1, 1] 	 [0 2 4 1 1]
[0, 2, 1, 4, 4] 	 [0 2 1 4 4]
[4, 1, 0, 3, 0] 	 [4 1 0 3 0]
[1, 0, 0, 0, 3] 	 [1 0 0 0 3]


In [16]:
# Computes the average over n samples: how many epochs did it take to reach a reward >= threshold once?
def estimate_average_solving_time(n, threshold=1.):
    sum_of_epochs=0
    for _ in range(n):
        ## create a new neural network
        NN_input = keras.Input((sentence_length))
        e = keras.layers.Embedding(number_of_different_chars, number_of_different_chars)(NN_input)
        NN_output = keras.layers.Conv1D(number_of_different_chars, (1))(e) #returns the logits!! no softmax
        NN = keras.Model(NN_input, NN_output)

        ## train neural network and stop when reward >= threshold
        epoch = 0
        reward = 0
        while reward < threshold:    
            for step, input_batch in enumerate(dataset):
                rewards = train_step(input_batch, NN)
                reward = np.mean(rewards)       
            epoch+=1

        sum_of_epochs+=epoch
        print(epoch)    
    return sum_of_epochs / n

In [17]:
estimate_average_solving_time(50)

20
21
26
18
26
21
19
27
22
24
20
18
17
51
24
18
21
20
21
22
22
21
21
25
17
22
21
27
26
21
24
24
28
22
23
21
25
24
30
25
18
17
21
20
21
23
18
20
26
22


22.62