In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random
import tensorflow.keras.backend as K
import numpy as np
import matplotlib.pyplot as plt 
from tensorflow.keras import regularizers
from timeit import default_timer as timer
#%matplotlib widget

In [2]:
## check that GPU available
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
### Hyperparams problem
number_of_different_chars=3
number_of_different_intermediate_chars=10 # (!!)
sentence_length=5
batch_size=64 #how many episodes for one training step (=parameter update)?

epoch_size=1 #how many training steps to count as 1 epoch

In [4]:
### create dataset: random strings 

#function to create one random string with chars encoded as one-hot vectors. 
#probably there is a more efficient way to do this?
def create_random_string(number_of_different_chars, sentence_length):
    #random_string=[tf.one_hot(tf.random.uniform(shape=(), minval=0, maxval=number_of_different_chars, dtype=tf.int32), number_of_different_chars) for i in range(sentence_length)]
    random_string=[tf.random.uniform(shape=(), minval=0, maxval=number_of_different_chars, dtype=tf.int32) for i in range(sentence_length)]
    print(random_string)
    return tf.convert_to_tensor(random_string)

# https://stackoverflow.com/questions/47318734/on-the-fly-generation-with-dataset-api-tensorflow
dummy_dataset = tf.data.Dataset.from_tensors(0).repeat(batch_size * epoch_size)
dataset = dummy_dataset.map(lambda _: create_random_string(number_of_different_chars, sentence_length))
dataset = dataset.batch(batch_size)

[<tf.Tensor 'random_uniform:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_1:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_2:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_3:0' shape=() dtype=int32>, <tf.Tensor 'random_uniform_4:0' shape=() dtype=int32>]


In [5]:
### display some examples from the dataset to check
i=0
for element in dataset:
    print(element[:7])   
    i+=1
    if i == 2:
        break

tf.Tensor(
[[2 0 1 1 1]
 [2 2 2 0 1]
 [0 2 2 0 1]
 [0 2 2 1 0]
 [0 0 2 0 0]
 [2 1 0 0 2]
 [0 0 0 2 0]], shape=(7, 5), dtype=int32)


In [6]:
### Hyperparams neural net
intermediate_dim=256

In [7]:
### create the neural net (NN1) of agent1 

NN1_input = keras.Input((sentence_length))
e = keras.layers.Embedding(number_of_different_chars, intermediate_dim)(NN1_input)
#e = keras.layers.Conv1D(intermediate_dim, (1))(e)
NN1_output = keras.layers.Conv1D(number_of_different_intermediate_chars, (1))(e) #returns the logits!! no softmax

NN1 = keras.Model(NN1_input, NN1_output)
NN1.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding (Embedding)        (None, 5, 256)            768       
_________________________________________________________________
conv1d (Conv1D)              (None, 5, 10)             2570      
Total params: 3,338
Trainable params: 3,338
Non-trainable params: 0
_________________________________________________________________


In [8]:
### create the neural net (NN1) of agent1 

NN2_input = keras.Input((sentence_length))
e = keras.layers.Embedding(number_of_different_intermediate_chars, intermediate_dim)(NN2_input)
#e = keras.layers.Conv1D(intermediate_dim, (1))(e)
NN2_output = keras.layers.Conv1D(number_of_different_chars, (1))(e) #returns the logits!! no softmax

NN2 = keras.Model(NN2_input, NN2_output)
NN2.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 5)]               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 5, 256)            2560      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 5, 3)              771       
Total params: 3,331
Trainable params: 3,331
Non-trainable params: 0
_________________________________________________________________


In [9]:
### define reward-function, here: (sentence_length - HammingDistance)/sentence_length
def compute_rewards(predicted_string, correct_string):
    #check where predicted_string[i]=correct_string[i]
    character_matches=tf.math.equal(predicted_string, correct_string)
    #convert True, False to 1, 0
    character_matches_as_ints=tf.cast(character_matches, tf.float32)
    #sum to get the reward
    reward=tf.math.reduce_sum(character_matches_as_ints, axis=-1)
    #divide by sentence_length for correct scaling
    return reward/sentence_length

In [10]:
### test reward function
a=tf.constant([[1, 0, 0, 1, 1], [1, 0, 0, 1, 1], [1, 0, 0, 1, 1]], dtype=tf.int32)
b=tf.constant([[0, 1, 0, 1, 0], [1, 0, 0, 1, 1], [1, 0, 0, 1, 0]], dtype=tf.int32)
compute_rewards(a,b)

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0.4, 1. , 0.8], dtype=float32)>

In [11]:
optimizer=tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0) #, clipnorm=1

In [12]:
def custom_loss_intermediate(agent_output, probability_logits, rewards):
    agent_output_one_hot=tf.one_hot(agent_output, number_of_different_intermediate_chars)
    log_lik = agent_output_one_hot*probability_logits
    return tf.math.reduce_sum(-log_lik*rewards[:, np.newaxis, np.newaxis])

In [13]:
def custom_loss_final(agent_output, probability_logits, rewards):
    agent_output_one_hot=tf.one_hot(agent_output, number_of_different_chars)
    log_lik = agent_output_one_hot*probability_logits
    return tf.math.reduce_sum(-log_lik*rewards[:, np.newaxis, np.newaxis])

In [14]:
### define a train step (for two agents)
def train_step(input_batch, NN1, NN2):
    with tf.GradientTape(persistent=True) as tape:
        
        ## Agent 1
        chararacter_probabilites_logits1 = NN1(input_batch, training=True) #
        dims = chararacter_probabilites_logits1.get_shape().as_list() 
        N = dims[-1]
        logits = tf.reshape(chararacter_probabilites_logits1, [-1, N])
        samples = tf.random.categorical(logits, 1, dtype=tf.int32)
        agent1_output = tf.reshape(samples, dims[:-1])
        
        ## Agent 2
        chararacter_probabilites_logits2 = NN2(agent1_output, training=True) #
        dims = chararacter_probabilites_logits2.get_shape().as_list() 
        N = dims[-1]
        logits = tf.reshape(chararacter_probabilites_logits2, [-1, N])
        samples = tf.random.categorical(logits, 1, dtype=tf.int32)
        agent2_output = tf.reshape(samples, dims[:-1])
        
        ## Compute rewards
        rewards = compute_rewards(agent2_output, input_batch)  
        ## normalize rewards
        mean = tf.math.reduce_mean(rewards)
        std = tf.math.reduce_std(rewards) if tf.math.reduce_std(rewards) > 0 else 1.
        scaled_rewards = (rewards-mean) / std
        
        loss1 = custom_loss_intermediate(agent1_output, chararacter_probabilites_logits1, scaled_rewards)
        loss2 = custom_loss_final(agent2_output, chararacter_probabilites_logits2, scaled_rewards)

    #retrieve gradients
    grads1 = tape.gradient(loss1, NN1.trainable_weights)
    grads2 = tape.gradient(loss2, NN2.trainable_weights)
    del tape
    
    #perform a parameter update
    optimizer.apply_gradients(zip(grads1, NN1.trainable_weights))
    optimizer.apply_gradients(zip(grads2, NN2.trainable_weights))
    
    return rewards

In [15]:
### Training loops
epochs = 10
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    loss_count, epoch_average = 0, 0
    
    for step, input_batch in enumerate(dataset):
        rewards = train_step(input_batch, NN1, NN2)
        print("Training reward (for one batch): ", np.mean(rewards))


Start of epoch 0
Training reward (for one batch):  0.34062505

Start of epoch 1
Training reward (for one batch):  0.409375

Start of epoch 2
Training reward (for one batch):  0.346875

Start of epoch 3
Training reward (for one batch):  0.35625

Start of epoch 4
Training reward (for one batch):  0.296875

Start of epoch 5
Training reward (for one batch):  0.309375

Start of epoch 6
Training reward (for one batch):  0.33437502

Start of epoch 7
Training reward (for one batch):  0.33437502

Start of epoch 8
Training reward (for one batch):  0.35312504

Start of epoch 9
Training reward (for one batch):  0.37500003


In [16]:
def compare_input_and_output(input_batch, neural_net, number_of_examples):
    for i in range(number_of_examples):
        print(list(input_batch[i].numpy()), "\t", np.argmax(neural_net(np.array([input_batch[i]])), axis=-1)[0])

In [17]:
compare_input_and_output(input_batch, NN1, 5)

[0, 0, 1, 2, 1] 	 [2 2 2 2 2]
[1, 2, 0, 2, 0] 	 [2 2 2 2 2]
[1, 1, 2, 0, 2] 	 [2 2 2 2 2]
[1, 0, 1, 1, 1] 	 [2 2 2 2 2]
[1, 2, 0, 0, 0] 	 [2 2 2 2 2]


In [18]:
compare_input_and_output(input_batch, NN2, 5)

[0, 0, 1, 2, 1] 	 [0 0 0 0 0]
[1, 2, 0, 2, 0] 	 [0 0 0 0 0]
[1, 1, 2, 0, 2] 	 [0 0 0 0 0]
[1, 0, 1, 1, 1] 	 [0 0 0 0 0]
[1, 2, 0, 0, 0] 	 [0 0 0 0 0]


In [19]:
# Computes the average over n samples: how many epochs did it take to reach a reward >= threshold once?
def estimate_average_solving_time(n, max_epochs, threshold=1.):
    sum_of_epochs=0
    number_of_max_epochs_reached=0
    for _ in range(n):
        ## create NN1
        NN1_input = keras.Input((sentence_length))
        e = keras.layers.Embedding(number_of_different_chars, intermediate_dim)(NN1_input)
        #e = keras.layers.Conv1D(intermediate_dim, (1))(e)
        NN1_output = keras.layers.Conv1D(number_of_different_intermediate_chars, (1))(e) #returns the logits!! no softmax
        NN1 = keras.Model(NN1_input, NN1_output)
        
        ## create NN2
        NN2_input = keras.Input((sentence_length))
        e = keras.layers.Embedding(number_of_different_intermediate_chars, intermediate_dim)(NN2_input)
        #e = keras.layers.Conv1D(intermediate_dim, (1))(e)
        NN2_output = keras.layers.Conv1D(number_of_different_chars, (1))(e) #returns the logits!! no softmax
        NN2 = keras.Model(NN2_input, NN2_output)

        ## train neural networks and stop when reward >= threshold
        epoch = 0
        reward = 0
        while reward < threshold:    
            for step, input_batch in enumerate(dataset):
                rewards = train_step(input_batch, NN1, NN2)
                reward = np.mean(rewards)       
            epoch+=1
            
            if epoch>max_epochs:
                print("Max epoch reached")
                number_of_max_epochs_reached+=1
                break
            
        if not epoch>max_epochs:
            sum_of_epochs+=epoch
            print(epoch) 
        
    #return average #parameter updates, percentage of not solved
    return sum_of_epochs / (n-number_of_max_epochs_reached), number_of_max_epochs_reached/n

In [20]:
estimate_average_solving_time(50, 100)

44
32
28
29
34
30
31
30
27
39
32
31
31
36
34
33
64
31
31
43
42
37
28
26
32
31
32
29
28
33
34
27
26
30
Max epoch reached
33
Max epoch reached
28
30
29
30
32
24
35
29
29
29
39
39
35


(32.625, 0.04)