In [1]:
import tensorflow as tf
from datetime import datetime
from os import listdir

from src.db_registry import registry
from src.pre_calculate_reward import PreCalculateReward
from src.config import *
from src.q_models import *

In [2]:
pre_calculate_reward = PreCalculateReward()

Completed reward pre calculation for 0 races


In [3]:
REPLAY_BUFFER_MERGED_ELEMENT_SPEC =  \
            (tf.TensorSpec((135, 180, 3), dtype=tf.uint8),
             (tf.TensorSpec((3,), dtype=tf.uint8),
              tf.TensorSpec((3,), dtype=tf.uint8)))


REWARD_BUFFER_ELEMENT_SPEC =  \
            (tf.TensorSpec((), dtype=tf.float32))

In [4]:
class LearningStep:
    def __init__(self, lr, epochs, use_discount):
        self.lr = lr
        self.epochs = epochs
        self.use_discount = use_discount
        
    def get_lr(self):
        return self.lr
    
    def get_epochs(self):
        return self.epochs
    
    def get_use_discount(self):
        return self.use_discount
    
    
learning_step_collection = [[LearningStep(0.00025, 3, 0),
                            LearningStep(0.000025, 1, 0),
                            LearningStep(0.00025, 3, 1),
                            LearningStep(0.000025, 1, 1)]]
                            

start_skip = 5
skips = 1
    

optimizer = tf.keras.optimizers.RMSprop
loss = "mse" #tf.keras.losses.Huber()

q_model_description = "deep convolutional network v3 disc 0.99 ABSOLUTE SARSA 5/ 1 skips, 0.00025 x 3, 0.000025 x 1(auto norm)"

In [5]:
def preprocess_dataset(filename):
    dataset_base = tf.data.experimental.load(REPLAY_BUFFER_PATH + filename, REPLAY_BUFFER_MERGED_ELEMENT_SPEC,
                                            compression="GZIP")
    dataset_reward = tf.data.experimental.load(REWARD_BUFFER_PATH + filename, REWARD_BUFFER_ELEMENT_SPEC)
    
    dataset_base = tf.data.Dataset.zip(
        (dataset_base, dataset_base.skip(start_skip + skips)))
    dataset_reward =  tf.data.Dataset.zip(
        tuple([dataset_reward] + [dataset_reward.skip(i + start_skip + 1) for i in range(skips)]))
    
    dataset = tf.data.Dataset.zip((dataset_base, dataset_reward))
    return dataset


def batch_calculations_dataset(*x):
    with tf.device("gpu:0"):
        zipped_rewards = [i for i in x[1]][1:]
        
        rewards = tf.stack(zipped_rewards, axis=0)
        
        discounts = tf.constant([DISCOUNT_FACTOR**(i + start_skip) for i in range(len((zipped_rewards)))])
        
        discounted_rewards = tf.transpose(rewards) * discounts
        
        rewards = tf.math.reduce_sum(tf.transpose(discounted_rewards), axis=0)
    
        rewards = (rewards - norm_mean) / norm_stdev
        
        rewards = tf.reshape(rewards, [-1, 1])
        
        if use_discount == 1:
            q_discount = DISCOUNT_FACTOR ** skips
            
            next_reward_params = target_q_model(tf.cast(x[0][1][0], dtype=tf.float32) / 255.0)

            next_y0 = tf.math.reduce_max(next_reward_params[:][0], axis=1)
            next_y1 = tf.math.reduce_max(next_reward_params[:][1], axis=1)
            next_y0 = tf.reshape(next_y0 * q_discount, [-1, 1])
            next_y1 = tf.reshape(next_y1 * q_discount, [-1, 1])

            rewards = (rewards + next_y0)
            rewards = (rewards - norm_mean_discounted) / norm_stdev_discounted
        
        actions = (tf.cast(x[0][0][1][:][0], dtype=tf.float32),
                  tf.cast(x[0][0][1][:][1], dtype=tf.float32))
        
        return tf.cast(x[0][0][0], dtype=tf.float32) / 255.0, actions, rewards

In [6]:
def get_dataset():
    filenames = [obj for obj in listdir(REWARD_BUFFER_PATH)]
    dataset = tf.data.Dataset.from_tensor_slices(filenames)
    dataset = dataset.shuffle(len(filenames))
    dataset = dataset.flat_map(preprocess_dataset)
    dataset = dataset.shuffle(SHUFFLE_BUFFER_SIZE)
    dataset = dataset.batch(BATCH_SIZE_Q_MODEL_TRAINING).prefetch(tf.data.experimental.AUTOTUNE)
    dataset = dataset.map(batch_calculations_dataset, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

In [7]:
use_discount = tf.Variable(0, dtype=tf.int32)
norm_mean = tf.Variable(0.0, dtype=tf.float32)
norm_stdev = tf.Variable(1.0, dtype=tf.float32)

norm_mean_discounted = tf.Variable(0.0, dtype=tf.float32)
norm_stdev_discounted = tf.Variable(1.0, dtype=tf.float32)

target_q_model = deep_convolutional_network_v3()
norm_layer = tf.keras.layers.experimental.preprocessing.Normalization()


def dataset_reward(*x):
    return x[2]


def adapt_normalization():
    norm_layer.adapt(get_dataset().map(dataset_reward).take(NORMALIZATION_BATCH_COUNT))
    mean = norm_layer.mean.numpy()[0]
    stdev = norm_layer.variance.numpy()[0]**0.5
    norm_mean.assign(mean)
    norm_stdev.assign(stdev)
    print("Reward normalization:")
    print("mean: ", mean)
    print("stdev:", stdev)
    
    
def adapt_discounted_normalization():
    norm_layer.adapt(get_dataset().map(dataset_reward).take(NORMALIZATION_BATCH_COUNT))
    mean = norm_layer.mean.numpy()[0]
    stdev = norm_layer.variance.numpy()[0]**0.5
    norm_mean_discounted.assign(mean)
    norm_stdev_discounted.assign(stdev)
    print("Discounted reward normalization:")
    print("mean: ", mean)
    print("stdev:", stdev)

In [8]:
for learning_steps in learning_step_collection:
    adapt_normalization()
    
    q_model = deep_convolutional_network_v3()
    
    q_model.compile(optimizer="RMSprop", loss=[loss, loss])
    #q_model.fit(get_dataset().take(1))
    #q_model.summary()
    
    target_q_model = q_model
    
    models_created = 0
    model_id = get_new_id()
    target_q_model_id = ""
    use_discount = tf.Variable(0, dtype=tf.int32)
    lr = 0.0000001

    for learning_step in learning_steps:
        if learning_step.get_lr() > lr and learning_step.get_use_discount():
            registry.add_q_model(model_id + str(models_created), 
                                 datetime.now().timestamp(), q_model_description)
            target_q_model_id = model_id + str(models_created) 
            
            target_q_model = tf.keras.models.load_model(Q_MODEL_PATH + target_q_model_id)
            q_model = deep_convolutional_network_v3()
            q_model.compile(optimizer="RMSprop", loss=[loss, loss])
            
            use_discount.assign(learning_step.get_use_discount())
            adapt_discounted_normalization()
        
        lr = learning_step.get_lr()
        
        q_model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=learning_step.get_lr()), loss=[loss, loss])

        for epoch in range(learning_step.get_epochs()):
            print(target_q_model_id)
            q_model.fit(get_dataset())

            models_created += 1
            q_model.save(Q_MODEL_PATH + model_id + str(models_created))
            
    registry.add_q_model(model_id + str(models_created), datetime.now().timestamp(), q_model_description)
    print(" ")
    print("model saved: ", model_id + str(models_created))
    print(" ")

Reward normalization:
mean:  16.299524
stdev: 11.170251602986543

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: D:/ml_mk/tony_kart/q_models/DESKTOP-IISUQP6_20210305_1813441\assets

INFO:tensorflow:Assets written to: D:/ml_mk/tony_kart/q_models/DESKTOP-IISUQP6_20210305_1813442\assets

INFO:tensorflow:Assets written to: D:/ml_mk/tony_kart/q_models/DESKTOP-IISUQP6_20210305_1813443\assets

INFO:tensorflow:Assets written to: D:/ml_mk/tony_kart/q_models/DESKTOP-IISUQP6_20210305_1813444\assets
Discounted reward normalization:
mean:  0.23085572
stdev: 1.6337999342370273
DESKTOP-IISUQP6_20210305_1813444
INFO:tensorflow:Assets written to: D:/ml_mk/tony_kart/q_models/DESKTOP-IISUQP6_20210305_1813445\assets
DESKTOP-IISUQP6_20210305_1813444
INFO:tensorflow:Assets written to: D:/m

In [9]:
dataset_test = get_dataset()

In [11]:
use_discount.assign(1)

loop_count = 0

min0 = 1.0
max0 = 0.0

for elem in dataset_test:
    loop_count += 1
    y0 = tf.math.reduce_max(elem[1][0], axis=1) * elem[2][0]
    
    min0 = min(min0, min(y0).numpy())
    max0 = max(max0, max(y0).numpy())
    

    if loop_count % 10 == 0:
        print("==============")
        print("min0: ", min0)
        print("max0: ", max0)

min0:  -0.57516426
max0:  2.3428164
min0:  -0.8193543
max0:  2.7135696
min0:  -1.1331646
max0:  2.7135696
min0:  -1.4778433
max0:  2.7135696
