In [None]:
import tensorflow as tf
import gym


import os,cv2, datetime,numpy as np
from tensorflow.keras import callbacks
import dataManager
from nets import simpleDense as net

In [None]:
# Instantiate the network wrapper from nets.simpleDense and build the Keras
# model that the training loop below uses for prediction and fitting.
nn = net.NeuralNetwork()

# getModel returns a pair; only the first element (the model) is used here.
# The arguments are the plain integers 6400 (= 80 * 80) and 1 -- presumably
# the flattened input size and the number of outputs; confirm in getModel.
model, _ = nn.getModel(80 * 80, 1)

In [None]:
# Action ids passed to env.step() by the training loop
UP_ACTION, DOWN_ACTION = 2, 3

# Environment setup: wrap "Pong-v0" with the project's SingleGym helper and
# grab the first observation so the main loop can start predicting right away.
# NOTE(review): the meaning of the second positional argument (True) is
# defined by dataManager.SingleGym -- confirm there.
env = dataManager.SingleGym("Pong-v0", True)
observation = env.reset()

In [None]:
# Per-run output directory: <model folder><YYYY-MM-DD-HH-MM-SS>/
# (the trailing slash is kept because paths below are built by concatenation).
today = datetime.datetime.today()
time = "%s%s/" % (nn.getModelFolderPath(), today.strftime("%Y-%m-%d-%H-%M-%S"))
os.makedirs(os.path.join(time, "figs"), exist_ok=True)

# Persist a human-readable summary of the architecture next to the logs.
with open(os.path.join(time, 'architecture.txt'), 'w') as fh:
    model.summary(print_fn=lambda x: fh.write(x + '\n'))

# The callback classes are referenced through `tf.keras.callbacks` rather than
# the imported `callbacks` name: this cell rebinds `callbacks` to a list below,
# so going through the bare name would make the cell fail when it is re-run.
csvLogger = tf.keras.callbacks.CSVLogger(time + "log.csv", separator=',', append=False)
tensorboardCallback = tf.keras.callbacks.TensorBoard(log_dir=time)  # TODO: load test data in advance and use histogram_freq=1 here

# Creates a file writer for the log directory and makes it the default target
# of the tf.summary.* calls issued by the training loop.
file_writer = tf.summary.create_file_writer(time)
file_writer.set_as_default()

# Checkpoint the full model after every epoch. `save_freq='epoch'` replaces the
# deprecated `period=1` argument and has the same effect.
checkpointCallback = tf.keras.callbacks.ModelCheckpoint(
    time + "{epoch:04d}.hdf5",
    monitor='val_loss',
    verbose=1,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    save_freq='epoch',
)

# List handed to model.fit(callbacks=...) in the training cell.
callbacks = [checkpointCallback, csvLogger, tensorboardCallback]

In [None]:
# Hyperparameters
gamma = 0.99  # forwarded to dataManager.calculateRewards with the episode's rewards
running_reward_decay = 0.99  # smoothing factor of the running-reward moving average

# initialization of variables used in the main loop
x_train, y_train, rewards = [], [], []
reward_sum = 0
running_reward = None  # seeded with the total reward of the first finished episode
episode = 0


# Runs until the kernel is interrupted: play one step per iteration and train
# the model once at the end of every episode.
while True:

    # forward the policy network and sample action according to the proba distribution
    # (the observation is reshaped into a single-row batch before predicting)
    upProbability = model.predict(np.expand_dims(observation, axis=1).T)
    if np.random.uniform() < upProbability:
        action = UP_ACTION
    else:
        action = DOWN_ACTION
    y = 1 if action == UP_ACTION else 0  # 0 and 1 are our labels

    # log the input and label to train later
    x_train.append(observation)
    y_train.append(y)

    # do one step in our environment
    observation, reward, done, info = env.step(action)
    rewards.append(reward)
    reward_sum += reward

    # end of an episode
    if done:
        print('At the end of episode', episode, 'the total reward was :', reward_sum)

        # Exponential moving average of the per-episode total reward. It is
        # updated once per episode; resetting it on every step would make the
        # logged value a constant 0.
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = (running_reward_decay * running_reward
                              + (1 - running_reward_decay) * reward_sum)

        # training
        # Keras interprets `epochs` as the index of the final epoch, not as a
        # count, so it has to be larger than `initial_epoch`; otherwise fit()
        # returns without training for every episode after the first one.
        model.fit(
            x=np.vstack(x_train),
            y=np.vstack(y_train),
            verbose=1,
            callbacks=callbacks,
            sample_weight=dataManager.calculateRewards(rewards, gamma),
            epochs=episode + 1,
            initial_epoch=episode,
        )

        # Log Running Reward
        tf.summary.scalar("Running Reward", running_reward, step=episode)

        # increment episode number
        episode += 1

        # Reinitialization
        x_train, y_train, rewards = [], [], []
        observation = env.reset()
        reward_sum = 0