In [1]:
import tensorflow as tf
import gym


import os,cv2, datetime,numpy as np
from tensorflow.keras import callbacks
import dataManager
from nets import conv as net

## Model
First, let's get the neural network model.

In [2]:
# Instantiate the network wrapper from `nets.conv` and ask it for a model.
# The first argument is the (80, 80, 1) input shape; the second was written
# as `(1)` in the original, which is just the int 1 (no trailing comma, so
# not a tuple) -- presumably the output size; verify against getModel.
nn = net.NeuralNetwork()
model, _ = nn.getModel((80, 80, 1), 1)  # second return value is not used here

Model not found
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 80, 80, 8)         80        
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 40, 40, 8)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 40, 40, 16)        1168      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 20, 20, 16)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 20, 20, 32)        4640      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 10, 10, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 10, 

# Environment
Now let's get the Pong environment from OpenAI Gym and define the actions.

In [3]:
# Action ids sent to the environment for moving the paddle up / down.
UP_ACTION, DOWN_ACTION = 2, 3

# Gym initialization: build the project's SingleGym wrapper around "Pong-v0"
# and fetch the first observation.
# NOTE(review): the meaning of the positional `True` and of `use_diff` /
# `stack` is defined in dataManager.SingleGym -- confirm there.
env = dataManager.SingleGym("Pong-v0", True, use_diff=True, stack=False)
observation = env.reset()

# Logging
To keep track of progress and save our model, we define some callbacks that will be called after each training epoch.

In [4]:
# Per-run output directory: <model folder><YYYY-MM-DD-HH-MM-SS>/
# (the trailing "/" is part of the string, so file names can be appended directly).
today = datetime.datetime.today()
time = "%s%s/" % (nn.getModelFolderPath(), today.strftime("%Y-%m-%d-%H-%M-%S"))
# exist_ok=True so re-running this cell within the same second does not raise.
os.makedirs(os.path.join(time, "figs"), exist_ok=True)

# Store a plain-text copy of the model summary next to the logs.
with open(os.path.join(time, 'architecture.txt'), 'w') as fh:
    model.summary(print_fn=lambda x: fh.write(x + '\n'))

# The Keras callback classes are referenced through `tf.keras.callbacks`
# rather than the imported `callbacks` module: the name `callbacks` is rebound
# to a list at the end of this cell (the training cell passes it to
# model.fit), so using the module name here would fail on a second run.
csvLogger = tf.keras.callbacks.CSVLogger(time + "log.csv", separator=',', append=False)
tensorboardCallback = tf.keras.callbacks.TensorBoard(log_dir=time)  # TODO: load test data in advance and use histogram_freq=1 here

# Creates a file writer for the log directory and makes it the default target
# of the tf.summary.scalar(...) calls in the training loop.
file_writer = tf.summary.create_file_writer(time)
file_writer.set_as_default()

# NOTE(review): `period` is deprecated in recent tf.keras releases in favour
# of `save_freq`; it is kept to preserve the save-every-5-epochs behaviour.
callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        time + "{epoch:04d}.hdf5",
        monitor='val_loss',
        verbose=1,
        save_best_only=False,
        save_weights_only=False,
        mode='auto',
        period=5,
    ),
    csvLogger,
    tensorboardCallback,
]



# Run and Train
Now to the real training.  

Given an observation, we first sample an action: the network outputs the probability of moving up, and we choose UP with that probability and DOWN otherwise. The more confident the network is, the more likely we are to take its preferred action rather than the other one.

We then add the observation to our training input and the action to the labels.  

Next, we take the chosen action in the environment and receive the new observation and reward.  
This is repeated until the game (i.e., an episode) is over.

Now we'll calculate the discounted rewards and train our model on the gathered training inputs and labels, using the discounted rewards as sample weights.

In [None]:
# Hyperparameters
gamma = 0.99  # discount factor handed to dataManager.calculateRewards

# Per-episode buffers (inputs, labels, rewards) and run-level counters.
x_train, y_train, rewards = [], [], []
reward_sum = 0
episode = 0
running_reward = 0

# Runs until the kernel is interrupted: play one step per iteration and
# train once at the end of every episode.
while True:

    # Forward the policy network; its output is used as the probability of UP,
    # and the action is sampled according to that probability.
    upProbability = model.predict(np.array([observation]))[0]
    if np.random.uniform() < upProbability:
        action = UP_ACTION
    else:
        action = DOWN_ACTION
    # Labels are 1 for UP and 0 for DOWN. Compare against the UP_ACTION macro
    # (not a literal) so the label stays correct if the action id changes.
    y = 1 if action == UP_ACTION else 0

    # Log the input and label to train on later.
    x_train.append([observation])
    y_train.append(y)

    # Do one step in our environment.
    observation, reward, done, info = env.step(action)
    rewards.append(reward)
    reward_sum += reward

    #env.env.render()
    # End of an episode.
    if done:
        print('At the end of episode', episode, 'the total reward was :', reward_sum)

        # Train on everything gathered during the episode, weighting each
        # sample by the value returned from dataManager.calculateRewards.
        # initial_epoch/epochs make this call run exactly one epoch whose
        # number equals the episode index.
        model.fit(
            x=np.vstack(x_train),
            y=np.vstack(y_train),
            verbose=2,
            callbacks=callbacks,
            sample_weight=dataManager.calculateRewards(rewards, gamma),
            initial_epoch=episode,
            epochs=episode + 1,
        )

        # Exponential moving average of the episode reward (starts from 0).
        running_reward = running_reward * 0.99 + reward_sum * 0.01
        # Log the running reward and the raw episode reward.
        tf.summary.scalar("Running Reward", running_reward, step=episode)
        tf.summary.scalar("Reward Sum", reward_sum, step=episode)

        # Increment episode number.
        episode += 1

        # Reinitialization for the next episode.
        x_train, y_train, rewards = [], [], []
        observation = env.reset()
        reward_sum = 0