# Imports

In [2]:
import numpy as np
import _pickle as pickle
import gym
import time
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.initializers as initializers

# Model

In [69]:
# Atari Pong environment; observations are raw RGB frames.
env = gym.make("Pong-v0")

# Instantiate the initializer (rather than passing the class) so the object
# handed to the layer is an actual initializer instance.
initializer = initializers.GlorotNormal()

# Policy network: flattened 80x80 difference frame -> 200 ReLU units -> one
# sigmoid unit whose output the main loop treats as the probability of "up".
x_in = layers.Input(shape = (6400,))
x = layers.Dense(200, kernel_initializer= initializer, activation="relu")(x_in)
x_out = layers.Dense(1, activation="sigmoid")(x)

model = tf.keras.Model(x_in, x_out)

# Keras loss identifiers use underscores; "binary crossentropy" (with a space)
# is not a registered loss name and is rejected once the loss is resolved.
model.compile(optimizer = "adam", loss = "binary_crossentropy")

In [4]:
def policy_forward(change_in_frame):
    """Run the global policy network on a difference frame.

    Args:
        change_in_frame: Input passed straight to ``model``; the network's
            input layer is declared with 6400 features per sample.

    Returns:
        Whatever ``model`` returns for that input: the output of its single
        sigmoid unit, which the main loop interprets as the probability of
        moving up.
    """
    return model(change_in_frame)

In [4]:
# Inspect the raw observation space of the environment (shape and dtype of a frame).
env.observation_space

Box(0, 255, (210, 160, 3), uint8)

In [5]:
# List the name of each discrete action; the position in the list is the
# integer passed to env.step().
env.unwrapped.get_action_meanings()

# For Pong the six actions collapse into three distinct paddle moves:
# NOOP is the same as FIRE (standing still)
# LEFT is the same as LEFTFIRE (down)
# RIGHT is the same as RIGHTFIRE (up)

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [66]:
# Print the layer-by-layer output shapes and parameter counts of the policy network.
model.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 1, 6400)]         0         
_________________________________________________________________
dense_20 (Dense)             (None, 1, 200)            1280200   
_________________________________________________________________
dense_21 (Dense)             (None, 1, 1)              201       
Total params: 1,280,401
Trainable params: 1,280,401
Non-trainable params: 0
_________________________________________________________________


# Preprocessing

In [5]:
def prepro(input_frame):
    """Convert a 210x160x3 uint8 frame into a 6400-element (80x80) float vector.

    The frame is cropped to rows 34..193, downsampled by taking every other
    pixel in both directions, reduced to its first colour channel, and then
    binarised: the two background values (144 and 109) become 0 and every
    other non-zero pixel (paddles, ball) becomes 1.

    Args:
        input_frame: Array indexed as (row, column, channel). It is NOT
            modified; the function works on a private copy.

    Returns:
        A 1-D ``float64`` NumPy array of length 6400 containing only 0.0 and 1.0.
    """
    # Crop + downsample + channel select in one slice. Basic slicing yields a
    # view, so copy before masking; otherwise the writes below would land in
    # the caller's observation array.
    frame = input_frame[34:194:2, ::2, 0].copy()
    frame[frame == 144] = 0  # erase background (background type 1)
    frame[frame == 109] = 0  # erase background (background type 2)
    frame[frame != 0] = 1  # everything else (paddles, ball) just set to 1
    # np.float was only an alias of the builtin float and has been removed
    # from NumPy; np.float64 is the same dtype spelled explicitly.
    return frame.astype(np.float64).ravel()

# Hyperparameters

In [6]:
# --- Run switches ---
render = False   # when True the main loop calls env.render() every step
resume = False   # when True the main loop loads weights from "ModelWeights" first

# --- Frame bookkeeping ---
game_dimensions = 80 * 80   # length of the flattened preprocessed frame
prev_frame = None           # previous preprocessed frame; None until one has been seen

# --- Training settings ---
gamma = 0.99      # presumably the reward discount factor -- not used in the visible code yet
batch_size = 10   # the main loop checkpoints weights every `batch_size` episodes

# Main Code

In [72]:
# Start a fresh game. prev_frame is reset here as well, so re-running this cell
# never diffs the first frame against a stale frame left over from an earlier run.
observation = env.reset()
prev_frame = None
reward_sum = 0
eps = 1  # probability of taking a random action; decays by 0.001 per step
episode_number = 0
running_reward = None
# Per-step buffers: network inputs, rewards, chosen actions, and (y - up_prob).
ep_change_in_frames, ep_rewards, ep_all_actions, ep_gradient_log_ps = [], [], [], []

if resume:
    model.load_weights("ModelWeights")

# Runs until interrupted (e.g. KeyboardInterrupt); there is no stop condition.
while True:
    if render:
        env.render()

    # The network sees the difference between consecutive preprocessed frames
    # (all zeros on the first frame of an episode).
    curr_frame = prepro(observation)
    change_in_frame = curr_frame - prev_frame if prev_frame is not None else np.zeros(game_dimensions)
    prev_frame = curr_frame
    ep_change_in_frames.append(change_in_frame)

    # Always evaluate the policy, even when the action ends up being random,
    # so that the logged (y - up_prob) term belongs to THIS step. Previously
    # up_prob was undefined (or stale) on random steps and a bare except hid it.
    # A direct model call is used instead of model.predict for a single sample.
    policy_input = change_in_frame.reshape((1, game_dimensions)).astype(np.float32)
    up_prob = float(np.asarray(policy_forward(policy_input))[0, 0])

    # Epsilon-greedy choice between action 2 (up) and action 3 (down).
    if np.random.random() < eps:
        action = np.random.randint(2, 4)
    else:
        action = 2 if up_prob >= .5 else 3

    ep_all_actions.append(action)

    observation, reward, done, _ = env.step(action)

    # y is the "label" for the action actually taken: 1 for up, 0 for down.
    y = 1 if action == 2 else 0
    ep_gradient_log_ps.append(y - up_prob)

    ep_rewards.append(reward)
    reward_sum += reward
    eps = max(eps - 0.001, 0.0)  # decay exploration, but never below zero

    if done:
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print ('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
        reward_sum = 0
        observation = env.reset() # reset env
        prev_frame = None
        episode_number += 1

        if episode_number % batch_size == 0:

            # Back Propagation Code Goes Here
            # (it should consume the ep_* buffers before they are cleared below)

            model.save_weights("ModelWeights")

            # Drop the data of the finished batch so the buffers do not grow
            # without bound over a long run.
            ep_change_in_frames.clear()
            ep_rewards.clear()
            ep_all_actions.clear()
            ep_gradient_log_ps.clear()

    if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
        print('ep %d: game finished, reward: %f' % (episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

[[0.51037484]]
2
[[0.4993943]]
3
[[0.5136682]]
2
[[0.51551396]]
2
ep 0: game finished, reward: -1.000000
[[0.48660037]]
3
[[0.46354198]]
3
ep 0: game finished, reward: -1.000000
[[0.4830003]]
3
[[0.51306295]]
2
[[0.4994899]]
3
[[0.47168705]]
3
[[0.4836655]]
3
[[0.52289146]]
2
[[0.5383782]]
2
ep 0: game finished, reward: -1.000000
[[0.4936011]]
3
[[0.42063546]]
3
[[0.5512438]]
2
[[0.4879858]]
3
[[0.4860898]]
3
[[0.47501576]]
3
[[0.49957633]]
3
[[0.49782017]]
3
[[0.5247555]]
2
[[0.5052464]]
2
[[0.5152274]]
2
[[0.49618733]]
3
[[0.4932264]]
3
[[0.51078707]]
2
[[0.47166374]]
3
ep 0: game finished, reward: -1.000000
[[0.5102831]]
2
[[0.5008615]]
2
[[0.5136273]]
2
[[0.5046273]]
2
[[0.48772046]]
3
[[0.51003045]]
2
[[0.5021777]]
2
[[0.51845956]]
2
[[0.5086832]]
2
[[0.49236947]]
3
[[0.48812556]]
3
ep 0: game finished, reward: -1.000000
[[0.48731384]]
3
[[0.4699728]]
3
[[0.519239]]
2
[[0.50756645]]
2
[[0.49359423]]
3
[[0.5031473]]
2
[[0.5010178]]
2
[[0.51829535]]
2
[[0.47215807]]
3
ep 0: game fin

KeyboardInterrupt: 