# Pong from pixels

Problem:<br>
In reinforcement learning, we don't have labels.<br><br>

Solution:<br>
Create fake labels: sample an action from the current policy and treat the sampled action as the label.

Supervised Learning:<br><br>
$\max\sum_i\log p(y_i|x_i)$
<br><br><br><br>
Reinforcement Learning:<br><br>
$y_i\sim p(\cdot|x_i)$<br><br>
$\max\sum_i A_i \cdot \log p(y_i|x_i)$
<br><br><br><br>
$A_i$ is called the advantage (a modification of the reward). In this notebook it is the discounted reward, standardized to zero mean and unit variance.

In [1]:
import numpy as np

import matplotlib.pyplot as plt
import pickle
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, Flatten, Layer, Input, MaxPooling2D

Using TensorFlow backend.


In [2]:
# Preprocessing
def preprocessing(I):
    """Turn a raw game frame into a binary, downsampled float image.

    The frame is cropped to rows 35..194, every second row and column of
    channel 0 is kept, and each remaining pixel becomes 0.0 if its value is
    0, 144 or 109 (presumably the background colours -- confirm against the
    emulator palette) and 1.0 otherwise.

    Args:
        I: array indexable as ``[row, column, channel]`` with at least 195
            rows. For a 210x160x3 frame the result is 80x80.

    Returns:
        A new ``float64`` array of zeros and ones. The input frame is left
        unmodified.
    """
    # One strided slice is equivalent to I[35:195][::2, ::2, 0].
    frame = np.asarray(I)[35:195:2, ::2, 0]

    # Build the mask with comparisons rather than in-place assignment: the
    # slice above is a view, so writing into it would alter the caller's
    # observation.
    foreground = (frame != 0) & (frame != 144) & (frame != 109)

    # np.float was removed in NumPy 1.24; np.float64 is the same type.
    return foreground.astype(np.float64)


# add discount factor and standardize rewards
def discount_rewards(r, gamma):
    """Compute discounted, standardized returns for a sequence of rewards.

    Walking backwards through ``r``, each step receives
    ``reward + gamma * (return of the next step)``. The running return is
    reset whenever a non-zero reward is encountered, so credit does not leak
    across the boundaries marked by non-zero rewards. The result is then
    shifted to zero mean and scaled to unit standard deviation.

    Args:
        r: array-like of per-step rewards, either 1-D or a column of shape
            ``(n, 1)``. Integer input is converted to ``float64``.
        gamma: discount factor applied per step.

    Returns:
        A new floating-point array with the same shape as ``r``. If all
        discounted returns are equal (standard deviation 0) the result is
        all zeros rather than NaN; an empty input yields an empty array.
    """
    rewards = np.asarray(r)
    if not np.issubdtype(rewards.dtype, np.floating):
        # Integer storage would truncate the discounted values and make the
        # in-place standardization below fail.
        rewards = rewards.astype(np.float64)

    flat = rewards.ravel()
    discounted = np.zeros_like(flat)
    running_add = 0.0

    for t in reversed(range(flat.size)):
        if flat[t] != 0:
            # A non-zero reward marks a boundary: restart the accumulation.
            running_add = 0.0
        running_add = running_add * gamma + flat[t]
        discounted[t] = running_add

    if discounted.size == 0:
        return discounted.reshape(rewards.shape)

    # Standardize: expected value -> 0, variance -> 1.
    discounted -= discounted.mean()
    spread = discounted.std()
    if spread > 0:
        # Skip the division for constant input to avoid 0 / 0 = NaN.
        discounted /= spread

    return discounted.reshape(rewards.shape)
    

# build model
def build_cnn():
    """Build and compile the convolutional policy network.

    The network takes 80x80 single-channel images, applies two
    convolution + max-pooling stages (32 and 16 filters, 3x3 kernels),
    then a 100-unit ReLU layer and a 3-way softmax output.

    Returns:
        A compiled Keras ``Sequential`` model using categorical
        cross-entropy loss and the Adadelta optimizer.
    """
    layers = [
        Conv2D(32, 3, activation='relu', input_shape=(80, 80, 1)),
        MaxPooling2D(),
        Conv2D(16, 3, activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(100, activation='relu'),
        Dense(3, activation='softmax'),
    ]
    policy = Sequential(layers)
    policy.compile(loss='categorical_crossentropy', optimizer='adadelta')
    return policy

def build_mlp():
    """Build and compile a fully connected policy network.

    The network accepts the same ``(80, 80, 1)`` input as ``build_cnn`` so
    the two builders are interchangeable: the image is flattened, passed
    through a 200-unit ReLU layer, and mapped to a 3-way softmax output.

    Returns:
        A compiled Keras ``Sequential`` model using categorical
        cross-entropy loss and the Adadelta optimizer.
    """
    # Create a fresh model here; without this the function would either
    # raise NameError or append layers to whatever global `model` exists.
    model = Sequential()
    model.add(Flatten(input_shape=(80, 80, 1)))
    model.add(Dense(200, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adadelta')
    return model

In [5]:
# Training Params:
# NOTE(review): batch_size is not referenced anywhere else in the visible
# code (model.fit is called without it) -- confirm whether it should be used.
batch_size = 10

# action distribution
# Each entry pairs the action id passed to env.step with the one-hot vector
# used as the "fake label" for that action. The row index is what gets
# sampled from the network's 3-way softmax output.
# Presumably ids 0/2/3 are no-op/up/down in Pong -- confirm against the
# environment's action meanings.
actions = [[0, [1, 0, 0]], 
           [2, [0, 1, 0]], 
           [3, [0, 0, 1]]]

# Discount factor for reward
gamma = 0.99

env = gym.make("Pong-v0")
observation = env.reset()

# Sum of rewards in the current episode, and its smoothed value across
# episodes (None until the first episode finishes).
reward_sum = 0
running_reward = None

model = build_cnn()

# Per-episode buffers: network inputs, predicted action probabilities,
# one-hot labels of the sampled actions, and per-step rewards.
samples, a_probs, ys, rewards = [], [], [], []

# Previous preprocessed frame; zeros at the start of each episode.
previous_x = np.zeros((80, 80))

# Weights are saved to this path after every episode; uncomment the load
# below to resume from a previous run.
file_name = './cnn.h5'
# model.load_weights(file_name)

# e counts finished episodes, n counts steps within the current episode.
e = 0
n = 0
# Main policy-gradient loop: play one step per iteration, record the step,
# and train on the whole episode once it ends. Runs until interrupted.
while True:
    # env.render()

    # Build the network input from the current and previous frames. The
    # current frame is doubled before subtracting, so x = 2 * current -
    # previous; previous_x stores the unscaled current frame for next step.
    current_x = preprocessing(observation) * 2
    x = current_x - previous_x
    previous_x = current_x / 2

    # sample a random action based on their probabilities
    aprob = model.predict(x.reshape(1, 80, 80, 1)).flatten()
    action_id = np.random.choice(range(3), p=aprob)

    action = actions[action_id][0]  # action input for environment
    y = actions[action_id][1]       # fake label distribution

    # perform the sampled action in the environment
    observation, reward, done, info = env.step(action)

    # update sum of rewards
    reward_sum += reward

    # record values
    samples.append(x.reshape(80, 80, 1))
    ys.append(y)
    a_probs.append(aprob)
    rewards.append(reward)

    n += 1

    # Episode ends
    if done:
        # Column vector (n, 1) so that the per-step advantage broadcasts
        # across the three action columns below.
        episode_rewards = np.array(rewards).reshape(-1, 1)

        # Every sample already has shape (80, 80, 1), so stacking them gives
        # the (n, 80, 80, 1) batch directly; no further reshape is needed.
        smpls = np.array(samples)
        probs = np.array(a_probs)

        # Magic begins
        discounted_r = discount_rewards(episode_rewards, gamma)

        # Training target = probs + advantage * (one_hot - probs): the
        # sampled action's label is pushed towards or away from the current
        # prediction depending on the sign of the advantage.
        diffs = np.array(ys) - probs
        diffs *= discounted_r
        diffs += probs  # needed for keras!
        # Magic ends

        model.fit(smpls, diffs, epochs=1, verbose=0)

        # Reset the per-episode buffers and report progress.
        samples, a_probs, ys, rewards = [], [], [], []
        # Exponential moving average of the episode reward.
        running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
        print('episode [%d] reward total was %f. running mean: %f, num steps: %d'%(e, reward_sum, running_reward, n))
        reward_sum = 0
        previous_x = np.zeros((80, 80))
        observation = env.reset()
        model.save_weights(file_name)
        e += 1
        n = 0

[2019-10-28 15:33:02,534] Making new env: Pong-v0


episode [0] reward total was -20.000000. running mean: -20.000000, num steps: 1217
episode [1] reward total was -19.000000. running mean: -19.990000, num steps: 1427
episode [2] reward total was -18.000000. running mean: -19.970100, num steps: 1648
episode [3] reward total was -19.000000. running mean: -19.960399, num steps: 1362


KeyboardInterrupt: 