### Code Snippet 01: Preparations

#### Set Namespace

I need numerical computing, an environment, a decent iterator, and proper Python 2 printout:

In [None]:
import numpy as np
import gym
from itertools import count
from sys import stdout

#### Set Global Seed

In [None]:
# Seed NumPy's global random number generator so that everything drawn from np.random is repeatable.
# NOTE(review): only NumPy is seeded here; the Gym environment and the Keras backend
# presumably keep their own random state -- confirm if full reproducibility is needed.
np.random.seed(0)

#### Prepare the Environment

In [None]:
# Build the Lunar Lander environment and derive the problem dimensions from it
ENV = gym.make("LunarLander-v2")

# Length of one observation vector, read off a first observation
# (this resets the environment once as a side effect)
NUM_INPUT = ENV.reset().shape[0]

# Number of discrete actions and the array of their indices
NUM_ACTION = ENV.action_space.n
ACTIONS = np.arange(NUM_ACTION)

### Code Snippet 02: Exploratory Visualization

#### Create a random action agent

In [None]:
# Let a purely random agent play for 1000 steps to get a feel for the environment
ENV.reset()
for _ in range(1000):
    ENV.render() # comment out if no display is available

    # Take exactly one random action per iteration and show its outcome
    # (stepping twice would silently discard every other transition)
    observation, reward, done, info = ENV.step(ENV.action_space.sample())
    print((observation, reward, done, info))

    # Start a fresh episode once the current one has ended
    if done: ENV.reset()

### Code snippet 03: Create A Toy Artificial Neural Network (ANN)

#### Set Hyperparameters

- NUM_EPISODE denotes the maximum number of episodes an epoch comprises.
- ALPHA_RANGE is the range of learning rates to be considered.
- NUM_HIDDEN_NEURON_RANGE is the range of hidden layer sizes to be considered.

In [None]:
# Number of episodes played per epoch
NUM_EPISODE = 150
# Learning rates to sweep over, spanning eight orders of magnitude
ALPHA_RANGE = [10**-4, 10**-2, 1, 10**2, 10**4] # Constant over one epoch
# Hidden layer sizes to sweep over
NUM_HIDDEN_NEURON_RANGE = [8, 16, 32, 64, 128, 256, 512] # Constant over one epoch

#### Create the helper functions/ sigmoid functions

In [None]:
# Create the sigmoid function
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x) of x, element-wise for arrays."""
    return 1.0 / (1.0 + np.exp(-x))

# Create the sigmoid function's derivative
def sigmoid_derivative(x):
    """Return x * (1 - x), the sigmoid's slope expressed through its output.

    The training loop calls this with values that already went through
    ``sigmoid``, so ``x`` is the activation, not the pre-activation.
    """
    complement = 1 - x
    return x * complement

#### Build training epoch

In [None]:
# Create a random binary toy data set: 4 samples with 3 features each, and one binary label per sample
X = np.random.randint(2, size=(4, 3))
y = np.random.randint(2, size=(4, 1))

# Test the hidden dimensions
for NUM_HIDDEN_NEURON in NUM_HIDDEN_NEURON_RANGE:

    # Test the alphas
    for ALPHA in ALPHA_RANGE:

        # Draw fresh weights for every (hidden size, alpha) combination.
        # Reusing the weights trained with the previous alpha would make the
        # reported losses depend on the order of ALPHA_RANGE.
        # 1st set of weights: input -> hidden
        W1 = np.random.rand(X.shape[1], NUM_HIDDEN_NEURON)
        # 2nd set of weights: hidden -> output
        W2 = np.random.rand(NUM_HIDDEN_NEURON, y.shape[1])

        for episode in range(NUM_EPISODE):
            # Forward propagate

            # Hidden layer (fully connected) with sigmoid activation
            layer_1 = sigmoid(np.dot(X, W1))

            # Output layer (fully connected) with sigmoid activation
            layer_2 = sigmoid(np.dot(layer_1, W2))

            # Calculate loss
            layer_2_loss = y - layer_2

            # Weight the loss by the sigmoid's slope at the estimate: the more certain
            # the estimate, the less it is corrected, because the gradient at the
            # extremes is smaller than in the middle (element-wise multiplication!)
            layer_2_weighted_loss = layer_2_loss * sigmoid_derivative(layer_2)

            # Backpropagate

            # Compute the contribution of the hidden layer to the weighted loss
            # (uses W2 as it was during the forward pass, i.e. before its update below)
            layer_1_loss = np.dot(layer_2_weighted_loss, W2.T)

            # Weight it by the hidden layer's slope
            layer_1_weighted_loss = layer_1_loss * sigmoid_derivative(layer_1)

            # Update weights
            W2 += ALPHA * np.dot(layer_1.T, layer_2_weighted_loss)
            W1 += ALPHA * np.dot(X.T, layer_1_weighted_loss)

            # Visualize the loss of the last episode only
            if episode == NUM_EPISODE - 1:
                print("Hidden Size {}, alpha {}: Avg loss {}".format(
                    '%4s' % NUM_HIDDEN_NEURON,
                    '%7s' % ALPHA,
                    '%14s' % np.mean(np.abs(layer_2_loss))))

### Code Snippet 04: Implementation and Refinement:: Q Learning

#### Extend Hyperparameters

- GAMMA is the factor by which future expected rewards are discounted.
- ALPHA is the learning rate.
- Q_TABLE is the agent's memory. The states are the keys, and dictionaries of actions and their respective values are the values.
- VALUE_INIT is the initial value for the actions of a state, once the state is visited the first time. It is set to zero.
- MAX_NUM_STEPS is an arbitrary maximum of allowed steps in case of deadlock

In [None]:
# Discount factor for future expected rewards
GAMMA = 0.99
# Learning rate (rebinds the ALPHA name that the toy-ANN sweep used as its loop variable)
ALPHA = 0.1
# The agent's memory: state tuple -> {action: value}
Q_TABLE = {}
# Value given to every action of a state when the state is first registered
VALUE_INIT = 0
# Upper bound on the steps of one episode, guarding against deadlock
MAX_NUM_STEPS = 10000

#### Build training epoch

In [None]:
def train_ql(render=False):
    """Run one epoch of tabular Q-learning on ``ENV``.

    Plays ``NUM_EPISODE`` episodes. In every step the action proposed by
    ``best_action`` is taken and ``Q_TABLE`` is updated in place.

    Args:
        render: If true, render the environment before every step.

    Returns:
        dict mapping the episode index to ``[total reward, number of steps
        with reward >= 100, number of steps with reward >= 200]``.
    """
    # Init epoch stats
    stats = {}

    # Init counter for how many times the agent revisited states
    revisiting_states = 0

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode stats
        stats[episode] = [0]*3

        # Init observations
        x_t = ENV.reset()
        # Take only the current observation as state, in order to keep the state space as small as possible
        s_t = tuple(x_t)
        done = False

        # Start an episode
        for step in count():
            if render: ENV.render()

            # Look up the action with the highest value at s_t, as well as its value
            a_t, q, r_s = best_action(s_t)

            # Count the revisit right away, so that the last step of an episode is not skipped
            revisiting_states += r_s

            # Observe after action a_t
            x_t1, r_t, done, info = ENV.step(a_t)
            # Again, take only the current observation
            s_t1 = tuple(x_t1)

            # Look up the highest action value at s_t1 (this also registers s_t1 in Q_TABLE)
            _, Q_sa, _ = best_action(s_t1)

            # Q-learning update: move q towards the one-step target.
            # A terminal transition has no future, so its target is the reward alone.
            target = r_t if done else r_t + GAMMA * Q_sa
            Q_TABLE[s_t][a_t] = q + ALPHA * (target - q)

            # Update statistics: Sum up rewards, count successes, solved
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1

            # Visualize
            stdout.write("\r{} | Step {} @ Episode {}/{} | Step Reward {}".format(
                EPOCH, step, episode, NUM_EPISODE, round(r_t, 1)))

            # Exit episode after crash or deadlock
            if done or step > MAX_NUM_STEPS: break

            # Update state
            s_t = s_t1

    # Visualize revisiting states issue
    print("\rQ Table size {}, # Revisited States {}".format(len(Q_TABLE), revisiting_states))

    return stats
    
    
# Create helper function to initialize and query Q-table
def best_action(state):
    """Return the action to take in ``state`` together with its current value.

    A state that is unknown, or whose action values are all still zero, is
    (re-)registered in ``Q_TABLE`` with ``VALUE_INIT`` for every action and
    answered with a random action. Otherwise the action with the highest
    stored value is chosen.

    Args:
        state: Hashable state key (the callers pass observation tuples).

    Returns:
        Tuple ``(action, q, revisit_state)``: the chosen action, its value in
        ``Q_TABLE`` and 1 if the state already carried learned values, else 0.
    """
    known_values = Q_TABLE.get(state)

    # Test every value instead of their sum: learned values that merely cancel
    # each other out (e.g. +1.5 and -1.5) must not be mistaken for "untrained"
    # and wiped.
    if known_values is None or all(value == 0 for value in known_values.values()):
        # Bookkeeping: nothing learned for this state yet
        revisit_state = 0

        # Init q function
        Q_TABLE[state] = {A: VALUE_INIT for A in ACTIONS}

        # Do random action
        action = np.random.choice(ACTIONS, 1)[0]
    else:
        revisit_state = 1

        # Select action according to max q
        action = max(Q_TABLE[state], key=Q_TABLE[state].get)

    # Get q value for action selected
    q = Q_TABLE[state][action]

    return action, q, revisit_state

### Code Snippet 05: Implementation and Refinement:: Deep Q Learning from Single Current Observations

#### Extend Namespace

From the Keras library, import the basic `Sequential` model, with two different layer types: 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

#### Extend Hyperparameters
- STEP_MEM is the number of steps the agent should take into account as the current state it is in: This is the "operational" memory of the agent.
- NUM_HIDDEN_NEURON is the number of neurons in the hidden layer.
- INITIALIZATION is one of the available weight initializations in Keras.
- ACTIVATION is one of the available activation functions in Keras.

In [None]:
# Number of most recent observations stacked into one state (the agent's "operational" memory)
STEP_MEM = 1
# Number of neurons in the hidden layer
NUM_HIDDEN_NEURON = 200
# Name of the Keras weight initialization used for the Dense layers
INITIALIZATION = 'glorot_uniform'
# Name of the Keras activation applied after the hidden layer
ACTIVATION = 'relu'

#### Create the ANN block

In [None]:
def _create_network_1():
    """Build and compile the Q-value network used by the DQN epochs.

    The network has one hidden layer of ``NUM_HIDDEN_NEURON`` units and takes
    ``STEP_MEM * NUM_INPUT`` inputs. The output layer has one linear unit per
    action and is trained with mean squared error, because the DQN targets
    (reward plus discounted future value) are unbounded and can be negative;
    a softmax/cross-entropy head can only produce probabilities.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION, input_shape=(STEP_MEM*NUM_INPUT,)))
    model.add(Activation(ACTIVATION))
    # Linear output layer: one Q-value estimate per action
    model.add(Dense(NUM_ACTION, init=INITIALIZATION))
    # Regress the outputs onto the Q-learning targets
    model.compile(optimizer='rmsprop', loss='mse')
    return model

#### Build training epoch
The epoch build will always follow the same pattern. New elements or changes w.r.t. the previous epoch are indicated in the code comment

In [None]:
def dqn_1(model, render=False):
    """Run one epoch of greedy, online deep Q-learning.

    In every step the action with the highest predicted value is taken and
    the model is fitted on that single transition.

    Args:
        model: Compiled Keras model mapping a state to one value per action.
        render: If true, render the environment before every step.

    Returns:
        dict mapping the episode index to ``[total reward, number of steps
        with reward >= 100, number of steps with reward >= 200]``.
    """
    # Init epoch stats
    stats = {}

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode stats
        stats[episode] = [0]*3

        # Init observations
        x_t = ENV.reset()
        # Pile up STEP_MEM times the same init observation, in order to be consistent with the model input
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():
            if render: ENV.render()

            # Estimate q for each action at s_t
            q = model.predict(s_t[np.newaxis])[0]

            # Take action with highest estimated reward (argmax returns index)
            a_t = np.argmax(q)

            # Observe after action a_t
            x_t, r_t, done, info = ENV.step(a_t)

            # Create state at t1: Put the newest observation first, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Estimate q for each action at s_t1 (again a forward pass)
            Q_sa = model.predict(s_t1[np.newaxis])[0]

            # Create reference/targets by updating the estimate for the chosen action only:
            # replace it by the reward plus the discounted best future estimate,
            # or by the reward alone if the episode ended.
            # (targets is the q array itself, modified in place.)
            targets = q
            targets[a_t] = r_t + GAMMA * np.max(Q_sa) if not done else r_t

            # Learn! The loss compares the model's predictions for s_t with the targets;
            # they differ only for the action taken.
            model.fit(s_t[np.newaxis], targets[np.newaxis], verbose=0)

            # Update statistics: Sum up rewards, count successes, solved
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1

            # Visualize (this purely greedy epoch has no epsilon to report)
            stdout.write("\r{} | Step {} @ Episode {}/{} | Step Reward {}".format(
                EPOCH, step, episode, NUM_EPISODE, round(r_t, 1)))

            # Exit episode after crash or deadlock
            if done or step > MAX_NUM_STEPS: break

            # Update state
            s_t = s_t1

    return stats

### Code Snippet 06: Implementation and Refinement::  *Epsilon Greedy* Deep Q Learning from Single Observations

#### Extend Hyperparameters

- `EPSILON_RANGE` is an interval between the maximum and minimum `epsilon` allowed
- The exploration period (`NUM_EXPLORATION_STEP`) is set to roughly 1/10 of the total number of steps. Assuming an average 100 steps per episode lets `epsilon` decrease until Episode 6. If I increase the average to 150, I end up with the desired 10 episodes until minimum `epsilon`. 

In [None]:
# [maximum, minimum] exploration rate: epsilon starts at the first entry and is annealed down to the second
EPSILON_RANGE = [0.1, 0.0001]
# Number of steps over which epsilon is annealed linearly from maximum to minimum
NUM_EXPLORATION_STEP = NUM_EPISODE * 15

#### Build Training Epoch

In [None]:
def dqn_2(model, render=False):
    """Run one epoch of epsilon-greedy, online deep Q-learning.

    Works like the purely greedy epoch, except that with probability epsilon
    a random action is taken. Epsilon starts at ``EPSILON_RANGE[0]`` and is
    lowered by a fixed amount on every step until it reaches
    ``EPSILON_RANGE[1]``; it is not reset between episodes.

    Args:
        model: Compiled Keras model mapping a state to one value per action.
        render: If true, render the environment before every step.

    Returns:
        dict mapping the episode index to ``[total reward, number of steps
        with reward >= 100, number of steps with reward >= 200]``.
    """
    stats = {}

    # The exploration rate starts at its maximum value
    exploration_rate = EPSILON_RANGE[0]

    for episode in range(NUM_EPISODE):
        # [total reward, successful steps, solved steps]
        episode_stats = [0, 0, 0]
        stats[episode] = episode_stats

        # The initial state repeats the first observation STEP_MEM times
        state = np.tile(ENV.reset(), STEP_MEM)

        for step in count():
            if render:
                ENV.render()

            # Linearly anneal the exploration rate, but never below its minimum
            annealing_step = (EPSILON_RANGE[0] - EPSILON_RANGE[1]) / NUM_EXPLORATION_STEP
            exploration_rate = max(exploration_rate - annealing_step, EPSILON_RANGE[1])

            # Value estimate for each action in the current state
            action_values = model.predict(state[np.newaxis])[0]

            # Epsilon-greedy policy: exploit with probability 1 - epsilon, explore otherwise
            if np.random.random() > exploration_rate:
                action = np.argmax(action_values)
            else:
                action = np.random.choice(ACTIONS, 1)[0]

            observation, reward, done, info = ENV.step(action)

            # Next state: newest observation first, earliest one dropped
            next_state = np.concatenate((observation, state[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Value estimate for each action in the next state
            next_action_values = model.predict(next_state[np.newaxis])[0]

            # The targets equal the predictions except for the action taken, which gets
            # the reward plus the discounted best future estimate (reward only at the end)
            if done:
                action_values[action] = reward
            else:
                action_values[action] = reward + GAMMA * np.max(next_action_values)

            # Learn from this single transition
            model.fit(state[np.newaxis], action_values[np.newaxis], verbose=0)

            # Sum up rewards, count successes and solved
            episode_stats[0] += reward
            if reward >= 100:
                episode_stats[1] += 1
            if reward >= 200:
                episode_stats[2] += 1

            # Progress line
            stdout.write("\r{} | Step {} @ Episode {}/{} | Epsilon {} | Step Reward {}".format(
                EPOCH, step, episode, NUM_EPISODE, exploration_rate, round(reward, 1)))

            # Leave the episode after crash or deadlock
            if done or step > MAX_NUM_STEPS:
                break

            state = next_state

    return stats

### Code Snippet 07: Implementation and Refinement::   Deep Q Learning *from Stored Experiences* 

#### Extend namespace

For the ERM, I need a fast queue

In [None]:
from collections import deque

#### Extend Hyperparameters

- `ERM_SIZE` denotes the size of the experience replay memory (`ERM`) to sample from. It is set equal to the number of exploration steps.
- `BATCH_SIZE` denotes the size of the sample drawn from `ERM`.

In [None]:
# Maximum number of transitions kept in the experience replay memory
ERM_SIZE = NUM_EXPLORATION_STEP
# Maximum number of transitions drawn from the memory per training step
BATCH_SIZE = 32

#### Build Training Epoch

In [None]:
def dqn_3(model, render=False):
    """Run one epoch of epsilon-greedy deep Q-learning with experience replay.

    Every transition is stored in a bounded replay memory. After each step
    the model is trained on a random minibatch drawn from that memory.
    Epsilon stays at its maximum until the memory is full and is annealed
    linearly afterwards.

    Args:
        model: Compiled Keras model mapping a state to one value per action.
        render: If true, render the environment before every step.

    Returns:
        dict mapping the episode index to ``[total reward, number of steps
        with reward >= 100, number of steps with reward >= 200, index of the
        last step]``.
    """
    # Init epsilon and ERM
    epsilon = EPSILON_RANGE[0]
    ERM = deque(maxlen=ERM_SIZE)

    # Init epoch stats
    stats = {}

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode stats, including the episode length
        stats[episode] = [0]*3 + [1]

        # Init observations
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():
            if render: ENV.render()

            # Linearly anneal epsilon; annealing starts only once the ERM is full
            if len(ERM) < ERM_SIZE: epsilon = EPSILON_RANGE[0]
            else: epsilon = max(epsilon - (EPSILON_RANGE[0] - EPSILON_RANGE[1]) / NUM_EXPLORATION_STEP, EPSILON_RANGE[1])

            # Estimate q for each action at s_t
            q = model.predict(s_t[np.newaxis])[0]

            # Take action with highest estimated reward with probability 1-epsilon ("epsilon greedy" policy)
            a_t = np.argmax(q) if np.random.random() > epsilon else np.random.choice(ACTIONS, 1)[0]

            # Observe after action a_t
            x_t, r_t, done, info = ENV.step(a_t)

            # Create state at t1: Put the newest observation first, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Store the transition together with its terminal flag, so that replayed
            # terminal transitions are not bootstrapped from a state that has no future
            ERM.append((s_t, a_t, r_t, s_t1, done))

            # Choose a batch of maximum length BATCH_SIZE (drawn with replacement)
            batch_indices = np.random.choice(np.arange(0, len(ERM)), min(len(ERM), BATCH_SIZE))
            minibatch = [ERM[i] for i in batch_indices]

            # Estimate the action values at s_t and s_t1 for the whole batch in two forward passes
            inputs = np.array([m[0] for m in minibatch])
            next_states = np.array([m[3] for m in minibatch])
            targets = model.predict(inputs)
            next_q = model.predict(next_states)

            # Compute targets: they differ from the predictions only for the action taken
            for row, (_, m_a, m_r, _, m_done) in enumerate(minibatch):
                targets[row, m_a] = m_r if m_done else m_r + GAMMA * np.max(next_q[row])

            # Train the model by backpropagating the errors and update weights
            model.train_on_batch(inputs, targets)

            # Update statistics: Sum up rewards, count successes, solved, track last step
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1
            stats[episode][3] = step

            # Visualize
            stdout.write("\r{} | Step {} @ Episode {}/{} | Epsilon {} | Step Reward {}".format(
                EPOCH, step, episode, NUM_EPISODE, epsilon, round(r_t, 1)))

            # Exit episode after crash or deadlock
            if done or step > MAX_NUM_STEPS: break

            # Update state
            s_t = s_t1

    return stats

### Code Snippet 08: Implementation and Refinement::   Preparations for Policy Gradient Methods

#### Extend Namespace

In [None]:
from keras.engine import training
import tensorflow as tf

#### Build Keras Models

In [None]:
def _create_network_2(model_type): 
    model = Sequential()
    
    if model_type == 'value':
        # Regression function estimate for calculating the advantage
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_V, input_dim=STEP_MEM*NUM_INPUT)) 
        model.add(Activation(ACTIVATION_V))
        model.add(Dense(1, init=INITIALIZATION_V))
        model.compile(optimizer='rmsprop', loss='mse')
    
    if model_type == 'policy':
        model.add(Dense(NUM_HIDDEN_NEURON, init=INITIALIZATION_P, input_dim=STEP_MEM*NUM_INPUT)) 
        model.add(Activation(ACTIVATION_P))
        model.add(Dense(NUM_ACTION, init=INITIALIZATION_P))
        model.add(Activation('softmax'))
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

#### Build Helper Function for Non-Standard Gradient Estimation

In [None]:
def get_trainable_params(model):
    """Collect the trainable weights of all layers of ``model`` into one flat list.

    The layers are visited in the order of ``model.layers``.
    """
    collected = []
    for current_layer in model.layers:
        collected.extend(training.collect_trainable_weights(current_layer))
    return collected

### Code Snippet 09: Implementation and Refinement:: Monte Carlo Policy Gradient

#### Build Training Epoch

In [None]:
def monte_carlo_policy_gradient(model_v, model_p, render=False):
    """Run one epoch of Monte Carlo policy gradient (REINFORCE with baseline).

    Each episode is played to its end with actions sampled from the policy
    network. Afterwards, for every step of the episode, the value network is
    fitted on the discounted return that followed the step, and the policy
    network is updated with the advantage (return minus value estimate).

    Args:
        model_v: Compiled Keras value network with a single output.
        model_p: Compiled Keras policy network with a softmax output over the
            actions. The policy update relies on it being compiled with
            categorical cross-entropy, as ``_create_network_2('policy')`` does.
        render: If true, render the environment before every step.

    Returns:
        dict mapping the episode index to ``[total reward, number of steps
        with reward >= 100, number of steps with reward >= 200, index of the
        last step]``.
    """
    # Init epoch stats
    stats = {}

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode transitions & stats
        transitions = []
        stats[episode] = [0]*3 + [1]

        # Init observations
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():
            if render: ENV.render()

            # Estimate probabilities for each possible action
            probs = model_p.predict(s_t[np.newaxis])[0]

            # Take an action: Sample an action from the probabilities distribution
            a_t = np.random.choice(ACTIONS, p=probs)

            # Observe after action a_t
            x_t, r_t, done, info = ENV.step(a_t)

            # Create state at t1: Put the newest observation first, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Keep track of the episode transition
            transitions.append((s_t, a_t, r_t))

            # Update statistics: Sum up rewards, count successes, solved, track last step
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1
            stats[episode][3] = step

            # Visualize (there is no epsilon in this epoch)
            stdout.write("\r{} | Step {} @ Episode {}/{} | Step Reward {}".format(
                EPOCH, step, episode, NUM_EPISODE, round(r_t, 1)))

            # Exit episode after crash or deadlock
            if done or step > MAX_NUM_STEPS: break

            # Update state
            s_t = s_t1

        # Total discounted reward following each step, accumulated backwards in one pass:
        # G_t = r_t + GAMMA * G_{t+1}
        returns = [0.0] * len(transitions)
        running_return = 0.0
        for t in reversed(range(len(transitions))):
            running_return = transitions[t][2] + GAMMA * running_return
            returns[t] = running_return

        # Update policy after each episode
        # Low bias, high variance
        for (state, action, _), target in zip(transitions, returns):

            # Update value estimator
            model_v.fit(state[np.newaxis], np.asarray([target]), verbose=0)

            # Estimate baseline for action taken
            baseline = model_v.predict(state[np.newaxis])[0][0]

            # Calculate advantage for action taken
            advantage = target - baseline

            # Update policy estimator: with the advantage placed at the index of the action
            # taken, the cross-entropy loss becomes -advantage * log(probability of that
            # action), i.e. the policy gradient loss. Unlike calling tf.gradients on a
            # NumPy number, this actually changes the policy weights.
            policy_target = np.zeros((1, NUM_ACTION))
            policy_target[0, action] = advantage
            model_p.train_on_batch(state[np.newaxis], policy_target)

    return stats

### Code Snippet 10: Implementation and Refinement:: Action-Value (Q) Actor-Critic

#### Build Training Epoch

In [None]:
def q_actor_critic(model_v, model_p, render=False):
    """Run one epoch of actor-critic learning with a bootstrapped critic.

    Actions are sampled from the policy network (actor). After every step the
    value network (critic) is fitted on the one-step target, and the policy
    network is updated with the advantage (target minus value estimate).

    Args:
        model_v: Compiled Keras value network with a single output.
        model_p: Compiled Keras policy network with a softmax output over the
            actions. The policy update relies on it being compiled with
            categorical cross-entropy, as ``_create_network_2('policy')`` does.
        render: If true, render the environment before every step.

    Returns:
        dict mapping the episode index to ``[total reward, number of steps
        with reward >= 100, number of steps with reward >= 200, index of the
        last step]``.
    """
    # Init epoch stats
    stats = {}

    # Play through episodes
    for episode in range(NUM_EPISODE):
        # Init episode stats
        stats[episode] = [0]*3 + [1]

        # Init observations
        x_t = ENV.reset()
        s_t = np.tile(x_t, STEP_MEM)
        done = False

        # Start an episode
        for step in count():
            if render: ENV.render()

            # Estimate probabilities for each possible action
            probs = model_p.predict(s_t[np.newaxis])[0]

            # Take an action: Sample an action from the returned probabilities distribution
            a_t = np.random.choice(ACTIONS, p=probs)

            # Observe after action a_t
            x_t, r_t, done, info = ENV.step(a_t)

            # Create state at t1: Put the newest observation first, throw away the earliest
            s_t1 = np.concatenate((x_t, s_t[:(STEP_MEM-1) * NUM_INPUT,]), axis=0)

            # Bootstrapping from a value function during the episode: high bias, low variance
            # Target: reward plus discounted value estimate of s_t1;
            # a terminal transition has no future, so its target is the reward alone
            Q_sa = model_v.predict(s_t1[np.newaxis])[0][0]
            target = r_t if done else r_t + GAMMA * Q_sa

            # Update value estimator
            model_v.fit(s_t[np.newaxis], np.asarray([target]), verbose=0)

            # Estimate baseline for action taken
            baseline = model_v.predict(s_t[np.newaxis])[0][0]

            # Calculate advantage for action taken
            advantage = target - baseline

            # Update policy estimator: with the advantage placed at the index of the action
            # taken, the cross-entropy loss becomes -advantage * log(probability of that
            # action), i.e. the policy gradient loss. Unlike calling tf.gradients on a
            # NumPy number, this actually changes the policy weights.
            policy_target = np.zeros((1, NUM_ACTION))
            policy_target[0, a_t] = advantage
            model_p.train_on_batch(s_t[np.newaxis], policy_target)

            # Update statistics: Sum up rewards, count successes, solved, track last step
            stats[episode][0] += r_t
            if r_t >= 100: stats[episode][1] += 1
            if r_t >= 200: stats[episode][2] += 1
            stats[episode][3] = step

            # Visualize (there is no epsilon in this epoch)
            stdout.write("\r{} | Step {} @ Episode {}/{} | Step Reward {}".format(
                EPOCH, step, episode, NUM_EPISODE, round(r_t, 1)))

            # Exit episode after crash or deadlock
            if done or step > MAX_NUM_STEPS: break

            # Update state
            s_t = s_t1

    return stats

### Code Snippet 11: Model Evaluation and Validation:: Setting up the infrastructure

This shell file represents the last version which I used to set up the infrastructure on a pre-configured Bitfusion Ubuntu 14 TensorFlow instance. The steps for setting up Jupyter, TensorFlow, and the NVIDIA drivers are omitted.

    ## AWS Marketplace
    ## Bitfusion Ubuntu 14 TensorFlow - Ubuntu 14.04 LTS (GNU/Linux 3.13.0-95-generic x86_64)
    ## awsmrkt-bfboost-ubuntu14-cuda75-TensorFlow-2016-09-13-130716-dd1e96f9-9ede-4ff5-be40-3419bfca03a3-ami-ac4635bb.3 (ami-94b14cfb)

    ## GO TO SERVER
    ssh -i "my_aws_keys.pem" ubuntu@ec2-35-156-49-218.eu-central-1.compute.amazonaws.com

    # Install the basics
    sudo apt-get update
    sudo apt-get upgrade
    sudo pip install --upgrade pip
    sudo apt-get install -y python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev
    
    # Install the proper swig for Box2D to work
    sudo apt-get install swig3.0
    sudo rm /usr/bin/swig
    sudo ln -s /usr/bin/swig3.0 /usr/bin/swig
    
    # Re-install Box2D
    git clone https://github.com/pyBox2D/pyBox2D.git
    cd pyBox2D
    sudo python setup.py clean
    sudo python setup.py build
    sudo python setup.py install
    
    # Install Gym
    cd ~
    git clone https://github.com/openai/gym.git
    cd gym
    sudo pip install -e '.[all]'

    # Modify the Box2D init in the Gym envs
    cd ~
    nano gym/gym/envs/box2d/__init__.py
    # comment all but lunar lander out
    
    # Delete pip version of Box2D
    sudo pip uninstall Box2D-py

    ## GO TO LOCAL
    # Send files to server
    scp -i my_aws_keys.pem ~/CloudStation/Hack/Python/Udacity/MLE/06P_Capstone/04_DQN_PG/Capstone__Reinforcement_Learning.ipynb ubuntu@ec2-35-156-49-218.eu-central-1.compute.amazonaws.com:~/pynb/Capstone__Reinforcement_Learning.ipynb

    ## GO TO SERVER
    # Start notebook
    cd pynb
    jupyter notebook
    #xvfb-run -s "-screen 0 1400x900x24" /bin/bash
    # run without render()

    ## GO TO LOCAL
    # Open notebook (NOT IN SAFARI; use instance ID as password)
    http://ec2-35-156-49-218.eu-central-1.compute.amazonaws.com:8888

    # Send files to local
    scp -i my_aws_keys.pem ubuntu@ec2-35-156-49-218.eu-central-1.compute.amazonaws.com:~/pynb/Capstone__Reinforcement_Learning.ipynb ~/CloudStation/Hack/Python/Udacity/MLE/06P_Capstone/04_DQN_PG/Capstone__Reinforcement_Learning.ipynb

#### Extend Namespace

In [None]:
import json
from os import path, makedirs

#### Define Save Path

In [None]:
# Directory below the user's home directory in which the results are stored
SAVE_PATH = path.join(path.expanduser("~"), "RL_Saves")

try:
    makedirs(SAVE_PATH)
except OSError:
    # An already existing directory is fine; any other failure is passed on
    if not path.isdir(SAVE_PATH):
        raise

#### Create Helper Function to Produce Statistics

In [None]:
def _produce_statistics(stats, stats_file=None, epoch_account=None):
    """Summarize the statistics of one epoch, print them and optionally store them.

    Args:
        stats: dict mapping the episode index to a list whose first three
            entries are total reward, successful steps and solved steps.
        stats_file: Path of the JSON file to write. Nothing is written if it
            is empty or if the epoch had no successful step.
        epoch_account: dict collecting the summaries of all epochs, keyed by
            the global ``EPOCH`` id. It is updated in place and dumped to
            ``stats_file`` as a whole. If omitted, a fresh dict is used.
    """
    # Calculate cumulative epoch statistics
    highest_reward = max(v[0] for v in stats.values())
    successful_steps = sum(v[1] for v in stats.values())
    solved_steps = sum(v[2] for v in stats.values())

    # Visualize
    stdout.write("\rEpoch {}, Maximum Reward {}, Successful Episodes {}, Solved Episodes {}".format(
        EPOCH, highest_reward, successful_steps, solved_steps))

    # Write down working settings
    if stats_file and successful_steps > 0:
        # Without an account to extend, start a new one instead of failing on None
        if epoch_account is None: epoch_account = {}
        epoch_account[EPOCH] = [highest_reward, successful_steps, solved_steps]
        with open(stats_file, "w") as outfile: json.dump(epoch_account, outfile)

#### Create Helper Function to Get Statistics

In [None]:
def _get_statistics(stats_file):
    stats_file_name = path.basename(stats_file)
    best_trial = ('NA', -10000, 0)
    with open(stats_file) as json_file: 
        stats = json.load(json_file)
        for k, v in stats.items():
            if v[2] > 0: print "\nSolved!: {} | {} | {}".format(stats_file_name, k, v)
            if v[1] > best_trial[2] and v[0] > best_trial[1]: best_trial = (k, v[0], v[1])
    return "\nBest trial without solved: {} | {}".format(stats_file_name, best_trial)   

### Code Snippet 12: Model Evaluation and Validation:: From Q Learning to Deep Q Learning

In [None]:
# Build the network for this run
model = _create_network_1()

# Compose the model id from the settings that define this run.
# For tabular Q learning, use <EPOCH = "Q-Learning"> instead.
EPOCH = '_'.join((
    repr(STEP_MEM),
    str(NUM_HIDDEN_NEURON),
    INITIALIZATION,
    ACTIVATION,
    str(round(GAMMA, 2)),
))

# Train model / Play epoch
# Put dqn_1, dqn_2 or dqn_3 here according to the model to be evaluated.
# train_ql takes no model: call it as train_ql().
stats = dqn_3(model)

# Summarize the epoch on screen (nothing is written to disk here)
_produce_statistics(stats)

### Code Snippet 13: Justification:: Parameter Space Exploration for Deep Q Learning

#### Extend Hyperparameters

In [None]:
# Epsilon values to be considered
EPSILON_RANGE = [0.2, 0.01]

# Hidden layer sizes 100, 200, ..., 600 (constant over one epoch)
NUM_HIDDEN_NEURON_RANGE = list(range(100, 601, 100))

# STEP_MEM values 1, 2, ..., 7 (constant over one epoch)
STEP_MEM_RANGE = np.arange(1, 8)

#### Calculate the Parameter Space

In [None]:
# Number of (STEP_MEM, NUM_HIDDEN_NEURON) combinations visited by the sweep below.
# Single-argument print(...) prints the same on Python 2 and also parses on Python 3.
print(len(STEP_MEM_RANGE) * len(NUM_HIDDEN_NEURON_RANGE))

#### Parameter Space Exploration

In [None]:
# Account of all configurations that produced at least one success
epoch_account = {}

# File the account is written to
stats_file = path.join(SAVE_PATH, "DQN_Stats.json")

# Brute force over every combination of step memory and hidden layer size
for STEP_MEM in STEP_MEM_RANGE:
    for NUM_HIDDEN_NEURON in NUM_HIDDEN_NEURON_RANGE:

        # Configurations whose network cannot be built are skipped without any message
        try:
            model = _create_network_1()
        except Exception:
            continue

        # The model id names the configuration in the output and in the stats file
        EPOCH = '_'.join([
            repr(STEP_MEM),
            str(NUM_HIDDEN_NEURON),
            INITIALIZATION,
            ACTIVATION,
        ])

        # Train model / Play epoch (dqn_3 also accepts render=True)
        stats = dqn_3(model)

        # Print the summary and persist it if the epoch was successful
        _produce_statistics(stats, stats_file, epoch_account)

#### Identify Working Parameter Settings

In [None]:
# Report solved and best trials of the DQN sweep.
# Single-argument print(...) prints the same on Python 2 and also parses on Python 3.
print(_get_statistics(stats_file))

### Code Snippet 14: Justification:: Parameter Space Exploration for Policy Gradient Methods, Step 1

#### Extend Hyperparameters

In [None]:
# Keras weight initializations to be swept
INITIALIZATIONS = [
    'normal',
    'he_normal',
    'glorot_uniform',
    'uniform',
    'lecun_uniform',
    'identity',
    'orthogonal',
    'zero',
    'glorot_normal',
    'he_uniform',
]

# Keras activation functions to be swept
ACTIVATIONS = [
    'softplus',
    'softsign',
    'relu',
    'tanh',
    'sigmoid',
    'hard_sigmoid',
    'linear',
]

#### Calculate the Parameter Space

In [None]:
# Number of combinations visited by the sweep below: initializations and activations
# are chosen independently for the value and the policy network, hence the squares.
# Single-argument print(...) prints the same on Python 2 and also parses on Python 3.
print(len(STEP_MEM_RANGE) * len(NUM_HIDDEN_NEURON_RANGE) * len(INITIALIZATIONS)**2 * len(ACTIVATIONS)**2)


#### Parameter Space Exploration

In [None]:
# Local import: product() flattens the six nested sweep loops into a single loop
from itertools import product

# Init epoch overview (one account per algorithm, in the same order as stats_files)
epoch_accounts = [{}, {}]

# Set output files
stats_files = [path.join(SAVE_PATH, "MCPG_Stats.json"), path.join(SAVE_PATH, "QAC_Stats.json")]

# Apply brute force parameter space exploration.
# product() varies the rightmost factor fastest, i.e. it visits the combinations in
# exactly the order of nested loops written from left (outermost) to right (innermost).
parameter_space = product(
    STEP_MEM_RANGE,
    NUM_HIDDEN_NEURON_RANGE,
    INITIALIZATIONS,
    ACTIVATIONS,
    INITIALIZATIONS,
    ACTIVATIONS,
)
for STEP_MEM, NUM_HIDDEN_NEURON, INITIALIZATION_V, ACTIVATION_V, INITIALIZATION_P, ACTIVATION_P in parameter_space:
    # Initialize models; configurations whose networks cannot be built are skipped without any message
    # (presumably _create_network_2 picks up the loop variables as globals - TODO confirm)
    try:
        model_v, model_p = _create_network_2('value'), _create_network_2('policy')
    except Exception:
        continue

    # Create model id
    EPOCH = '_'.join([repr(STEP_MEM), str(NUM_HIDDEN_NEURON), INITIALIZATION_V, INITIALIZATION_P, ACTIVATION_V, ACTIVATION_P])

    # Train model / Play epoch
    # NOTE(review): both algorithms receive the same model_v / model_p objects; if training
    # changes them in place, q_actor_critic starts from the models left behind by
    # monte_carlo_policy_gradient - confirm this is intended
    double_stats = (monte_carlo_policy_gradient(model_v, model_p), q_actor_critic(model_v, model_p))

    # Produce statistics
    for i, stats in enumerate(double_stats):
        _produce_statistics(stats, stats_files[i], epoch_accounts[i])

#### Identify Working Parameter Settings

In [None]:
# Report solved and best trials for each policy gradient stats file.
# Single-argument print(...) prints the same on Python 2 and also parses on Python 3.
for stat_file in stats_files:
    print(_get_statistics(stat_file))

### Code Snippet 15: Justification:: Parameter Space Exploration for Action-Value (Q) Actor-Critic Policy Gradients, Step 2

#### Alter Hyperparameters

In [None]:
# Value and policy network share the same fixed initialization and activation
INITIALIZATION_V = INITIALIZATION_P = 'he_uniform'
ACTIVATION_V = ACTIVATION_P = 'softplus'

#### Parameter Space Exploration

In [None]:
# Init epoch overview
epoch_account = {}

# Set output file
stats_file = path.join(SAVE_PATH, "QAC_Stats_V2.json")

# Apply brute force parameter space exploration
for STEP_MEM in STEP_MEM_RANGE:
    for NUM_HIDDEN_NEURON in NUM_HIDDEN_NEURON_RANGE:
        # Initialize models. A failure to build a network aborts the sweep:
        # the former "try ... except Exception: raise" wrapper only re-raised
        # the exception and therefore had no effect.
        model_v, model_p = _create_network_2('value'), _create_network_2('policy')

        # Create model id
        EPOCH = '_'.join([repr(STEP_MEM), str(NUM_HIDDEN_NEURON), INITIALIZATION_V, INITIALIZATION_P, ACTIVATION_V, ACTIVATION_P])

        # Train model / Play epoch (q_actor_critic also accepts render=True)
        stats = q_actor_critic(model_v, model_p)

        # Print the summary and persist it if the epoch was successful
        _produce_statistics(stats, stats_file, epoch_account)

#### Identify Working Parameter Settings

In [None]:
# Report solved and best trials of the second Q Actor-Critic sweep.
# Single-argument print(...) prints the same on Python 2 and also parses on Python 3.
print(_get_statistics(stats_file))

### Code Snippet 16: Conclusion:: Display Learning Processes

#### Run through long episodes

In [None]:
# Configuration of the long Action-Value (Q) Actor-Critic run
NUM_EPISODE, NUM_HIDDEN_NEURON, STEP_MEM = 1000, 600, 1
INITIALIZATION_V = INITIALIZATION_P = 'he_uniform'
ACTIVATION_V = ACTIVATION_P = 'softplus'

# Build the value network first, then the policy network
model_v = _create_network_2('value')
model_p = _create_network_2('policy')

# The model id names the configuration in the output
EPOCH = '_'.join([
    repr(STEP_MEM),
    str(NUM_HIDDEN_NEURON),
    INITIALIZATION_V,
    INITIALIZATION_P,
    ACTIVATION_V,
    ACTIVATION_P,
])

# Train model / Play epoch and keep the statistics for plotting
stats = q_actor_critic(model_v, model_p)
with open(path.join(SAVE_PATH, "QAC_long_epoch_stats.json"), "w") as outfile:
    json.dump(stats, outfile)

In [None]:
# Configuration of the long Deep Q Learning run
NUM_EPISODE = 1000
NUM_EXPLORATION_STEP = 20 * NUM_EPISODE
NUM_HIDDEN_NEURON, STEP_MEM = 200, 6
INITIALIZATION, ACTIVATION = 'glorot_uniform', 'relu'

# Build the network
model = _create_network_1()

# The model id names the configuration in the output
EPOCH = '_'.join([
    repr(STEP_MEM),
    str(NUM_HIDDEN_NEURON),
    INITIALIZATION,
    ACTIVATION,
])

# Train model / Play epoch (dqn_3 also accepts render=True) and keep the statistics for plotting
stats = dqn_3(model)
with open(path.join(SAVE_PATH, "DQN_long_epoch_stats.json"), "w") as outfile:
    json.dump(stats, outfile)

#### Plot the learning progress

In [None]:
# Extend namespace
%matplotlib inline
import matplotlib.pyplot as plt

# Read in epoch statistics
    # 1: summed up episode reward
    # 2: # successes
    # 3: # solved
    # 4: # episode steps
    
# Get statistics
stats_files = [path.join(SAVE_PATH, "DQN_long_epoch_stats.json"), path.join(SAVE_PATH, "QAC_long_epoch_stats.json")]

# Prepare plots
fontsize = 14

# Plot
for i, stats_file in enumerate(stats_files):
    with open(path.join(stats_file)) as json_file: 
        epoch_stats = json.load(json_file)

    # Calculate epoch statistics
    episodes = sorted([ int(k)+1 for k, v in epoch_stats.items() ])
    episode_rewards = [ v[0] for k, v in epoch_stats.items() ]
    average_episode_rewards = [ v[0] / v[3] for k, v in epoch_stats.items() ]
    successful_steps = [ v[1] for k, v in epoch_stats.items() ]
    solved_steps = [ v[2] for k, v in epoch_stats.items() ]
    episode_lengths = [ v[3] for k, v in epoch_stats.items() ]

    # Show success
    print "Number of successful steps: {}".format(sum(successful_steps))
    print "Number of solved steps: {}".format(sum(solved_steps))

    # Calculate cumulative epoch statistics
    cumulative_successful_steps = 0
    cumulative_successful_steps_list = []
    for successful_step in successful_steps:
        cumulative_successful_steps += successful_step
        cumulative_successful_steps_list.append(cumulative_successful_steps)
    
    # Generate
    plt.figure(i, figsize=(20, 30), dpi=640)
    if i == 0: plot_title = 'Deep Q Learning'
    else: plot_title = 'Action-Value (Q) Actor-Critic'

    plt.subplot(3,1,1) # numrows, numcols, fignum, where fignum ranges from 1 to numrows*numcols
    plt.plot(episodes, average_episode_rewards, color='lightblue')
    plt.title(plot_title)
    plt.xlabel("Episodes", fontsize=fontsize)
    plt.ylabel('Average Reward', fontsize=fontsize)

    plt.subplot(3,1,2) # numrows, numcols, fignum, where fignum ranges from 1 to numrows*numcols
    plt.plot(episodes, episode_rewards, color='blue')
    plt.title(plot_title)
    plt.xlabel("Episodes", fontsize=fontsize)
    plt.ylabel('Reward', fontsize=fontsize)

    plt.subplot(3,1,3) 
    plt.plot(episodes, cumulative_successful_steps_list, color='violet')
    plt.title(plot_title)
    plt.xlabel("Episodes", fontsize=fontsize)
    plt.ylabel('# Successes', fontsize=fontsize)