In [1]:
%load_ext autoreload
%autoreload 2

import math
import random
import tempfile
import tensorflow as tf
import time

import numpy as np

from collections import defaultdict

from tf_rl.controller import KerasDDPG
from tf_rl.models     import PolicyMLP, ValueMLP
from tf_rl.simulation import SinglePendulum
from tf_rl            import simulate

from keras import backend as K

# Parameter set passed to the SinglePendulum constructor by the simulation cells.
SINGLE_PENDULUM_PARAMS = dict(
    g_ms2=9.8,              # acceleration due to gravity, in m/s^2
    l1_m=1.0,               # length of pendulum 1 in m
    m1_kg=1.0,              # mass of pendulum 1 in kg
    damping=0.2,
    max_control_input=4.0
)

Using TensorFlow backend.


In [2]:
# Policy network: built from the observation size with layer sizes 200/200/1
# and relu/relu/tanh activations, as passed to PolicyMLP.
actor = PolicyMLP(
    SinglePendulum.observation_size,
    [200, 200, 1],
    ['relu', 'relu', 'tanh'])

# Value network: built from the observation and action sizes with layer sizes
# 200/200/1, relu/relu/linear activations, and the regularizer flag switched on.
critic = ValueMLP(
    SinglePendulum.observation_size,
    SinglePendulum.action_size,
    [200, 200, 1],
    ['relu', 'relu', 'linear'],
    regularizer=True)

In [None]:
# Inspect the critic: print the length of every weight array returned by
# get_weights(), then display the number of entries in the model's `nodes`.
# print(...) is called with a single pre-formatted string so the cell produces
# the same output on Python 2 and Python 3.
for i, w in enumerate(critic.model.get_weights()):
    print('Layer %d: %d' % (i, len(w)))

len(critic.model.nodes)

In [3]:
# Controller used by every simulation cell below; it is given the actor and
# critic networks plus the discount rate and exploration period.
current_controller = KerasDDPG(
    SinglePendulum.observation_size,
    SinglePendulum.action_size,
    actor,
    critic,
    discount_rate=0.99,
    exploration_period=30000)

In [None]:
# Training run: keep building a fresh pendulum and simulating it with training
# enabled until the kernel is interrupted.
fast_mode = False

# FPS feeds simulate(fps=...) and RES feeds simulation_resolution;
# SPEED is assigned but not used anywhere in this cell.
FPS, SPEED, RES = (5, 20.0, 0.03) if fast_mode else (30, 1., 0.01)

try:
    while True:
        # A new simulation object is created for every simulate() call.
        d = SinglePendulum(SINGLE_PENDULUM_PARAMS)
        simulate(
            d,
            current_controller,
            fps=FPS,
            simulation_resolution=RES,
            action_every=4,
            disable_training=False,
            reset_every=3000,
            visualize=True)
except KeyboardInterrupt:
    # Interrupting the kernel is the only way out of the loop above.
    print("Interrupted")

In [None]:
# Symbolic gradient of the critic's 'value_output' with respect to its
# 'action' input; K.gradients returns a list and only its first entry is kept.
c_grad = K.gradients(
    critic.model.get_output(train=False)['value_output'],
    critic.model.get_input(train=False)['action'])[0]

In [None]:
# Split c_grad[0] into five tensors and unpack them.
# NOTE(review): the positional order (1, 5, value) looks like the pre-1.0
# TensorFlow signature tf.split(split_dim, num_split, value) — confirm against
# the installed TensorFlow version before touching this call.
# NOTE(review): c_grad already had [0] applied where it was defined, so
# c_grad[0] indexes it a second time — confirm that is intended.
s1,s2,s3,s4,act_grad = tf.split(1,5,c_grad[0])

In [None]:
# Batch size used by the gradient experiments in the following cells.
minibatch_size = 32

In [None]:
# Wrap the batch size in a backend variable; a later cell uses `minibatch`
# as the divisor when scaling the actor gradients.
minibatch = K.variable(minibatch_size)

In [None]:
# Display the actor's trainable weights — the variables the gradient cells
# below differentiate with respect to.
actor.model.trainable_weights

In [None]:
# One gradient tensor per trainable actor weight: the gradient of the actor
# output summed over axis 1, taken with respect to that weight.
# This has to be a list comprehension. Wrapping a generator expression in
# brackets yields a one-element list holding an unevaluated generator, which
# cannot serve as the `outputs` of K.function.
a_grad_1 = [
    K.gradients(K.sum(actor.model.get_output(train=False), axis=1), z)[0]
    for z in actor.model.trainable_weights
]

In [None]:
# Scale every gradient in a_grad_1 by the minibatch variable. The division is
# done element by element because a Python list as a whole cannot be divided
# by a backend variable.
a_grad_1_2 = [grad / minibatch for grad in a_grad_1]

In [None]:
# Single-output list holding the actor output summed over axis 1; used as the
# `outputs` of the backend function built in the next cell.
simple_test = [
    K.sum(actor.model.get_output(train=False), axis=1)
]

In [None]:
# Backend function from the actor's input to the summed actor output.
simp_test_func = K.function(
    inputs=[actor.model.get_input(train=False)],
    outputs=simple_test)

In [None]:
# Evaluate the summed-output function on a random test batch.
# NOTE(review): in the visible notebook `test_input_a` is only assigned in a
# later cell, so this cell fails with NameError when the notebook is run top
# to bottom on a fresh kernel.
simp_test_func([test_input_a])

In [None]:
# Gradient of the fully summed actor output with respect to the trainable
# weights, requested in a single K.gradients call.
# NOTE(review): the trailing [0] keeps only the first entry of the returned
# list — confirm that dropping the remaining gradients is intended.
a_grad_2 = [
    K.gradients(
        K.sum(actor.model.get_output(train=False)),
        actor.model.trainable_weights)[0]
]

In [None]:
# Backend function that evaluates a_grad_1 given the critic and actor inputs.
# NOTE(review): other cells index critic.model.get_input(train=False) with
# 'state' / 'action', so it appears to be a dict rather than a single
# placeholder — confirm K.function accepts it as an input entry.
test_update_1 = K.function(
    inputs=[
        critic.model.get_input(train=False),
        actor.model.get_input(train=False)
    ],
    outputs=a_grad_1)

In [None]:
# Backend function that evaluates a_grad_2 given the critic and actor inputs.
# NOTE(review): other cells index critic.model.get_input(train=False) with
# 'state' / 'action', so it appears to be a dict rather than a single
# placeholder — confirm K.function accepts it as an input entry.
test_update_2 = K.function(
    inputs=[
        critic.model.get_input(train=False),
        actor.model.get_input(train=False)
    ],
    outputs=a_grad_2)

In [None]:
# Random batch (32 rows, 2 columns, uniform in [0, 1)) fed to the critic's
# 'state' input by the test_cgrad call below.
test_input_cs = np.random.random_sample((32, 2))

In [None]:
# Random batch (32 rows, 1 column, uniform in [0, 1)) fed to the critic's
# 'action' input by the test_cgrad call below.
test_input_ca = np.random.random_sample((32, 1))

In [None]:
# Backend function mapping (state, action) critic inputs to c_grad.
test_cgrad = K.function(
    inputs=[
        critic.model.get_input(train=False)['state'],
        critic.model.get_input(train=False)['action']
    ],
    outputs=[c_grad])

In [None]:
# Random batch (32 rows, 4 columns, uniform in [0, 1)) used as the actor input
# by the simp_test_func call.
# NOTE(review): this has 4 columns while test_input_cs has 2 — confirm which
# width matches SinglePendulum.observation_size.
test_input_a = np.random.random_sample((32, 4))

In [None]:
# Evaluate the critic action-gradient function on the random state/action
# batches defined above.
c_grad_1 = test_cgrad([test_input_cs, test_input_ca])

In [None]:
# Display the evaluated critic gradient.
c_grad_1

In [None]:
# NOTE(review): `p_grad_1` is not assigned anywhere in the visible notebook;
# unless it is defined elsewhere this cell raises NameError on a fresh kernel.
p_grad_1

In [None]:
# NOTE(review): `p_grad_2` is not assigned anywhere in the visible notebook;
# unless it is defined elsewhere this cell raises NameError on a fresh kernel.
p_grad_2

In [None]:
# Ask the controller to plot the critic's value function.
current_controller.plot_critic_value_function()

In [None]:
# Ask the controller to plot the actor's policy.
current_controller.plot_actor_policy()

In [None]:
# Same call as the earlier critic plot cell.
# NOTE(review): presumably kept to re-plot after more training — confirm or
# remove the duplicate.
current_controller.plot_critic_value_function()

In [None]:
# Same call as the earlier actor plot cell.
# NOTE(review): presumably kept to re-plot after more training — confirm or
# remove the duplicate.
current_controller.plot_actor_policy()

In [None]:
# Display the last ten entries of the controller's bellman_error sequence.
current_controller.bellman_error[-10:]

In [None]:
# Print the number of items in the controller's `experience` collection.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(len(current_controller.experience))

In [None]:
# Checkpoint to load into the controller before the evaluation run.
# NOTE(review): this is an absolute, user-specific path, so the cell only
# works on a machine with this exact directory layout.
PENDULUM_CHECKPOINT = '/home/mderry/tensorflow-deepq/notebooks/logs/pendulum_checkpoint_16601'
current_controller.restore_checkpoint(PENDULUM_CHECKPOINT)

In [7]:
# Evaluation run: simulate one pendulum instance over and over with training
# disabled and exploration ignored, until the kernel is interrupted.
fast_mode = False

# FPS feeds simulate(fps=...) and RES feeds simulation_resolution;
# SPEED is assigned but not used anywhere in this cell.
FPS, SPEED, RES = (5, 20.0, 0.03) if fast_mode else (30, 1., 0.01)

# Same values as the parameter dict in the first cell, redefined here so this
# cell does not rely on it.
SINGLE_PENDULUM_PARAMS = dict(
    g_ms2=9.8,              # acceleration due to gravity, in m/s^2
    l1_m=1.0,               # length of pendulum 1 in m
    m1_kg=1.0,              # mass of pendulum 1 in kg
    damping=0.2,
    max_control_input=4.0
)

# The simulation object is built once and reused by every simulate() call.
d = SinglePendulum(SINGLE_PENDULUM_PARAMS)
try:
    while True:
        simulate(
            d,
            current_controller,
            fps=FPS,
            simulation_resolution=RES,
            wait=False,
            action_every=4,
            disable_training=True,
            ignore_exploration=True)
except KeyboardInterrupt:
    # Interrupting the kernel is the only way out of the loop above.
    print("Interrupted")

Interrupted


In [None]:
from pympler import tracker

In [None]:
# Create a pympler SummaryTracker; print_diff() is called on it in the next cell.
mem_tracker = tracker.SummaryTracker()

In [None]:
# Print the tracker's diff report; re-run this cell to see what changed since
# the previous report.
# NOTE(review): exact baseline semantics are pympler's — confirm in its docs.
mem_tracker.print_diff()