#### TensorFlow installation

`pip3 install tensorflow`

or

`pip3 install tensorflow-gpu`

#### OpenAI Gym installation

On OSX: 

`brew install cmake boost boost-python sdl2 swig wget`
 
On Ubuntu 16.04:

`apt-get install -y python-pyglet python3-opengl zlib1g-dev libjpeg-dev patchelf cmake swig libboost-all-dev libsdl2-dev libosmesa6-dev xvfb ffmpeg`

On Ubuntu 18.04:

`sudo apt install -y python3-dev zlib1g-dev libjpeg-dev cmake swig python-pyglet python3-opengl libboost-all-dev libsdl2-dev libosmesa6-dev patchelf ffmpeg xvfb`

Then:

```
git clone https://github.com/openai/gym.git 

cd gym

pip3 install -e '.[all]'
```

PyBox2D:

```
git clone https://github.com/pybox2d/pybox2d
cd pybox2d
pip3 install -e .
```


#### Duckietown installation

```
git clone https://github.com/duckietown/gym-duckietown.git
cd gym-duckietown
pip3 install -e .
```

#### Roboschool installation

```
git clone https://github.com/openai/roboschool
cd roboschool
ROBOSCHOOL_PATH=`pwd`
git clone https://github.com/olegklimov/bullet3 -b roboschool_self_collision
mkdir bullet3/build
cd    bullet3/build
cmake -DBUILD_SHARED_LIBS=ON -DUSE_DOUBLE_PRECISION=1 -DCMAKE_INSTALL_PREFIX:PATH=$ROBOSCHOOL_PATH/roboschool/cpp-household/bullet_local_install -DBUILD_CPU_DEMOS=OFF -DBUILD_BULLET2_DEMOS=OFF -DBUILD_EXTRAS=OFF  -DBUILD_UNIT_TESTS=OFF -DBUILD_CLSOCKET=OFF -DBUILD_ENET=OFF -DBUILD_OPENGL3_DEMOS=OFF ..

make -j4
make install
cd ../..
pip3 install -e $ROBOSCHOOL_PATH
```

## RL cycle

In [1]:
import gym
import time

# create the environment
env = gym.make("CartPole-v1")
# reset the environment before starting
env.reset()

# take 10 random actions
for i in range(10):
    # take a random action; `done` reports whether the episode has just ended
    obs, rew, done, info = env.step(env.action_space.sample())
    # render the game
    env.render()
    # a random CartPole episode can end within a handful of steps, and
    # stepping an environment that has already returned done=True is not
    # valid: start a new episode instead
    if done:
        env.reset()

# keep the rendered window open for 10 seconds before closing it
time.sleep(10)

# close the environment
env.close()

In [2]:
import gym
import time

# build the CartPole environment and put it in its initial state
env = gym.make("CartPole-v1")
env.reset()

# play 20 episodes, choosing every action at random
for episode in range(20):
    # per-episode bookkeeping
    episode_return = 0
    finished = False

    while not finished:
        # sample a random action and apply it to the environment
        random_action = env.action_space.sample()
        next_obs, step_reward, finished, step_info = env.step(random_action)
        episode_return += step_reward

        # show what we are doing
        env.render()

    # the episode is over: report its cumulative reward and start a new one
    print('Episode %d finished, reward:%d' % (episode, episode_return))
    env.reset()

# close the environment
env.close()

Episode 0 finished, reward:38
Episode 1 finished, reward:31
Episode 2 finished, reward:24
Episode 3 finished, reward:9
Episode 4 finished, reward:19
Episode 5 finished, reward:61
Episode 6 finished, reward:9
Episode 7 finished, reward:16
Episode 8 finished, reward:38
Episode 9 finished, reward:17
Episode 10 finished, reward:22
Episode 11 finished, reward:19
Episode 12 finished, reward:34
Episode 13 finished, reward:11
Episode 14 finished, reward:10
Episode 15 finished, reward:19
Episode 16 finished, reward:33
Episode 17 finished, reward:18
Episode 18 finished, reward:10
Episode 19 finished, reward:20


In [4]:
import gym

env = gym.make('CartPole-v1')
print(env.observation_space) # a gym space object: Discrete (a fixed range of non-negative integers) or Box (an n-dimensional array)

Box(4,)


In [5]:
print(env.action_space) # Discrete(2): the two valid actions are 0 and 1

Discrete(2)


In [3]:
# draw three independent samples from the action space
for draw in range(3):
    print(env.action_space.sample())

0
1
1


In [4]:
print(env.observation_space.low) # per-dimension minimum allowed in the Box observation space

[-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]

In [8]:
print(env.observation_space.high) # per-dimension maximum allowed in the Box observation space

[4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]


## TensorFlow

In [2]:
import tensorflow as tf

# Graph execution through the `tf.compat.v1` API: the tensors are created
# inside the session's `with` block and are only evaluated by `ses.run`.
# (With the TensorFlow 1.x API the session was created with `tf.Session()`.)
with tf.compat.v1.Session() as ses:
    # Build the graph: two float constants and their sum.
    a = tf.constant(5.0)
    b = tf.constant(6.0)
    c = a + b

    # Evaluate the tensor `c` and print the result.
    print(ses.run(c))

11.0


In [3]:
# reset the default graph (TensorFlow 1.x spelling: `tf.reset_default_graph()`)
tf.compat.v1.reset_default_graph()

### Tensor

In [4]:
a = tf.constant(1) # a scalar tensor; tensors are arrays with any number of dimensions
print(a.shape) # a scalar has the empty shape ()

()


In [5]:
# one-dimensional tensor with five elements
b = tf.constant([1,2,3,4,5])
print(b.shape)

(5,)


In [6]:
# Tensors can be sliced and indexed like arrays; the resulting tensors are
# evaluated with `ses.run`.
with tf.compat.v1.Session() as ses:
    a = tf.constant([1,2,3,4,5])
    # slice holding the elements at indices 0, 1 and 2
    first_three_elem = a[:3]
    # single element at index 3
    fourth_elem = a[3]
    print(ses.run(first_three_elem))
    print(ses.run(fourth_elem))

[1 2 3]
4


#### Constant

In [7]:
# constants are immutable tensors; here the dtype and a name are given explicitly
a = tf.constant([1.0, 1.1, 2.1, 3.1], dtype=tf.float32, name='a_const')
print(a)

tf.Tensor([1.  1.1 2.1 3.1], shape=(4,), dtype=float32)


#### Placeholder

In [8]:
with tf.compat.v1.Session() as ses:
    # a placeholder is a tensor whose value is fed at run time
    # (e.g. the input of a model; useful when the number of training examples is not known in advance)
    a = tf.compat.v1.placeholder(shape=(1,3), dtype=tf.float32)
    b = tf.constant([[10,10,10]], dtype=tf.float32)

    c = a + b

    # feed_dict supplies the value of the placeholder `a` for this run
    res = ses.run(c, feed_dict={a:[[0.1,0.2,0.3]]})
    print(res)

[[10.1 10.2 10.3]]


In [9]:
tf.compat.v1.reset_default_graph() # reset the default graph before building the next example

In [10]:
import numpy as np

with tf.compat.v1.Session() as ses:
    # NB: the first dimension is 'None', meaning that it can be of any length
    a = tf.compat.v1.placeholder(shape=(None,3), dtype=tf.float32)
    b = tf.compat.v1.placeholder(shape=(None,3), dtype=tf.float32)

    c = a + b

    print(a)

    # feed one row to each placeholder ...
    print(ses.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))

    # ... and then two rows through the very same placeholders
    v_a = np.array([[1,2,3],[4,5,6]])
    v_b = np.array([[6,5,4],[3,2,1]])
    print(ses.run(c, feed_dict={a:v_a, b:v_b}))

Tensor("Placeholder:0", shape=(None, 3), dtype=float32)
[[10.1 10.2 10.3]]
[[7. 7. 7.]
 [7. 7. 7.]]


In [11]:
with tf.compat.v1.Session() as ses:
    # `a`, `b` and `c` are not defined in this cell: they must already exist
    # from a cell executed earlier; only the session is new
    print(ses.run(c, feed_dict={a:[[0.1,0.2,0.3]], b:[[10,10,10]]}))

[[10.1 10.2 10.3]]


#### Variable

In [12]:
tf.compat.v1.reset_default_graph()

with tf.compat.v1.Session() as ses:
    # a variable is a mutable tensor that can be trained by an optimizer
    # (e.g. the weights and biases of a neural network);
    # this one is initialized with the Glorot uniform initializer
    var = tf.compat.v1.get_variable("first_variable", shape=[1,3], dtype=tf.float32, initializer=tf.initializers.GlorotUniform())

    # variable initialized with constant values
    init_val = np.array([4,5])
    var2 = tf.compat.v1.get_variable("second_variable", shape=[1,2], dtype=tf.int32, initializer=tf.constant_initializer(init_val))

    # initialize all the variables before reading them
    ses.run(tf.compat.v1.global_variables_initializer())

    print(ses.run(var))

    print(ses.run(var2))

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
[[-0.9595651   0.21389782 -0.43376774]]
[[4 5]]


In [13]:
# a variable created with trainable=False (note: this rebinds the name `var2`)
var2 = tf.compat.v1.get_variable("variable", shape=[1,2], trainable=False, dtype=tf.int32)

In [14]:
print(tf.compat.v1.global_variables()) # list the variables returned by global_variables()

[<tf.Variable 'first_variable:0' shape=(1, 3) dtype=float32>, <tf.Variable 'second_variable:0' shape=(1, 2) dtype=int32>]


#### Graph

In [15]:
# a graph represents the low-level computation as dependencies between operations
tf.compat.v1.reset_default_graph()

with tf.compat.v1.Session() as ses:
    const1 = tf.constant(3.0, name='constant1')

    # one trainable and one non-trainable variable of the same shape
    var = tf.compat.v1.get_variable("variable1", shape=[1,2], dtype=tf.float32)
    var2 = tf.compat.v1.get_variable("variable2", shape=[1,2], trainable=False, dtype=tf.float32)

    # chain of dependent operations: scale, add, then reduce to the mean
    op1 = const1 * var
    op2 = op1 + var2
    op3 = tf.reduce_mean(op2)

    # the variables must be initialized before `op3` can be evaluated
    ses.run(tf.compat.v1.global_variables_initializer())
    print(ses.run(op3))

1.7184286


### Simple Linear Regression Example


In [16]:
tf.compat.v1.reset_default_graph()

with tf.compat.v1.Session() as ses:

    # seed both the NumPy and the TensorFlow random generators
    np.random.seed(10)
    tf.compat.v1.set_random_seed(10)

    # slope and intercept used to generate the data
    W, b = 0.5, 1.4
    # create a dataset of 100 examples
    X = np.linspace(0,100, num=100)
    # add random noise to the y labels
    y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))

    # create the placeholders for the inputs and the labels
    x_ph = tf.compat.v1.placeholder(shape=[None,], dtype=tf.float32)
    y_ph = tf.compat.v1.placeholder(shape=[None,], dtype=tf.float32)

    # create the variables that will be fitted
    v_weight = tf.compat.v1.get_variable("weight", shape=[1], dtype=tf.float32)
    v_bias = tf.compat.v1.get_variable("bias", shape=[1], dtype=tf.float32)

    # linear computation
    out = v_weight * x_ph + v_bias

    # compute the Mean Squared Error
    loss = tf.reduce_mean((out - y_ph)**2)

    # Adam optimizer with learning rate 0.4 minimizing the loss
    opt = tf.compat.v1.train.AdamOptimizer(0.4).minimize(loss)

    # initialize the variables (including the ones created by the optimizer)
    ses.run(tf.compat.v1.global_variables_initializer())

    # loop to train the parameters
    for ep in range(210):
        # run the optimizer and get the loss
        train_loss, _ = ses.run([loss, opt], feed_dict={x_ph:X, y_ph:y})

        # print epoch number, loss and current parameters every 40 epochs
        if ep % 40 == 0:
            print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, ses.run(v_weight), ses.run(v_bias)))

    print('Final weight: %.3f, bias: %.3f' % (ses.run(v_weight), ses.run(v_bias)))

Epoch:   0, MSE: 4572.0996, W: 1.295, b: -0.797
Epoch:  40, MSE: 5.8447, W: 0.497, b: -1.052
Epoch:  80, MSE: 4.8327, W: 0.533, b: -0.243
Epoch: 120, MSE: 4.2012, W: 0.515, b: 0.458
Epoch: 160, MSE: 3.8903, W: 0.507, b: 1.004
Epoch: 200, MSE: 3.7699, W: 0.502, b: 1.372
Final weight: 0.501, bias: 1.433


#### .. with TensorBoard

In [27]:
from datetime import datetime

tf.compat.v1.reset_default_graph()

with tf.compat.v1.Session() as ses:

    # seed both the NumPy and the TensorFlow random generators
    np.random.seed(10)
    tf.compat.v1.set_random_seed(10)

    # slope and intercept used to generate the data
    W, b = 0.5, 1.4
    # create a dataset of 100 examples
    X = np.linspace(0,100, num=100)
    # add random noise to the y labels
    y = np.random.normal(loc=W * X + b, scale=2.0, size=len(X))

    # create the placeholders
    x_ph = tf.compat.v1.placeholder(shape=[None,], dtype=tf.float32)
    y_ph = tf.compat.v1.placeholder(shape=[None,], dtype=tf.float32)

    # create the variables.
    v_weight = tf.compat.v1.get_variable("weight", shape=[1], dtype=tf.float32)
    v_bias = tf.compat.v1.get_variable("bias", shape=[1], dtype=tf.float32)

    # linear computation
    out = v_weight * x_ph + v_bias

    # compute the Mean Squared Error
    loss = tf.reduce_mean((out - y_ph)**2)

    # optimizer
    opt = tf.compat.v1.train.AdamOptimizer(0.4).minimize(loss)

    # Register the summaries with the graph-mode (v1) summary API: only these
    # ops are collected by `tf.compat.v1.summary.merge_all()`. The TF2
    # `tf.summary.*` functions are not, which would leave `merge_all()` with
    # nothing to merge.
    tf.compat.v1.summary.scalar('MSEloss', loss)
    tf.compat.v1.summary.histogram('model_weight', v_weight)
    tf.compat.v1.summary.histogram('model_bias', v_bias)
    all_summary = tf.compat.v1.summary.merge_all()

    # one log sub-directory per run, named after the current day and time
    now = datetime.now()
    clock_time = "{}_{}.{}.{}".format(now.day, now.hour, now.minute, now.second)
    file_writer = tf.compat.v1.summary.FileWriter('log_dir/'+clock_time, tf.compat.v1.get_default_graph())

    try:
        ses.run(tf.compat.v1.global_variables_initializer())

        # loop to train the parameters
        for ep in range(210):
            # run the optimizer, get the loss and the merged summaries
            train_loss, _, train_summary = ses.run([loss, opt, all_summary], feed_dict={x_ph:X, y_ph:y})
            # write the summaries of this epoch to the TensorBoard log
            file_writer.add_summary(train_summary, ep)

            # print epoch number and loss
            if ep % 40 == 0:
                print('Epoch: %3d, MSE: %.4f, W: %.3f, b: %.3f' % (ep, train_loss, ses.run(v_weight), ses.run(v_bias)))

        print('Final weight: %.3f, bias: %.3f' % (ses.run(v_weight), ses.run(v_bias)))
    finally:
        # flush the pending events and release the log file even if training fails
        file_writer.close()

Epoch:   0, MSE: 4572.0996, W: 1.295, b: -0.797
Epoch:  40, MSE: 5.8447, W: 0.497, b: -1.052
Epoch:  80, MSE: 4.8327, W: 0.533, b: -0.243
Epoch: 120, MSE: 4.2012, W: 0.515, b: 0.458
Epoch: 160, MSE: 3.8903, W: 0.507, b: 1.004
Epoch: 200, MSE: 3.7699, W: 0.502, b: 1.372
Final weight: 0.501, bias: 1.433


In [28]:
import numpy as np
import gym

def eval_state_action(V, s, a, gamma=0.99):
    '''
    Expected one-step return of taking action `a` in state `s`.

    Every entry of the module-level `env.P[s][a]` is unpacked as
    (probability, next state, reward, <ignored>); the result is the sum of
    probability * (reward + gamma * V[next state]) over those entries.
    '''
    weighted_returns = [
        prob * (reward + gamma * V[s_next])
        for prob, s_next, reward, _ in env.P[s][a]
    ]
    return np.sum(weighted_returns)

def value_iteration(eps=0.0001):
    '''
    Value iteration algorithm.

    Starts from an all-zero value function over the `nS` states and sweeps
    over them, replacing each value in place with the best one-step value
    over the `nA` actions. Stops after the first sweep whose largest change
    is below `eps`; every other sweep prints its index and its largest
    change. Returns the value array.
    '''
    values = np.zeros(nS)
    sweep = 0
    converged = False

    while not converged:
        largest_change = 0
        # update the value of each state using as "policy" the max operator
        for state in range(nS):
            previous = values[state]
            action_values = [eval_state_action(values, state, a) for a in range(nA)]
            values[state] = np.max(action_values)
            largest_change = max(largest_change, np.abs(previous - values[state]))

        converged = largest_change < eps
        if not converged:
            print('Iter:', sweep, ' delta:', np.round(largest_change, 5))
            sweep += 1

    return values

def run_episodes(env, V, num_games=100):
    '''
    Play `num_games` test games acting greedily with respect to `V`.

    In every state the action with the highest one-step value (as computed
    by `eval_state_action`) is taken. The rewards of all games are summed
    and printed at the end; nothing is returned.
    '''
    collected_reward = 0
    state = env.reset()

    for _ in range(num_games):
        game_over = False
        while not game_over:
            # greedy action for the current state
            action_values = [eval_state_action(V, state, a) for a in range(nA)]
            state, reward, game_over, _ = env.step(np.argmax(action_values))
            collected_reward += reward

        # the game has ended: start the next one from a fresh state
        state = env.reset()

    print('Won %i of %i games!'%(collected_reward, num_games))

            
if __name__ == '__main__':
    # create the environment
    env = gym.make('FrozenLake-v0')
    # unwrap it to get the underlying environment, whose transition table
    # `env.P` is read by eval_state_action
    env = env.unwrapped

    # sizes of the action and state spaces; the functions above read these
    # module-level names directly
    nA = env.action_space.n
    nS = env.observation_space.n

    # Value iteration
    V = value_iteration(eps=0.0001)
    # test the value function on 100 games
    run_episodes(env, V, 100)
    # print the state values as a 4x4 grid
    print(V.reshape((4,4)))



Iter: 0  delta: 0.33333
Iter: 1  delta: 0.1463
Iter: 2  delta: 0.10854
Iter: 3  delta: 0.08717
Iter: 4  delta: 0.06736
Iter: 5  delta: 0.05212
Iter: 6  delta: 0.04085
Iter: 7  delta: 0.03384
Iter: 8  delta: 0.02956
Iter: 9  delta: 0.0268
Iter: 10  delta: 0.02425
Iter: 11  delta: 0.02195
Iter: 12  delta: 0.02061
Iter: 13  delta: 0.01962
Iter: 14  delta: 0.019
Iter: 15  delta: 0.01872
Iter: 16  delta: 0.01825
Iter: 17  delta: 0.01765
Iter: 18  delta: 0.01698
Iter: 19  delta: 0.01627
Iter: 20  delta: 0.01554
Iter: 21  delta: 0.01481
Iter: 22  delta: 0.01408
Iter: 23  delta: 0.01337
Iter: 24  delta: 0.01268
Iter: 25  delta: 0.01202
Iter: 26  delta: 0.01138
Iter: 27  delta: 0.01078
Iter: 28  delta: 0.0102
Iter: 29  delta: 0.00965
Iter: 30  delta: 0.00912
Iter: 31  delta: 0.00863
Iter: 32  delta: 0.00816
Iter: 33  delta: 0.00771
Iter: 34  delta: 0.00729
Iter: 35  delta: 0.00689
Iter: 36  delta: 0.00651
Iter: 37  delta: 0.00615
Iter: 38  delta: 0.00581
Iter: 39  delta: 0.0055
Iter: 40  delta:

In [29]:
import numpy as np
import gym

def eval_state_action(V, s, a, gamma=0.99):
    '''
    Value of taking action `a` in state `s` under the value function `V`.

    Sums probability * (reward + gamma * V[next state]) over the entries of
    the module-level `env.P[s][a]`, whose fourth field is ignored.
    '''
    terms = []
    for prob, s_next, reward, _ in env.P[s][a]:
        terms.append(prob * (reward + gamma * V[s_next]))
    return np.sum(terms)

def policy_evaluation(V, policy, eps=0.0001):
    '''
    Policy evaluation.

    Repeatedly sweeps over the `nS` states, overwriting V[s] in place with
    the value of following `policy` in that state, until the largest change
    seen in a sweep is below `eps`. Nothing is returned; `V` is updated
    in place.
    '''
    settled = False
    while not settled:
        biggest_change = 0
        # loop over all states
        for state in range(nS):
            before = V[state]
            # update V[state] using the Bellman equation
            V[state] = eval_state_action(V, state, policy[state])
            biggest_change = max(biggest_change, np.abs(before - V[state]))

        settled = biggest_change < eps

def policy_improvement(V, policy):
    '''
    Policy improvement.

    For every state, overwrites policy[s] in place with the action that has
    the highest one-step value under `V`. Returns True when no entry of the
    policy changed, False otherwise.
    '''
    changed = False
    for state in range(nS):
        previous_action = policy[state]
        # pick the action that leads to the highest state value
        action_values = [eval_state_action(V, state, a) for a in range(nA)]
        policy[state] = np.argmax(action_values)
        changed = changed or previous_action != policy[state]

    return not changed


def run_episodes(env, policy, num_games=100):
    '''
    Run some games to test a policy.

    env: environment exposing reset() -> state and
         step(action) -> (next_state, reward, done, info)
    policy: sequence indexed by state that holds the action to take
    num_games: number of games to play

    The rewards of all games are summed and printed; nothing is returned.
    '''
    tot_rew = 0
    state = env.reset()

    for _ in range(num_games):
        done = False
        while not done:
            # select the action according to the policy; the policy may be
            # stored in a float array (np.zeros defaults to float64), so the
            # entry is cast to a plain int before being handed to the
            # environment as a discrete action
            action = int(policy[state])
            next_state, reward, done, _ = env.step(action)

            state = next_state
            tot_rew += reward
            if done:
                state = env.reset()

    print('Won %i of %i games!'%(tot_rew, num_games))

            
if __name__ == '__main__':
    # create the environment
    env = gym.make('FrozenLake-v0')
    # unwrap it to get the underlying environment, whose transition table
    # `env.P` is read by eval_state_action
    env = env.unwrapped

    # sizes of the action and state spaces; the functions above read these
    # module-level names directly
    nA = env.action_space.n
    nS = env.observation_space.n
    
    # initializing value function and policy
    # (np.zeros defaults to float64, so the actions in `policy` are stored as floats)
    V = np.zeros(nS)
    policy = np.zeros(nS)

    # loop control: stop once an improvement step leaves the policy unchanged
    policy_stable = False
    it = 0

    # alternate evaluation and improvement until the policy is stable
    while not policy_stable:
        policy_evaluation(V, policy)
        policy_stable = policy_improvement(V, policy)
        it += 1

    print('Converged after %i policy iterations'%(it))
    run_episodes(env, policy)
    # print the state values and the policy as 4x4 grids
    print(V.reshape((4,4)))
    print(policy.reshape((4,4)))

Converged after 7 policy iterations
Won 81 of 100 games!
[[0.54091157 0.49730529 0.46893217 0.4549538 ]
 [0.55745963 0.         0.35758788 0.        ]
 [0.59098844 0.64249454 0.61469305 0.        ]
 [0.         0.74131715 0.86263385 0.        ]]
[[0. 3. 3. 3.]
 [0. 0. 0. 0.]
 [3. 1. 0. 0.]
 [0. 2. 1. 0.]]
