In [1]:
import gym, torch
from torch import nn
import numpy as np
import homework.hw1.load_policy as load_policy
import homework.hw1.tf_util as tf_util
import sys
import tensorflow as tf
import numba
import time

# Standard deviation multiplier of the Gaussian noise added to the averaged
# value estimates before the argmax is taken in the learned-policy tests.
sigma = 0.1

# Number of observation coordinates iterated over by u(); dim_act is not
# referenced in the visible cells (presumably the action dimension - confirm).
dim = 17
dim_act = 6

# Per-coordinate lower / upper bounds used by u() to rescale x[j] onto [0, pi].
# The first 8 entries line up with obs[0:8], the remaining 9 with obs[8:17].
obs_min = np.array([
    -0.25, -0.8, -0.7, -0.7, -0.65, -0.75, -0.95, -0.6,
    -1.5, -3.1, -7.1, -20, -24, -27, -27, -30, -20,
])
obs_max = np.array([
    0.4, 1.65, 0.95, 0.9, 0.95, 0.95, 1.1, 0.75,
    8, 3.4, 7, 19, 25, 22, 25, 32, 26,
])

# Not referenced in the visible cells.
act_norm_init = np.array([
    2.3, 2.1, 1.85, 1.8, 1.5, 2.0,
    2.1, 2.65, 2.3, 2.5, 1.9, 2.3,
])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
"""

INITIALISE ACTION AND OBSERVATION SPACE

"""

# Discrete action set: the first N_a rows of the array stored in HC_actions.npy,
# kept as a list so that actions[i] is the i-th action vector.
N_a = 8
actions_matrix = np.load('HC_actions.npy')
actions = [actions_matrix[row,:] for row in range(N_a)]

In [3]:
"""

NEEDED FUNCTIONS

"""

@numba.njit()
def u(xi,x):
    """Evaluate the function u with coefficients `xi` at the point `x` (fast version).

    Every coordinate is first rescaled with the module-level bounds,
        j_eval = pi*(x[j]-obs_min[j])/(obs_max[j]-obs_min[j]),
    and the returned value is the sum of

    * 1D terms - for each j and each i in 1..N_trunc-1:
      c_j*xi[2*i-2,j,0]*cos(i*j_eval) + c_j*xi[2*i-1,j,0]*sin(i*j_eval),
      with c_j = 2/(obs_max[j]-obs_min[j]);
    * 2D terms - for each pair j<k and each i_1, i_2 in 1..N_trunc-1:
      the four cos/sin products of (i_1*j_eval, i_2*k_eval), weighted by
      xi[4*(i_1+N_trunc*i_2)-4 .. 4*(i_1+N_trunc*i_2)-1, j, k] and by
      c_jk = 4/(obs_max[j]-obs_min[j])/(obs_max[k]-obs_min[k]).

    Parameters
    ----------
    xi : array indexed as xi[n,j,k] (coefficients).
    x  : array indexed as x[j] for j in 0..dim-1 (evaluation point).

    Returns
    -------
    The accumulated sum of all terms.

    Notes
    -----
    dim, obs_min, obs_max and N_trunc are read from module globals. Numba
    treats globals as compile-time constants, so the values in effect when u
    is first called are the ones used; reassigning them afterwards has no
    effect unless u is recompiled. N_trunc must therefore be defined before
    the first call.
    """
    u_sum = 0
    for j in range(dim):
        # Quantities that only depend on j are computed once per coordinate.
        width_j = obs_max[j]-obs_min[j]
        j_eval = np.pi*(x[j]-obs_min[j])/width_j

        # 1D terms
        c_j = 2/width_j
        for i in range(1,N_trunc):
            u_sum += c_j*xi[2*i-2,j,0]*np.cos(i*j_eval)
            u_sum += c_j*xi[2*i-1,j,0]*np.sin(i*j_eval)

        # 2D terms (each unordered pair j<k is visited once)
        for k in range(j+1,dim):
            width_k = obs_max[k]-obs_min[k]
            k_eval = np.pi*(x[k]-obs_min[k])/width_k
            c_jk = 4/width_j/width_k
            for i_1 in range(1,N_trunc):
                cos_j = np.cos(i_1*j_eval)
                sin_j = np.sin(i_1*j_eval)
                for i_2 in range(1,N_trunc):
                    cos_k = np.cos(i_2*k_eval)
                    sin_k = np.sin(i_2*k_eval)
                    # The four coefficients of this frequency pair are stored contiguously.
                    base = 4*(i_1+N_trunc*i_2)
                    u_sum += c_jk*xi[base-4,j,k]*cos_j*cos_k
                    u_sum += c_jk*xi[base-3,j,k]*cos_j*sin_k
                    u_sum += c_jk*xi[base-2,j,k]*sin_j*cos_k
                    u_sum += c_jk*xi[base-1,j,k]*sin_j*sin_k
    return u_sum
    
def policy(obs):
    """Query the loaded expert policy for a single observation.

    `obs[None,:]` prepends a length-1 axis before the observation is handed to
    `policy_fn`; whatever `policy_fn` returns is passed back unchanged.
    """
    batched_obs = obs[None,:]
    return policy_fn(batched_obs)
# Load the expert policy once when the cell runs; `policy` calls the resulting `policy_fn`.
# NOTE(review): this is an absolute, machine-specific path - the notebook will not
# run elsewhere without editing it.
print('loading and building expert policy')
expert_policy_file = '/Users/torbensell/Dropbox (Cambridge University)/UNI/CAM/PhD/Programme/BIRL_NN_pCN/homework/hw1/experts/HalfCheetah-v2.pkl'
policy_fn = load_policy.load_policy(expert_policy_file)
print('loaded and built')

def progressBar(value, endvalue, bar_length=40):
    """Progress bar to know how much longer one has to wait.

    Rewrites the current stdout line (leading carriage return) with a bar of
    `bar_length` characters and the rounded percentage.

    Parameters
    ----------
    value      : progress so far.
    endvalue   : value that corresponds to 100%.
    bar_length : width of the bar between the brackets.

    The completed fraction is clamped to [0, 1], so overshooting values never
    print more than 100% or a bar wider than `bar_length`. A non-positive
    `endvalue` is shown as complete instead of raising ZeroDivisionError.
    """
    if endvalue > 0:
        percent = min(max(float(value) / endvalue, 0.0), 1.0)
    else:
        percent = 1.0
    filled = int(round(percent * bar_length))
    # The arrow head occupies the last filled cell, hence one dash fewer.
    arrow = '-' * max(filled - 1, 0) + '>'
    spaces = ' ' * max(bar_length - len(arrow), 0)
    sys.stdout.write("\rPercent: [{0}] {1}%".format(arrow + spaces, int(round(percent * 100))))
    sys.stdout.flush()
    
def discrete_action(action_vector, epsilon=0.01):
    """Map `action_vector` onto the discrete action set, with occasional random picks.

    Parameters
    ----------
    action_vector : array compared against every entry of `actions`.
    epsilon       : probability of returning a random action instead of the
                    nearest one (default 0.01, i.e. 1%).

    Returns
    -------
    (index, actions[index]) - the chosen index into `actions` and the action itself.

    One uniform draw decides whether to pick at random; only in that case a
    second draw selects the index uniformly from 0..N_a-1. Otherwise the
    nearest-action search of `discrete_action_no_noise` is used.
    """
    if np.random.uniform()<epsilon:
        rand = np.random.randint(0,N_a)
        return rand,actions[rand]
    return discrete_action_no_noise(action_vector)
    
def discrete_action_no_noise(action_vector):
    """Return the entry of `actions` that is closest to `action_vector`.

    The distance to each of the first N_a actions is measured with
    np.linalg.norm of the difference; the smallest one wins.

    Returns (index, actions[index]).
    """
    gaps = np.array([np.linalg.norm(action_vector-actions[idx]) for idx in range(N_a)])
    nearest = np.argmin(gaps) # index of action
    return nearest,actions[nearest]

loading and building expert policy
obs (1, 17) (1, 17)
loaded and built


In [4]:
"""

TEST 1 (pCN) - check if policy learned behaviour 

"""  


# Distances covered per trial, one list per controller:
# [0] expert policy, [1] pCN samples, [2] pCNL samples.
distances = [[],[],[]]

# Baseline: roll out the discretised, noise-free expert policy and record
# how far the cheetah gets in each trial.
print('Test started: ' + str(time.ctime()))
with tf.Session():
    tf_util.initialize()
    env = gym.make('HalfCheetah-v2')
    env.unwrapped  # NOTE(review): the result is discarded, so `env` is still the object returned by gym.make

    for _trial in range(10):
        obs = env.reset()
        start = env.sim.data.qpos[0]
        for _step in range(100):
            # env.render()
            expert_action = discrete_action_no_noise(policy(obs))[1]
            obs = env.step(expert_action)[0]
        covered = env.sim.data.qpos[0]-start
        print('Distance covered by Cheetah = ' + str(covered) + '\n')
        distances[0].append(covered)
print('optimal policy done')




""" load samples - pCN """

method = 'pCN'
N_data = 100
N_trunc = 5

xi = []
for it in range(1000):
    xi.append(np.load('np_saved/HC/samples_policy_learning/KL_'+str(N_trunc)+'_'+method+'_NData'+str(N_data)+'_sampleNo'+str(it)+'.npy'))
                    
        
""" run tests - pCN """ 

v_a = np.zeros(N_a)
v_a_with_noise = np.zeros(N_a)
    
print('\n\nTest started: ' + str(time.ctime()))
with tf.Session():
    tf_util.initialize()   
    env = gym.make('HalfCheetah-v2')
    env.unwrapped
    
    for _ in range(10):
        for N_data in [100]:
            obs = env.reset()
            errors = 0  # to keep track of how many moves would be the same if using true policy
            corrects = 0
            
            ''' Store values taken to reproduce behaviour '''
            poss = []
            vels = []
            for k in range(100):
                progressBar(k+1,100)

                poss.append(obs[0:8])
                vels.append(obs[8:17])
                poss[k] = np.insert(poss[k],0, env.sim.data.qpos[0] )
                
                ''' See where different actions would take us, evaluate value function there '''
                v = np.zeros(N_a)
                for it in range(1000):
                    for i in range(N_a):
                        x = env.step(actions[i])[0]
                        v[i] += u(xi[it],x)
                        env.set_state(poss[k], vels[k])
                        
                v = v/1000 # to get mean of value function evaluations
                        
                ''' Pick action which maximises value function (plus noise) at the new location '''
                v_with_noise = v+sigma*np.random.normal(np.zeros(N_a),np.ones(N_a)) 
                a_argmax_with_noise = np.argmax(v_with_noise)

                ''' Check if action was optimal according to true policy; move according to learned behaviour  '''
                if a_argmax_with_noise != discrete_action_no_noise(policy(obs))[0]:
                    errors+=1
                else:
                    corrects+=1
                obs = env.step(actions[a_argmax_with_noise])[0] 

            print('\nErrorrate = ',(errors/(errors+corrects)))
#             input("Press Enter to start visualisation...")

            ''' Visualise the cheetah '''
            env.set_state(poss[0], vels[0])
            start = env.sim.data.qpos[0]       
            for k in range(100):
#                 env.render()
                env.set_state(poss[k], vels[k])

            print('\nDistance covered by Cheetah = ' + str(env.sim.data.qpos[0]-start))    
            distances[1].append(env.sim.data.qpos[0]-start)
        
print('\npCN done, '+ str(time.ctime()))






""" load samples - pCNL """

method = 'pCNL'
N_data = 100
N_trunc = 5

xi = []
for it in range(1000):
    xi.append(np.load('np_saved/HC/samples_policy_learning/KL_'+str(N_trunc)+'_'+method+'_NData'+str(N_data)+'_sampleNo'+str(it)+'.npy'))
                    
        
""" run tests - pCNL """ 

v_a = np.zeros(N_a)
v_a_with_noise = np.zeros(N_a)
    
print('\n\nTest started: ' + str(time.ctime()))
with tf.Session():
    tf_util.initialize()   
    env = gym.make('HalfCheetah-v2')
    env.unwrapped
    
    for _ in range(10):
        for N_data in [100]:
            obs = env.reset()
            errors = 0  # to keep track of how many moves would be the same if using true policy
            corrects = 0
            
            ''' Store values taken to reproduce behaviour '''
            poss = []
            vels = []
            for k in range(100):
                progressBar(k+1,100)

                poss.append(obs[0:8])
                vels.append(obs[8:17])
                poss[k] = np.insert(poss[k],0, env.sim.data.qpos[0] )
                
                ''' See where different actions would take us, evaluate value function there '''
                v = np.zeros(N_a)
                for it in range(1000):
                    for i in range(N_a):
                        x = env.step(actions[i])[0]
                        v[i] += u(xi[it],x)
                        env.set_state(poss[k], vels[k])
                        
                v = v/1000 # to get mean of value function evaluations
                        
                ''' Pick action which maximises value function (plus noise) at the new location '''
                v_with_noise = v+sigma*np.random.normal(np.zeros(N_a),np.ones(N_a)) 
                a_argmax_with_noise = np.argmax(v_with_noise)

                ''' Check if action was optimal according to true policy; move according to learned behaviour  '''
                if a_argmax_with_noise != discrete_action_no_noise(policy(obs))[0]:
                    errors+=1
                else:
                    corrects+=1
                obs = env.step(actions[a_argmax_with_noise])[0] 

            print('\nErrorrate = ',(errors/(errors+corrects)))
#             input("Press Enter to start visualisation...")

            ''' Visualise the cheetah '''
            env.set_state(poss[0], vels[0])
            start = env.sim.data.qpos[0]       
            for k in range(100):
#                 env.render()
                env.set_state(poss[k], vels[k])

            print('\nDistance covered by Cheetah = ' + str(env.sim.data.qpos[0]-start))    
            distances[2].append(env.sim.data.qpos[0]-start)
        
print('\npCNL done, '+ str(time.ctime()))


np.save('KL_dist.npy',distances)

Test started: Wed Jul 22 09:35:37 2020
Instructions for updating:
Please use tf.global_variables instead.
Instructions for updating:
Use `tf.variables_initializer` instead.
Distance covered by Cheetah = 8.392748221375138

Distance covered by Cheetah = 5.74872941781014

Distance covered by Cheetah = 7.7337769372504495

Distance covered by Cheetah = 7.63155458302224

Distance covered by Cheetah = 8.13810043245648

Distance covered by Cheetah = 8.273789423876526

Distance covered by Cheetah = 7.86996364533461

Distance covered by Cheetah = 8.126590412484392

Distance covered by Cheetah = 7.020926608952754

Distance covered by Cheetah = 7.743399399117183

optimal policy done


Test started: Wed Jul 22 09:35:43 2020
Percent: [--------------------------------------->] 100%
Errorrate =  0.63

Distance covered by Cheetah = 5.446614594725089
Percent: [--------------------------------------->] 100%
Errorrate =  0.62

Distance covered by Cheetah = 5.201841037455607
Percent: [---------------------

In [5]:
"""

TEST 2 - check how often mean of posterior function predicts the correct value

"""  


""" load samples - pCN """

method = 'pCN'
N_data = 100  
N_trunc = 5

xi = []
for it in range(1000):
    xi.append(np.load('np_saved/HC/samples_policy_learning/KL_'+str(N_trunc)+'_'+method+'_NData'+str(N_data)+'_sampleNo'+str(it)+'.npy'))


""" run tests """

x_test_100 = np.load('HC_x_test_100.npy')
a_test_100 = np.load('HC_a_test_100.npy')
optimal_choice = 0
not_optimal_choice = 0

start = time.time()
for j in range(100):
    v = np.zeros(N_a)
    
    for it in range(1000):
        for i in range(N_a):
            v[i] += u(xi[it],x_test_100[i,:,j])
            
    if a_test_100[j]==np.argmax(v):
        optimal_choice += 1
    else:
        not_optimal_choice += 1
            
print('Correct choices: ',optimal_choice/(optimal_choice+not_optimal_choice))
print('Wrong choices: ',not_optimal_choice/(optimal_choice+not_optimal_choice))
print('Time: ',time.time()-start)
print(str(time.ctime()))

Correct choices:  0.2
Wrong choices:  0.8
Time:  29.1163969039917
Wed Jul 22 10:12:48 2020
