In [None]:
import numpy as np
import pickle
from KL_uncertainity_evaluator import Robust_pol_Kl_uncertainity
import time

# Load the per-(state, action) reward table and constraint-cost table.
# SECURITY NOTE: pickle.load can execute arbitrary code; only open trusted files.
# The `with` statement already closes each file, so no explicit close() is needed.
with open('rew_Cartpole.pkl', 'rb') as f:
    R = pickle.load(f)

with open('constraint_cost_Cartpole.pkl', 'rb') as f:
    C = pickle.load(f)

# Shift every reward by a constant 0.1.
# The previous form, np.ones_like(R.shape) * 0.1, built a length-2 vector from
# the *shape tuple* and only broadcast correctly because R has 2 columns.
R = R + 0.1

In [None]:
def softmax(z):
    """Return the softmax of ``z`` taken over all of its elements.

    Parameters
    ----------
    z : array_like
        Logits (any shape; normalisation is over every element).

    Returns
    -------
    numpy.ndarray
        Non-negative weights with the same shape as ``z`` that sum to 1.
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise; keep the historical "empty in, empty out" result.
        return np.exp(z)
    # Subtracting the maximum leaves the result mathematically unchanged but
    # keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
    exp_z = np.exp(z - np.max(z))
    return exp_z / np.sum(exp_z)

def get_policy_from_theta(theta):
    """Convert a table of logits into a row-stochastic softmax policy.

    Parameters
    ----------
    theta : array_like, shape (nS, nA)
        One row of action logits per state.

    Returns
    -------
    numpy.ndarray, shape (nS, nA)
        ``policy[s]`` is the softmax of ``theta[s]``; each row sums to 1.
    """
    theta = np.asarray(theta, dtype=float)
    if theta.size == 0:
        # No states or no actions: nothing to normalise.
        return np.zeros(theta.shape)
    # Row-wise max shift: softmax is invariant to it, and it prevents np.exp
    # from overflowing for large logits.
    shifted = theta - np.max(theta, axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / np.sum(weights, axis=-1, keepdims=True)

def kl_divergence(pi_new, pi_old):
    """Sum over states of KL(pi_new[s] || pi_old[s]).

    Both arguments are indexed by state; each entry is a vector of action
    probabilities. A small constant is added inside each logarithm so that
    zero probabilities do not produce ``-inf``.
    """
    eps = 1e-8
    n_states = len(pi_new)
    # Per-state contributions, computed in state order.
    per_state = [
        np.sum(pi_new[s] * (np.log(pi_new[s] + eps) - np.log(pi_old[s] + eps)))
        for s in range(n_states)
    ]
    total = 0.0
    for term in per_state:
        total += term
    return total

def flatten_grad(grad):
    """Return a one-dimensional copy of ``grad`` (via its ``flatten`` method)."""
    flat = grad.flatten()
    return flat

def reshape_grad(vec, shape):
    """Return ``vec`` viewed with the requested ``shape`` (via ``vec.reshape``)."""
    reshaped = vec.reshape(shape)
    return reshaped

def natural_gradient_update(theta, grad, kl_lambda, alpha,ch_dep,ch,booster):
    """Take one natural-gradient-like step on ``theta``.

    The step solves
        max_{theta_new} grad^T (theta_new - theta) - lambda * KL(theta_new || theta),
    which gives a move of size ``alpha * booster / kl_lambda`` along ``grad``.
    The sign of the move depends on the environment type and on which
    objective was selected:

    * ``ch_dep == 0``: always descend (``theta - step``).
    * ``ch_dep == 1``: ascend when ``ch == 0``, descend otherwise.
    * any other ``ch_dep``: always ascend (``theta + step``).

    Parameters
    ----------
    theta : array_like or float
        Current parameters.
    grad : array_like or float
        Gradient of the selected objective, same shape as ``theta``.
    kl_lambda : float
        KL penalty weight; the step is divided by it.
    alpha : float
        Base step size.
    ch_dep : int
        Environment-type flag controlling the sign convention.
    ch : int
        Index of the objective whose gradient is being followed.
    booster : float
        Multiplier applied to ``alpha``.

    Returns
    -------
    Updated parameters, same shape as ``theta``.
    """
    step = alpha * booster * grad / kl_lambda
    if ch_dep == 0:
        return theta - step
    # The original tested `ch == 1` here and then `ch == 0` inside, which made
    # the inner ascent branch unreachable; the outer test is on the
    # environment type.
    if ch_dep == 1:
        return theta + step if ch == 0 else theta - step
    return theta + step

In [None]:
env_dep = 1 # similar to river-swim type
nS,nA = R.shape
cost_list = [R, C]

# Random initial-state distribution: softmax of standard-normal logits.
init_dist = np.exp(np.random.normal(0,1,nS))
init_dist = init_dist/np.sum(init_dist)
init_dist = init_dist.tolist()
rpe = Robust_pol_Kl_uncertainity(nS, nA, cost_list, init_dist, alpha=0.000001)

# Random transition kernel, indexed P[a, s, :]: each row is a softmax of
# Gaussian logits whose mean and standard deviation are drawn from U(0, 100).
P = np.zeros((nA,nS,nS))
for s in range(nS):
    for a in range(nA):
        mu,sigma = np.random.uniform(0,100),np.random.uniform(0,100)
        logits = np.random.normal(mu,sigma,nS)
        # Logits of this magnitude can push np.exp to inf (then inf/inf = NaN).
        # Shifting by the row maximum leaves the softmax unchanged and keeps
        # every exponent <= 0.
        weights = np.exp(logits - np.max(logits))
        P[a,s,:] = weights/np.sum(weights)

In [None]:
# --- Run configuration -------------------------------------------------------
C_KL = 0.5       # passed to rpe.evaluate_policy on every evaluation
kl_lambda = 5    # divides the update step and weights the constraint term
alpha = 0.5      # base step size of the parameter update
T = 10           # number of training iterations
b = 90           # threshold that J_c is compared against
booster = 1      # multiplier applied to alpha inside the update

# --- Initial policy parameters -----------------------------------------------
# One logit per (state, action); keep the starting point and work on a copy.
theta_old = np.random.randn(nS, nA)
theta = theta_old.copy()

In [None]:
vf = []  # J_v recorded at every iteration
cf = []  # J_c recorded at every iteration
start = time.time()
for t in range(T):
    policy = get_policy_from_theta(theta)

    # Get both objectives and gradients (n=0 and n=1 select the objective).
    J_v, grad_v = rpe.evaluate_policy(policy, P, C_KL, n=0, t=t)
    J_c, grad_c = rpe.evaluate_policy(policy, P, C_KL, n=1, t=t)
    vf.append(J_v)
    cf.append(J_c)

    # Choose which gradient to follow: compare J_v with the weighted
    # constraint violation, clamped at zero.
    # np.maximum is the elementwise clamp; np.max(x, 0) treats 0 as the
    # `axis` argument and returned the raw (possibly negative) difference.
    if env_dep != 2:
        violation = np.maximum(J_c - b, 0)
    else:
        violation = np.maximum(b - J_c, 0)
    ch = np.argmax([J_v, kl_lambda * violation])
    grad = grad_v if ch == 0 else grad_c

    # Flatten gradient and apply natural-like update
    grad_vec = flatten_grad(grad)
    theta_vec = flatten_grad(theta)
    theta_new_vec = natural_gradient_update(theta_vec, grad_vec, kl_lambda, alpha,env_dep,ch,booster)
    theta_new = reshape_grad(theta_new_vec, (nS, nA))

    # Check KL divergence between the updated and the current policy.
    # `policy` was computed from the current theta, so it is reused as pi_old.
    pi_old = policy
    pi_new = get_policy_from_theta(theta_new)
    kl = kl_divergence(pi_new, pi_old)

    # Accept the update
    theta = theta_new
    # NOTE(review): in the recorded run each row of grad has identical entries
    # and KL stays at 0. Adding the same constant to a row's logits does not
    # change a softmax policy, so verify the gradient from evaluate_policy.
    print(grad)
    print(f"[Iter {t}] J_v={J_v:.4f}, J_c={J_c:.4f}, KL={kl:.6f}, ch={ch}")

print("Time taken:",time.time()-start)

# Persist both learning curves; `with` closes the files on exit.
with open("Value_function_kl_lambda_CP"+str(kl_lambda)+".pkl","wb") as f:
    pickle.dump(vf,f)

with open("Cost_function_kl_lambda_CP"+str(kl_lambda)+".pkl","wb") as f:
    pickle.dump(cf,f)
# Final policy
#final_policy = get_policy_from_theta(theta)
#print("Final policy:")
#print(final_policy)


[[0.09201173 0.09201173]
 [0.09050018 0.09050018]
 [0.09161896 0.09161896]
 ...
 [0.09074648 0.09074648]
 [0.0902476  0.0902476 ]
 [0.0896808  0.0896808 ]]
[Iter 0] J_v=20.9859, J_c=116.9688, KL=0.000000, ch=1
[[0.09201173 0.09201173]
 [0.09050018 0.09050018]
 [0.09161896 0.09161896]
 ...
 [0.09074648 0.09074648]
 [0.0902476  0.0902476 ]
 [0.0896808  0.0896808 ]]
[Iter 1] J_v=20.9859, J_c=116.9688, KL=0.000000, ch=1


KeyboardInterrupt: 

In [None]:
# Print the softmax policy implied by the current value of theta.
print(get_policy_from_theta(theta))

[[0.91483864 0.08516136]
 [0.84496566 0.15503434]
 [0.34728545 0.65271455]
 ...
 [0.81128932 0.18871068]
 [0.7047552  0.2952448 ]
 [0.18485169 0.81514831]]


In [None]:
# Display the gradient most recently selected inside the training loop.
grad

array([[0.09164769, 0.09164769],
       [0.09105194, 0.09105194],
       [0.09180509, 0.09180509],
       ...,
       [0.09229802, 0.09229802],
       [0.09166756, 0.09166756],
       [0.09095145, 0.09095145]])