In [2]:
import argparse
from typing import Any, List, Callable

from GridWorld_environments import Grid_World
from RL_agents import ValueIterationAgent, QLearningAgent
from IRL_agents import IRL_from_sampled_trajectories

from sklearn.preprocessing import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt

from copy import deepcopy

In [3]:
def train_value_iteration(gw_env: Grid_World, verbose=False):
    vi_agent = ValueIterationAgent(states=gw_env.get_state_space(),
                                   terminal_states=gw_env.get_terminal_states(),
                                   reward_function=gw_env.get_reward_func(),
                                   actions=gw_env.get_action_space(),
                                   gamma=GAMMA)

    iters = 0
    while iters < VALUE_ITERATION_TRAINING_N and not vi_agent.converged:

        for state in gw_env.get_state_space():

            if state in gw_env.get_terminal_states():
                continue

            opt_act = vi_agent.get_optimal_action(action_state_pairs=gw_env.get_action_state_pairs(state=state))
            next_state = gw_env.get_new_state_on_action(old_state=state, action=opt_act)
            next_state_value = vi_agent.get_state_value(state=next_state)

            vi_agent.set_state_value(state=state, new_value=(gw_env.get_state_reward(state=next_state) + GAMMA * next_state_value))

        iters += 1
        # print(f"Iteration {iters}")
        # print(vi_agent.get_value_function())

    # print("Board:")
    # print(gw_env.get_board())

    if verbose:
        gw_env.display_value_function(value_func=vi_agent.get_value_function())

    vi_agent.construct_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=vi_agent.get_policy())

    return vi_agent.get_policy()


def train_q_learning(gw_env: Grid_World, verbose=False):
    ql_agent = QLearningAgent(states=gw_env.get_state_space(),
                              terminal_states=gw_env.get_terminal_states(),
                              reward_function=gw_env.get_reward_func(),
                              actions=gw_env.get_action_space(),
                              gamma=GAMMA)

    iters = 0
    while iters < VALUE_ITERATION_TRAINING_N and not ql_agent.converged:

        for state in gw_env.get_state_space():

            if state in gw_env.get_terminal_states():
                continue

            opt_act = ql_agent.get_optimal_action(state, action_state_pairs=gw_env.get_action_state_pairs(state=state))
            next_state = gw_env.get_new_state_on_action(old_state=state, action=opt_act)
            next_q_value = ql_agent.get_state_action_value(state=next_state, action=opt_act)

            ql_agent.set_state_action_value(state=state, action=opt_act, new_value=(gw_env.get_state_reward(state=next_state) + GAMMA * next_q_value))

        iters += 1
        # print(f"Iteration {iters}")
        # print(vi_agent.get_value_function())

    # print("Board:")
    # print(gw_env.get_board())

    if verbose:
        gw_env.display_q_function(q_func=ql_agent.get_Q_function())

    ql_agent.construct_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=ql_agent.get_policy())

    return ql_agent.get_policy()


def irl_reward_estimation(env: Grid_World, optimal_trajectories: List[List[Any]], train_func: Callable):

    # prepare reference reward function
    reward_func_ref = deepcopy(env.get_board())
    reward_func_preds = []
    print('reward_func_ref\n', reward_func_ref)

    minmax_scaler = MinMaxScaler(feature_range=(-1, 1))

    reward_func_ref_shape = reward_func_ref.shape
    reward_func_ref = minmax_scaler.fit_transform(reward_func_ref.reshape(-1, 1)).reshape(reward_func_ref_shape)
    print('MinMax scaled reward_func_ref:\n', reward_func_ref)

    irl_agent = IRL_from_sampled_trajectories(d=(GW_SIZE[0] * 4, GW_SIZE[1] * 4),
                                              env_ranges=((0, GW_SIZE[0]), (0, GW_SIZE[1])),
                                              env_discrete_size=GW_SIZE,
                                              penalty_factor=2,
                                              gamma=GAMMA)

    # step 2: given optimal trajectories, compute the value estimate
    optimal_value_estimate = irl_agent.compute_value_estimate(trajs=optimal_trajectories)

    candidate_policies = [env.construct_random_policy()]
    candidate_value_estimates = []

    # while True:
    for i in range(IRL_TRAINING_N):
        print(f"Iteration {i}...")

        # step 3: generate trajectories and compute the value estimate for a random policy
        candidate_trajectories = env.generate_trajectories(policy=candidate_policies[-1],
                                                           n_traj=NUMBER_OF_TRAJECTORIES,
                                                           max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)
        candidate_value_estimates.append(irl_agent.compute_value_estimate(trajs=candidate_trajectories))

        # step 4: obtain new alphas
        irl_agent.solve_lp(optimal_value_estimate, candidate_value_estimates)

        # step 5: construct new reward function from the alphas
        reward_func = irl_agent.construct_reward_function(alphas=irl_agent.get_alphas())

        # step 6: find optimal policy under new reward function and add to 'candidate_policies' list
        env.set_reward_func(reward_func)
        print("Latest non-Scaled reward func:\n", env.get_board())
        scaled_board = minmax_scaler.fit_transform(env.get_board().reshape(-1, 1)).reshape(env.get_board().shape)
        env.set_board(new_board=scaled_board)

        reward_func_preds.append(env.get_board())

        candidate_policies.append(train_func(gw_env=env, verbose=True))  # train_value_iteration(gw_env=env))

        print("Latest Scaled reward func:\n", reward_func_preds[-1])
        env.display_policy(policy=candidate_policies[-1])
        print("============================================================\n" * 2)

    # print('reward_func_pred \n', [np.array(reward_func_pred).flatten() for reward_func_pred in reward_func_preds]) #[np.array(one_candidate_value_estimates).flatten().shape for one_candidate_value_estimates in candidate_value_estimates ] )
    # print('reward_func_ref \n', np.array(reward_func_ref).flatten())
    # vec1 = [np.array(reward_func_pred).flatten() for reward_func_pred in reward_func_preds]
    # vec2 = np.array(reward_func_ref).flatten()
    # print('l2-loss', np.linalg.norm(vec1[0] - vec2))
    #reward_loss = [ np.linalg.norm(np.array(reward_func_ref).flatten() - np.array(reward_func_pred).flatten()) for reward_func_pred in reward_func_preds ]

    #value_loss = [ calc_value_distance(optimal_value_estimate, one_candidate_value_estimates) for one_candidate_value_estimates in candidate_value_estimates ]
    # plt.plot(reward_loss)
    # plt.show()

    return {'reference_reward_func': reward_func_ref, 'policy_pred': np.mean(np.array([ np_normalize(list(pol.values()), 1) for pol in candidate_policies ]), axis=0), 'avg_predicted_reward_func': np.mean(np.array(reward_func_preds), axis=0)}


def calc_value_distance(value_estimates_ref, value_estimates_pred):
    return np.linalg.norm(np.array(value_estimates_ref) - np.array(value_estimates_pred))


In [8]:
GAMMA = 0.95
VALUE_ITERATION_TRAINING_N = 25
IRL_TRAINING_N = 5

NUMBER_OF_TRAJECTORIES = 40
MAXIMUM_TRAJECTORY_LENGTH = 50

GW_SIZE = (4, 6)
GW_SIZES = [(4, 6)]  # [(x, x) for x in np.arange(5,11, 5)]
GW_TRAPS = [(1, 2)]
GW_GOALS = [(3, 5)]

### Step-by-step code

In [9]:
environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)


In [10]:
vi_greedy_policy = train_value_iteration(gw_env=environment, verbose=True)


Value function:
[[0.6983373  0.73509189 0.77378094 0.81450625 0.857375   0.9025    ]
 [0.73509189 0.77378094 0.         0.857375   0.9025     0.95      ]
 [0.77378094 0.81450625 0.857375   0.9025     0.95       1.        ]
 [0.81450625 0.857375   0.9025     0.95       1.         0.        ]]
Policy:
[['v' 'v' '>' 'v' 'v' 'v']
 ['v' 'v' 'x' 'v' 'v' 'v']
 ['v' 'v' 'v' 'v' 'v' 'v']
 ['>' '>' '>' '>' '>' 'x']]


In [11]:
ql_greedy_policy = train_q_learning(gw_env=environment, verbose=True)


Q function:
[[[ 0.66342043  0.6983373   0.66342043  0.66342043]
  [ 0.6983373   0.73509189  0.6983373   0.66342043]
  [-1.          0.77378094  0.73509189  0.6983373 ]
  [ 0.77378094  0.81450625  0.77378094  0.73509189]
  [ 0.81450625  0.857375    0.81450625  0.77378094]
  [ 0.9025      0.857375    0.857375    0.81450625]]

 [[ 0.6983373   0.6983373   0.66342043  0.6983373 ]
  [ 0.73509189 -1.          0.6983373   0.66342043]
  [ 0.          0.          0.          0.        ]
  [ 0.81450625  0.857375    0.77378094 -1.        ]
  [ 0.857375    0.9025      0.81450625  0.81450625]
  [ 0.95        0.9025      0.857375    0.857375  ]]

 [[ 0.73509189  0.77378094  0.66342043  0.73509189]
  [ 0.77378094  0.81450625  0.6983373   0.73509189]
  [ 0.81450625  0.857375   -1.          0.77378094]
  [ 0.857375    0.9025      0.81450625  0.81450625]
  [ 0.9025      0.95        0.857375    0.857375  ]
  [ 1.          0.95        0.9025      0.9025    ]]

 [[ 0.77378094  0.81450625  0.73509189  0.7737

In [12]:
print(f"Generating {NUMBER_OF_TRAJECTORIES} trajectories...")

greedy_policy = vi_greedy_policy
# greedy_policy = ql_greedy_policy

trajectories = environment.generate_trajectories(policy=greedy_policy,
                                                 n_traj=NUMBER_OF_TRAJECTORIES,
                                                 max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)


Generating 40 trajectories...


In [14]:
print("IRL from samples...")

train_func = train_value_iteration
# train_func = train_q_learning

estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)
ref_reward_funcs.append(estimated_rewards['reference_reward_func'])
avg_pred_reward_funcs.append(estimated_rewards['avg_predicted_reward_func'])
# Using default value for reward loss -> Frobenius for matrices and L2-loss for vectors
reward_loss.append(np.linalg.norm(estimated_rewards['reference_reward_func'] - estimated_rewards['avg_predicted_reward_func']))
# Using L1-Loss for policy loss as described by Ng and Russel in 2000
policy_loss.append(np.linalg.norm(estimated_rewards['policy_pred'] - np.array(list(greedy_policy.values())), ord=1 ))

print('**********************************************')
print('*****************REWARD LOSS******************')
print(reward_loss)
print('**********************************************')
print('*****************POLICY LOSS*************************')
print(policy_loss)
print('**********************************************')


IRL from samples...
reward_func_ref
 [[ 0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]]
MinMax scaled reward_func_ref:
 [[ 0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]]
Iteration 0...
Latest non-Scaled reward func:
 [[-3.98524428 -5.21657998 -6.05985019 -6.30720925 -5.89911348 -4.93425745]
 [-4.91908932 -6.43895859 -7.47982866 -7.78515029 -7.28142721 -6.09048064]
 [-5.2084129  -6.81767555 -7.91976594 -8.24304552 -7.70969522 -6.44870135]
 [-4.73488853 -6.1978446  -7.19973812 -7.49362664 -7.00876604 -5.8624158 ]]
Value function:
[[15.28210306 15.51799791 15.1637076  14.43102586 13.61878722 13.03885371]
 [15.51799791 15.3034467   0.         13.61878722 13.03885371 12.94113485]
 [15.3034467  14.96372025 13.88506759 12.34266704 11.83860816 12.30519382]
 [14.96372025 14.86340683 14.08092057 12.86694316 11.57561755  0.        ]]
Policy:
[['<' '<' '<' '<' '<' '<']
 ['^' '<' 'x'

NameError: name 'np_normalize' is not defined

### End-to-end loop

In [4]:
vi = True
ql = False
gt = True
irl = True
plt = False

print("configuration is:")
if vi:
    print("training: value iteration")
if ql:
    print("training: Q-Learning")
if gt:
    print("generating trajectories")
if irl:
    print("inverse reinforcment learning")
if plt:
    print("creating plots")
    
print("")

#print(f"Passed args: {args}")

ref_reward_funcs = []
avg_pred_reward_funcs = []
reward_loss = []
policy_loss = []

for GW_SIZE in GW_SIZES:
    environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

    train_func = train_value_iteration

    if vi:
        print("Training via value iteration...")
        greedy_policy = train_value_iteration(gw_env=environment, verbose=True)
    elif ql:
        print("Training via q-learning...")
        greedy_policy = train_q_learning(gw_env=environment, verbose=True)
        train_func = train_q_learning
    else:
        # load from file (?)
        greedy_policy = {}

    if gt:
        print(f"Generating {NUMBER_OF_TRAJECTORIES} trajectories...")
        trajectories = environment.generate_trajectories(policy=greedy_policy,
                                                         n_traj=NUMBER_OF_TRAJECTORIES,
                                                         max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)

    if irl:
        print("IRL from samples...")
        estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)
        ref_reward_funcs.append(estimated_rewards['reference_reward_func'])
        avg_pred_reward_funcs.append(estimated_rewards['avg_predicted_reward_func'])
        # Using default value for reward loss -> Frobenius for matrices and L2-loss for vectors
        reward_loss.append(np.linalg.norm(estimated_rewards['reference_reward_func'] - estimated_rewards['avg_predicted_reward_func']))
        # Using L1-Loss for policy loss as described by Ng and Russel in 2000
        policy_loss.append(np.linalg.norm(estimated_rewards['policy_pred'] - np.array(list(greedy_policy.values())), ord=1 ))

        print('**********************************************')
        print('*****************REWARD LOSS******************')
        print(reward_loss)
        print('**********************************************')
        print('*****************POLICY LOSS*************************')
        print(policy_loss)
        print('**********************************************')

print('reward_loss \n', reward_loss)
plt.plot(reward_loss)
plt.savefig('reward_loss.png')

print("Closing up the arena...")

configuration is:
training: value iteration
generating trajectories
inverse reinforcment learning

Training via value iteration...
Value function:
[[0.6983373  0.73509189 0.77378094 0.81450625 0.857375   0.9025    ]
 [0.73509189 0.77378094 0.         0.857375   0.9025     0.95      ]
 [0.77378094 0.81450625 0.857375   0.9025     0.95       1.        ]
 [0.81450625 0.857375   0.9025     0.95       1.         0.        ]]
Policy:
[['v' 'v' '>' 'v' 'v' 'v']
 ['v' 'v' 'x' 'v' 'v' 'v']
 ['v' 'v' 'v' 'v' 'v' 'v']
 ['>' '>' '>' '>' '>' 'x']]
Generating 40 trajectories...
IRL from samples...
reward_func_ref
 [[ 0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]]
MinMax scaled reward_func_ref:
 [[ 0.  0.  0.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.]]
Iteration 0...
Latest non-Scaled reward func:
 [[-3.98524428 -5.21657998 -6.05985019 -6.30720925 -5.89911348 -4.93425745]
 [-4.91908932 -6.438

NameError: name 'np_normalize' is not defined