In [89]:
import argparse
from typing import Any, List, Callable

from GridWorld_environments import Grid_World
from RL_agents import ValueIterationAgent, QLearningAgent
from IRL_agents import IRL_from_sampled_trajectories

from sklearn.preprocessing import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt

from copy import deepcopy

#### Train Value Iteration function

In [90]:
def train_value_iteration(gw_env: Grid_World, verbose=False):
    """Run value iteration on ``gw_env`` and return the resulting greedy policy.

    Each sweep visits every non-terminal state, asks the agent for its currently
    optimal action, and backs the state's value up from the successor state as
    ``reward(successor) + GAMMA * value(successor)``. Sweeping stops after
    ``VALUE_ITERATION_TRAINING_N`` sweeps or as soon as the agent reports
    ``value_converged``, whichever comes first.

    The module-level ``GAMMA`` and ``VALUE_ITERATION_TRAINING_N`` are read when
    the function is called, so they must be defined before the first call.

    Args:
        gw_env: Grid-world environment supplying states, actions, rewards and
            transitions.
        verbose: When true, the environment displays the final value function
            and the greedy policy.

    Returns:
        The policy object returned by the agent's ``get_policy()``.
    """
    agent = ValueIterationAgent(
        states=gw_env.get_state_space(),
        terminal_states=gw_env.get_terminal_states(),
        reward_function=gw_env.get_reward_func(),
        actions=gw_env.get_action_space(),
        gamma=GAMMA,
    )

    for _ in range(VALUE_ITERATION_TRAINING_N):
        # Convergence is checked before every sweep, mirroring the sweep budget check.
        if agent.value_converged:
            break

        for current in gw_env.get_state_space():
            if current not in gw_env.get_terminal_states():
                candidates = gw_env.get_action_state_pairs(state=current)
                best_action = agent.get_optimal_action(action_state_pairs=candidates)
                successor = gw_env.get_new_state_on_action(old_state=current, action=best_action)
                successor_value = agent.get_state_value(state=successor)

                # One-step backup: reward collected on entering the successor plus
                # the discounted value of the successor.
                backed_up = gw_env.get_state_reward(state=successor) + GAMMA * successor_value
                agent.set_state_value(state=current, new_value=backed_up)

    if verbose:
        gw_env.display_value_function(value_func=agent.get_value_function())

    agent.construct_greedy_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=agent.get_policy())

    return agent.get_policy()


#### Train Q Learning function

In [99]:
def train_q_learning(gw_env: Grid_World, n_episodes=1000, verbose=False, policy="eps_greedy", eps=0.2, max_episode_len=100, gamma=0.95):
    """Train a Q-learning agent on ``gw_env`` and return its greedy policy.

    Episodes start in a non-terminal state. States whose visit counter is zero
    are preferred as start states; once every counter is non-zero the start is
    drawn uniformly from all non-terminal states. A counter is reset to zero
    when it reaches 5 during an episode, which makes that state eligible as a
    preferred start again.

    Training stops early when the Q-function is unchanged (``np.isclose`` with
    ``atol=1e-08``) across 50 consecutive episodes.

    Args:
        gw_env: Grid-world environment supplying states, actions and transitions.
        n_episodes: Maximum number of episodes to run.
        verbose: When true, print the convergence outcome and let the
            environment display the Q-function and the greedy policy.
        policy: Behaviour policy used while learning. Only ``"eps_greedy"`` is
            implemented.
        eps: Probability of taking a uniformly random action instead of the
            greedy one.
        max_episode_len: Maximum number of steps per episode.
        gamma: Discount factor passed to the agent.

    Returns:
        The policy object returned by the agent's ``get_policy()``.

    Raises:
        ValueError: If ``policy`` is not ``"eps_greedy"``.
    """
    # Fail early with a clear message; otherwise the episode loop would hit an
    # unbound ``chosen_action`` on its first step.
    if policy != "eps_greedy":
        raise ValueError(f"Unsupported behaviour policy {policy!r}; only 'eps_greedy' is implemented")

    ql_agent = QLearningAgent(states=gw_env.get_state_space(),
                              size=gw_env.get_board_shape(),
                              terminal_states=gw_env.get_terminal_states(),
                              reward_function=gw_env.get_reward_func(),
                              actions=gw_env.get_action_space(),
                              gamma=gamma)

    # Candidate start states: every state except the terminal ones. The copy
    # keeps the environment's own state list untouched.
    state_space = deepcopy(gw_env.get_state_space())
    terminal_states = gw_env.get_terminal_states()
    for terminal_state in terminal_states:
        state_space.remove(terminal_state)

    # Per-state visit counter used to steer the choice of start states.
    state_visited = {state: 0 for state in state_space}

    convergence_counter = 0
    converged = False
    episodes_run = 0

    for n in range(n_episodes):
        episodes_run = n + 1

        # Prefer start states whose counter is currently zero.
        states_not_visited = [state for state in state_visited if state_visited[state] == 0]
        start_candidates = states_not_visited if len(states_not_visited) > 0 else state_space
        state = start_candidates[np.random.choice(len(start_candidates))]
        state_visited[state] += 1

        # Snapshot of the Q-function before the episode. It is deep-copied so the
        # comparison below stays meaningful even if the agent returns its
        # internal table rather than a copy (NOTE(review): confirm which it is).
        old_q_val_func = deepcopy(ql_agent.get_Q_function(mat_repr=True))

        for _ in range(max_episode_len):
            # Epsilon-greedy action selection.
            if np.random.uniform() < (1 - eps):
                chosen_action = ql_agent.get_greedy_action(state)
            else:
                action_space = gw_env.get_action_space()
                chosen_action = action_space[np.random.choice(len(action_space))]

            new_state = gw_env.get_new_state_on_action(state, chosen_action)

            # The agent was given the environment's reward function at
            # construction, so no reward is passed explicitly here.
            ql_agent.update_Q_value(state, new_state, chosen_action)
            state = new_state

            if new_state in terminal_states:
                break

            # Count the visit; reset at 5 so the state can be preferred as a
            # start state again.
            state_visited[new_state] += 1
            if state_visited[new_state] >= 5:
                state_visited[new_state] = 0

        # States the current policy never reaches are only explored when they are
        # drawn as start states.
        # TODO: Add logic that chooses starts in states that have not been visited for X episodes
        if np.isclose(old_q_val_func, ql_agent.get_Q_function(mat_repr=True), atol=1e-08).all():
            convergence_counter += 1
            if convergence_counter >= 50:
                converged = True
                break
        else:
            convergence_counter = 0

    if verbose:
        # An explicit flag is needed: the loop index is always below n_episodes,
        # so it cannot tell an early stop from an exhausted episode budget.
        if converged:
            print(f"It took {episodes_run} episodes to converge to the optimal Q-function")
        else:
            print(f"Did not converge to optimal Q-function in {n_episodes} episodes")

        gw_env.display_q_function(q_func=ql_agent.get_Q_function())

    ql_agent.construct_greedy_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=ql_agent.get_policy())

    return ql_agent.get_policy()


In [100]:
# Scratch check, kept self-contained so it runs on a fresh kernel:
# a NumPy array accepts a (row, col) tuple as index, and a comprehension over a
# dict iterates its keys. The literal shape mirrors GW_SIZE = (3, 3).
np.zeros((3, 3))[(0, 0)]
d = {'a':1, 'b':2, 'c':3, 'e': 5}
[ state for state in d if d[state] == 0 ]

[]

#### IRL Reward estimation function

In [101]:
def irl_reward_estimation(env: Grid_World, optimal_trajectories: List[List[Any]], train_func: Callable):
    """Estimate a reward function from sampled optimal trajectories.

    Starting from a random candidate policy, each of the ``IRL_TRAINING_N``
    iterations samples trajectories for the latest candidate policy, computes
    their value estimate, solves the linear program against the optimal value
    estimate, builds a reward function from the resulting alphas, installs it
    in ``env`` and trains a new candidate policy with ``train_func``.

    Side effects: ``env.set_reward_func`` is called on every iteration and the
    original reward function is not restored, so ``env`` leaves this function
    carrying the last estimated reward function. Progress is printed.

    Reads the module-level ``GW_SIZE``, ``GAMMA``, ``IRL_TRAINING_N``,
    ``NUMBER_OF_TRAJECTORIES`` and ``MAXIMUM_TRAJECTORY_LENGTH`` at call time.

    Args:
        env: Grid-world environment holding the reference reward function.
        optimal_trajectories: Trajectories generated by the expert policy.
        train_func: Callable invoked as ``train_func(gw_env=env, verbose=True)``
            that returns a policy for the environment's current reward function.

    Returns:
        dict with keys ``'reference_reward_func'`` (copy of the board before any
        estimation), ``'policy_pred'`` (mean over all candidate policies,
        including the initial random one) and ``'avg_predicted_reward_func'``
        (mean over the boards recorded after each iteration).
    """

    # store reference reward function
    reward_func_ref = deepcopy(env.get_board())
    print('Reference reward function:\n', reward_func_ref)

    irl_agent = IRL_from_sampled_trajectories(d=(GW_SIZE[0] * 4, GW_SIZE[1] * 4),
                                              env_ranges=((0, GW_SIZE[0]), (0, GW_SIZE[1])),
                                              env_discrete_size=GW_SIZE,
                                              penalty_factor=2,
                                              gamma=GAMMA)

    # step 2: given optimal trajectories, compute the value estimate
    print("Computing value estimates for optimal trajectories...")
    optimal_value_estimate = irl_agent.compute_value_estimate(trajs=optimal_trajectories)

    candidate_policies = [env.construct_random_policy()]
    candidate_value_estimates = []
    reward_func_estimates = []

    for i in range(IRL_TRAINING_N):
        print(f"Iteration {i}...")

        # step 3: generate trajectories and compute the value estimate for the latest candidate policy
        print("Generating trajectories for the candidate policy...")
        candidate_trajectories = env.generate_trajectories(policy=candidate_policies[-1],
                                                           n_traj=NUMBER_OF_TRAJECTORIES,
                                                           max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)
        print("Computing value estimates for condidate trajectories...")
        candidate_value_estimates.append(irl_agent.compute_value_estimate(trajs=candidate_trajectories))

        # step 4: obtain new alphas
        print("Solving linear programming...")
        irl_agent.solve_lp(optimal_value_estimate, candidate_value_estimates)

        # step 5: construct new reward function from the alphas
        reward_func = irl_agent.construct_reward_function(alphas=irl_agent.get_alphas())

        # step 6: find optimal policy under new reward function and add to 'candidate_policies' list
        env.set_reward_func(reward_func)
        candidate_policies.append(train_func(gw_env=env, verbose=True))

        # Store a copy of the board, as is done for the reference above. Without
        # the copy every entry could alias the environment's live board, and the
        # average below would collapse to the final estimate.
        reward_func_estimates.append(deepcopy(env.get_board()))

        print("Latest estimated reward function:\n", reward_func_estimates[-1])
        env.display_policy(policy=candidate_policies[-1])
        print("============================================================\n" * 2)

    return {
        'reference_reward_func': reward_func_ref,
        'policy_pred': np.mean(np.array([list(pol.values()) for pol in candidate_policies]), axis=0),
        'avg_predicted_reward_func': np.mean(np.array(reward_func_estimates), axis=0),
    }


def calc_value_distance(value_estimates_ref, value_estimates_pred):
    """Return the norm of the difference between two value estimates.

    Both inputs are converted to NumPy arrays and subtracted element-wise; the
    result is passed to ``np.linalg.norm`` with its default arguments.
    """
    reference = np.array(value_estimates_ref)
    predicted = np.array(value_estimates_pred)
    difference = reference - predicted
    return np.linalg.norm(difference)

In [102]:
# Discount factor used by train_value_iteration and passed to the IRL agent.
GAMMA = 0.95
# Upper bound on the number of sweeps performed by train_value_iteration.
VALUE_ITERATION_TRAINING_N = 25
# Number of iterations run by irl_reward_estimation.
IRL_TRAINING_N = 10

# Number and maximum length of the trajectories requested from generate_trajectories.
NUMBER_OF_TRAJECTORIES = 400
MAXIMUM_TRAJECTORY_LENGTH = 10

# Grid-world layout passed to Grid_World(size=..., traps=..., goals=...).
# GW_SIZE is also read directly by irl_reward_estimation.
GW_SIZE = (3, 3)
# Sizes iterated by the end-to-end loop, which rebinds GW_SIZE for each entry.
# The trailing comment shows an alternative sweep over larger boards.
GW_SIZES = [(3, 3)]  # [(x, x) for x in np.arange(5,11, 5)]
GW_TRAPS = []
GW_GOALS = [(0, 0)]

### Step-by-step code

In [103]:
# Build the grid world for the step-by-step walkthrough from the configuration constants.
environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)


## Train with value iteration

In [104]:
# Train with value iteration; verbose=True displays the value function and the greedy policy.
vi_greedy_policy = train_value_iteration(gw_env=environment, verbose=True)


Value function:
[[0.       1.       0.95    ]
 [1.       0.95     0.9025  ]
 [0.95     0.9025   0.857375]]
Policy:
[['x' '<' '<']
 ['^' '<' '<']
 ['^' '<' '<']]


## Train with Q-learning

In [97]:
# Train with Q-learning; verbose=True displays the Q-function and the greedy policy.
# NOTE(review): the stored output below contains per-episode "convergence_counter"
# traces, but the print calls that would produce them are commented out in
# train_q_learning, so the output appears to come from an earlier run — re-run to refresh.
ql_greedy_policy = train_q_learning(gw_env=environment, verbose=True)


--------------
episode 4
convergence_counter 1
--------------
--------------
episode 5
convergence_counter 2
--------------
--------------
episode 10
convergence_counter 1
--------------
--------------
episode 11
convergence_counter 2
--------------
--------------
episode 12
convergence_counter 3
--------------
--------------
episode 15
convergence_counter 1
--------------
--------------
episode 18
convergence_counter 1
--------------
--------------
episode 19
convergence_counter 2
--------------
--------------
episode 20
convergence_counter 3
--------------
--------------
episode 21
convergence_counter 4
--------------
--------------
episode 22
convergence_counter 5
--------------
--------------
episode 23
convergence_counter 6
--------------
--------------
episode 24
convergence_counter 7
--------------
--------------
episode 26
convergence_counter 1
--------------
--------------
episode 27
convergence_counter 2
--------------
--------------
episode 28
convergence_counter 3
---------

convergence_counter 50
--------------
It took 209 episodes to converge to the optimal Q-function
Q function:
[[[0.         0.         0.         0.        ]
  [0.9025     0.9025     0.95       1.        ]
  [0.857375   0.9025     0.9025     0.95      ]]

 [[0.9025     0.9025     1.         0.95      ]
  [0.857375   0.857375   0.95       0.95      ]
  [0.81450625 0.857375   0.9025     0.9025    ]]

 [[0.67310681 0.857375   0.95       0.9025    ]
  [0.857375   0.81450625 0.9025     0.9025    ]
  [0.81450625 0.67095911 0.857375   0.857375  ]]]
Policy:
[['x' '<' '<']
 ['^' '^' '^']
 ['^' '^' '^']]


In [98]:
print(f"Generating {NUMBER_OF_TRAJECTORIES} trajectories...")

# Select which trained policy supplies the trajectories that are later passed to
# irl_reward_estimation as `optimal_trajectories`.
greedy_policy = vi_greedy_policy
# greedy_policy = ql_greedy_policy

# Sample trajectories by following the selected policy in the environment.
trajectories = environment.generate_trajectories(policy=greedy_policy,
                                                 n_traj=NUMBER_OF_TRAJECTORIES,
                                                 max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)


Generating 400 trajectories...


In [None]:
print("IRL from samples...")

# Result accumulators. They are created here so this cell runs on a fresh
# kernel; they were previously only defined in a later cell.
ref_reward_funcs = []
avg_pred_reward_funcs = []
reward_loss = []
policy_loss = []

# restart the environment
environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

# Trainer used inside the IRL loop to find a policy for each estimated reward function.
train_func = train_value_iteration
# train_func = train_q_learning

estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)
ref_reward_funcs.append(estimated_rewards['reference_reward_func'])
avg_pred_reward_funcs.append(estimated_rewards['avg_predicted_reward_func'])

# Reward loss uses np.linalg.norm's default: Frobenius norm for matrices, L2 norm for vectors.
reward_diff = estimated_rewards['reference_reward_func'] - estimated_rewards['avg_predicted_reward_func']
reward_loss.append(np.linalg.norm(reward_diff))

# Policy loss uses the L1 norm, as described by Ng and Russell (2000).
# NOTE(review): for a 2-D input np.linalg.norm(ord=1) is the matrix 1-norm
# (maximum column sum), not an element-wise L1 sum — confirm the policy arrays are 1-D.
policy_diff = estimated_rewards['policy_pred'] - np.array(list(greedy_policy.values()))
policy_loss.append(np.linalg.norm(policy_diff, ord=1))

print('**********************************************')
print('*****************REWARD LOSS******************')
print(reward_loss)
print('**********************************************')
print('*****************POLICY LOSS*************************')
print(policy_loss)
print('**********************************************')


### End-to-end loop

In [None]:
# Pipeline switches.
vi = True    # train the expert policy with value iteration
ql = False   # train the expert policy with Q-learning (only used when `vi` is False)
gt = True    # generate expert trajectories
irl = True   # run inverse reinforcement learning
# Named `make_plots` rather than `plt`: a flag called `plt` would overwrite the
# matplotlib.pyplot alias and break the plotting calls at the end of this cell.
make_plots = False

print("configuration is:")
if vi:
    print("training: value iteration")
if ql:
    print("training: Q-Learning")
if gt:
    print("generating trajectories")
if irl:
    print("inverse reinforcment learning")
if make_plots:
    print("creating plots")

print("")

# Result accumulators, one entry per grid-world size.
ref_reward_funcs = []
avg_pred_reward_funcs = []
reward_loss = []
policy_loss = []

# GW_SIZE is rebound on purpose: irl_reward_estimation reads the module-level
# GW_SIZE to size the IRL agent.
for GW_SIZE in GW_SIZES:
    environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

    train_func = train_value_iteration

    if vi:
        print("Training via value iteration...")
        greedy_policy = train_value_iteration(gw_env=environment, verbose=True)
    elif ql:
        print("Training via q-learning...")
        greedy_policy = train_q_learning(gw_env=environment, verbose=True)
        train_func = train_q_learning
    else:
        # load from file (?)
        greedy_policy = {}

    if gt:
        print(f"Generating {NUMBER_OF_TRAJECTORIES} trajectories...")
        trajectories = environment.generate_trajectories(policy=greedy_policy,
                                                         n_traj=NUMBER_OF_TRAJECTORIES,
                                                         max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)

    if irl:
        # NOTE(review): with `gt` disabled this relies on `trajectories` left over
        # from an earlier cell or iteration.
        print("IRL from samples...")
        estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)
        ref_reward_funcs.append(estimated_rewards['reference_reward_func'])
        avg_pred_reward_funcs.append(estimated_rewards['avg_predicted_reward_func'])
        # Using default value for reward loss -> Frobenius for matrices and L2-loss for vectors
        reward_loss.append(np.linalg.norm(estimated_rewards['reference_reward_func'] - estimated_rewards['avg_predicted_reward_func']))
        # Using L1-Loss for policy loss as described by Ng and Russell in 2000
        policy_loss.append(np.linalg.norm(estimated_rewards['policy_pred'] - np.array(list(greedy_policy.values())), ord=1 ))

        print('**********************************************')
        print('*****************REWARD LOSS******************')
        print(reward_loss)
        print('**********************************************')
        print('*****************POLICY LOSS*************************')
        print(policy_loss)
        print('**********************************************')

print('reward_loss \n', reward_loss)
if make_plots:
    plt.plot(reward_loss)
    plt.savefig('reward_loss.png')

print("Closing up the arena...")