In [6]:
import argparse
from typing import Any, List, Callable, Dict

from GridWorld_environments import Grid_World
from RL_agents import ValueIterationAgent, QLearningAgent
from IRL_agents import IRL_from_sampled_trajectories

from sklearn.preprocessing import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt

from copy import deepcopy

import os
import pickle

#### Train Value Iteration function

In [2]:
def train_value_iteration(gw_env: Grid_World, verbose=False):
    """Run value iteration on `gw_env` and return the resulting greedy policy.

    Every sweep visits all non-terminal states and backs each one up from the
    successor reached by the agent's currently optimal action. Sweeping stops
    as soon as the agent reports `value_converged` or after
    VALUE_ITERATION_TRAINING_N sweeps, whichever comes first.

    Args:
        gw_env: grid-world environment supplying states, actions and rewards.
        verbose: when True, display the value function and the policy.

    Returns:
        The policy obtained from the agent after `construct_greedy_policy`.
    """
    planner = ValueIterationAgent(
        states=gw_env.get_state_space(),
        terminal_states=gw_env.get_terminal_states(),
        reward_function=gw_env.get_reward_func(),
        actions=gw_env.get_action_space(),
        gamma=GAMMA,
    )

    for _ in range(VALUE_ITERATION_TRAINING_N):
        # Stop early once the agent flags convergence.
        if planner.value_converged:
            break

        for current in gw_env.get_state_space():
            # Terminal states are never backed up.
            if current in gw_env.get_terminal_states():
                continue

            candidates = gw_env.get_action_state_pairs(state=current)
            best_action = planner.get_optimal_action(action_state_pairs=candidates)
            successor = gw_env.get_new_state_on_action(old_state=current, action=best_action)
            successor_value = planner.get_state_value(state=successor)

            # Backup: reward of the successor plus its discounted value.
            backed_up = gw_env.get_state_reward(state=successor) + GAMMA * successor_value
            planner.set_state_value(state=current, new_value=backed_up)

    if verbose:
        gw_env.display_value_function(value_func=planner.get_value_function())

    planner.construct_greedy_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=planner.get_policy())

    return planner.get_policy()


#### Train Q Learning function

In [45]:
def train_q_learning(gw_env: Grid_World, n_episodes=5000, verbose=False, policy="eps_greedy", eps=0.2, max_episode_len=100, gamma=0.95):
    """Train a tabular Q-learning agent on `gw_env` and return its greedy policy.

    Episodes start in a non-terminal state, preferring states whose visit
    budget (4 visits per reset) has not been used up yet, so rarely visited
    states are chosen as starting points. Training stops early once the
    Q-function has stayed (numerically) unchanged for `total_states * 30`
    consecutive episodes.

    Args:
        gw_env: grid-world environment supplying states, actions and rewards.
        n_episodes: NOTE(review): this argument is currently overridden by
            `total_states * 500` (see below), so the passed value has no
            effect. Kept as-is to avoid changing results for existing callers.
        verbose: when True, print the convergence report and display the
            Q-function and the policy.
        policy: behaviour policy used during training; only "eps_greedy" is
            implemented.
        eps: probability of taking a random (exploratory) action.
        max_episode_len: maximum number of steps per episode.
        gamma: discount factor handed to the Q-learning agent.

    Returns:
        The policy obtained from the agent after `construct_greedy_policy`.

    Raises:
        ValueError: if `policy` is not a supported behaviour policy.
    """
    # Fail early and explicitly; previously an unknown policy left
    # `chosen_action` unbound inside the episode loop.
    if policy != "eps_greedy":
        raise ValueError(f"Unsupported behaviour policy {policy!r}; only 'eps_greedy' is implemented")

    ql_agent = QLearningAgent(states=gw_env.get_state_space(),
                              size=gw_env.get_board_shape(),
                              terminal_states=gw_env.get_terminal_states(),
                              reward_function=gw_env.get_reward_func(),
                              actions=gw_env.get_action_space(),
                              gamma=gamma)

    # Candidate starting states: the full state space without terminal states.
    state_space = deepcopy(gw_env.get_state_space())
    terminal_states = gw_env.get_terminal_states()
    for terminal_state in terminal_states:
        state_space.remove(terminal_state)

    board_size = gw_env.get_board_shape()
    total_states = board_size[0] * board_size[1]

    # NOTE(review): the episode budget is scaled with the board size and
    # replaces the `n_episodes` argument.
    n_episodes = total_states * 500

    # Number of consecutive "unchanged Q-function" episodes required before
    # training is considered converged. The factor 30 is empirical: it scales
    # the requirement with the board size and includes a safety buffer.
    convergence_criterion = total_states * 30

    # Remaining visit budget per non-terminal state.
    state_visited = {state: 4 for state in state_space}

    convergence_counter = 0
    converged = False
    episodes_run = 0

    for n in range(n_episodes):
        episodes_run = n + 1

        # Reset the budgets once every state has used up its visits.
        if (np.array(list(state_visited.values())) <= 0).all():
            state_visited = {state: 4 for state in state_space}

        # Prefer starting states that still have visit budget left.
        states_not_visited = [state for state in state_visited if state_visited[state] > 0]
        if len(states_not_visited) > 0:
            start_idx = np.random.choice(len(states_not_visited))
            current_state = states_not_visited[start_idx]
        else:
            start_idx = np.random.choice(len(state_space))
            current_state = state_space[start_idx]

        state_visited[current_state] -= 1

        # Snapshot of the Q-function before this episode, used for the
        # convergence check afterwards.
        old_q_val_func = ql_agent.get_Q_function(mat_repr=True)

        step = 0
        terminal = False
        while step < max_episode_len and not terminal:
            step += 1

            # Epsilon-greedy action selection.
            if np.random.uniform() < (1 - eps):
                # Greedy action -> highest Q-value
                chosen_action = ql_agent.get_greedy_action(current_state)
            else:
                # Random action from the action space
                action_space = gw_env.get_action_space()
                chosen_action = action_space[np.random.choice(len(action_space))]

            new_state = gw_env.get_new_state_on_action(current_state, chosen_action)

            # The reward is looked up by the agent itself; it was given the
            # environment's reward function at construction.
            ql_agent.update_Q_value(current_state, new_state, chosen_action)

            current_state = new_state

            if new_state in terminal_states:
                terminal = True
            else:
                # Count the visit of the new (non-terminal) state.
                state_visited[new_state] -= 1

        # Converged once the Q-function has stayed unchanged for
        # `convergence_criterion` consecutive episodes.
        if np.isclose(old_q_val_func, ql_agent.get_Q_function(mat_repr=True), atol=1e-08).all():
            convergence_counter += 1
            if convergence_counter >= convergence_criterion:
                converged = True
                break
        else:
            convergence_counter = 0

    if verbose:
        # Report based on an explicit flag: the loop index alone cannot tell
        # an early break apart from an exhausted episode budget.
        if converged:
            print(f"It took {episodes_run} episodes to converge to the optimal Q-function")
        else:
            print(f"Did not converge to optimal Q-function in {n_episodes} episodes")

    if verbose:
        gw_env.display_q_function(q_func=ql_agent.get_Q_function())

    ql_agent.construct_greedy_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=ql_agent.get_policy())

    return ql_agent.get_policy()


#### Policy evaluation function (state values of a given policy)

In [4]:
def perform_value_evaluation(gw_env: Grid_World, policy: Dict[Any, Any], verbose=False):
    """Evaluate a fixed policy on `gw_env` and return its value function.

    Repeatedly sweeps over all non-terminal states, backing each one up from
    the successor reached by the action `policy` prescribes for it, until the
    agent reports `value_converged`.

    NOTE(review): unlike `train_value_iteration`, this loop has no iteration
    cap; termination relies entirely on the agent's convergence flag.

    Args:
        gw_env: grid-world environment supplying states and rewards.
        policy: mapping from state to the action taken in that state.
        verbose: when True, display the resulting value function.

    Returns:
        The value function obtained from the agent.
    """
    evaluator = ValueIterationAgent(
        states=gw_env.get_state_space(),
        terminal_states=gw_env.get_terminal_states(),
        reward_function=gw_env.get_reward_func(),
        actions=gw_env.get_action_space(),
        gamma=GAMMA,
    )

    while not evaluator.value_converged:
        for current in gw_env.get_state_space():
            # Only non-terminal states are backed up.
            if current not in gw_env.get_terminal_states():
                prescribed_action = policy[current]
                successor = gw_env.get_new_state_on_action(old_state=current, action=prescribed_action)
                successor_value = evaluator.get_state_value(state=successor)

                # Backup: reward of the successor plus its discounted value.
                backed_up = gw_env.get_state_reward(state=successor) + GAMMA * successor_value
                evaluator.set_state_value(state=current, new_value=backed_up)

    if verbose:
        gw_env.display_value_function(value_func=evaluator.get_value_function())

    return evaluator.get_value_function()
            

#### IRL Reward estimation function

In [46]:
def irl_reward_estimation(env: Grid_World, optimal_trajectories: List[List[Any]], train_func: Callable):
    """Estimate a reward function from expert trajectories (IRL from samples).

    For IRL_TRAINING_N iterations: sample trajectories of the latest candidate
    policy, compute their value estimate, solve the linear program for new
    alphas, build a reward function from them, install it on `env`, and train
    a new candidate policy with `train_func` under that reward.

    Side effect: `env`'s reward function is replaced by the estimated one
    (via `env.set_reward_func`), and that modified `env` is returned.

    Args:
        env: grid-world environment; its reward function is overwritten.
        optimal_trajectories: trajectories sampled from the expert policy.
        train_func: callable invoked as `train_func(gw_env=env, verbose=False)`
            that returns a policy for the current reward function.

    Returns:
        dict with keys 'environment', 'reference_reward_func', 'policy_pred',
        'predicted_reward_func' and 'avg_predicted_reward_func'.
    """
    # store reference reward function
    reward_func_ref = deepcopy(env.get_board())
    print('Reference reward function:\n', reward_func_ref)

    irl_agent = IRL_from_sampled_trajectories(d=(GW_SIZE[0] * 4, GW_SIZE[1] * 4),
                                              env_ranges=((0, GW_SIZE[0]), (0, GW_SIZE[1])),
                                              env_discrete_size=GW_SIZE,
                                              penalty_factor=2,
                                              gamma=GAMMA)

    # step 2: given optimal trajectories, compute the value estimate
    print("Computing value estimates for optimal trajectories...")
    optimal_value_estimate = irl_agent.compute_value_estimate(trajs=optimal_trajectories)

    candidate_policies = [env.construct_random_policy()]
    candidate_value_estimates = []
    reward_func_estimates = []

    for i in range(IRL_TRAINING_N):
        print(f"Iteration {i}...")

        # step 3: generate trajectories and compute the value estimate for the latest candidate policy
        print("Generating trajectories for the candidate policy...")
        candidate_trajectories = env.generate_trajectories(policy=candidate_policies[-1],
                                                           n_traj=NUMBER_OF_TRAJECTORIES,
                                                           max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)
        print("Computing value estimates for candidate trajectories...")
        candidate_value_estimates.append(irl_agent.compute_value_estimate(trajs=candidate_trajectories))

        # step 4: obtain new alphas
        print("Solving linear programming...")
        irl_agent.solve_lp(optimal_value_estimate, candidate_value_estimates)

        # step 5: construct new reward function from the alphas
        reward_func = irl_agent.construct_reward_function(alphas=irl_agent.get_alphas())

        # step 6: find optimal policy under new reward function and add to 'candidate_policies' list
        env.set_reward_func(reward_func)
        candidate_policies.append(train_func(gw_env=env, verbose=False))

        # Store a snapshot of the board, consistent with the reference above:
        # if get_board() hands out the environment's live array, un-copied
        # entries would all alias the final estimate and distort the average.
        reward_func_estimates.append(deepcopy(env.get_board()))

        print("Latest estimated reward function:\n", reward_func_estimates[-1])
        env.display_policy(policy=candidate_policies[-1])
        print("============================================================\n" * 2)

    return {
        'environment': env,
        'reference_reward_func': reward_func_ref,
        'policy_pred': candidate_policies[-1],
        'predicted_reward_func': reward_func_estimates[-1],
        'avg_predicted_reward_func': np.mean(np.array(reward_func_estimates), axis=0),
    }


def calc_value_distance(value_estimates_ref, value_estimates_pred):
    """Return the Euclidean (Frobenius) distance between two value estimates.

    Args:
        value_estimates_ref: reference values, either an array-like or a dict
            mapping state -> value.
        value_estimates_pred: predicted values in the same form.

    Returns:
        The norm of the element-wise difference.

    Raises:
        KeyError: if both inputs are dicts and the prediction lacks a state
            that is present in the reference.
    """
    # Dict-valued value functions cannot be subtracted directly, and their
    # insertion order need not match; align both on the reference's keys.
    if isinstance(value_estimates_ref, dict) and isinstance(value_estimates_pred, dict):
        states = list(value_estimates_ref)
        value_estimates_pred = [value_estimates_pred[state] for state in states]
        value_estimates_ref = [value_estimates_ref[state] for state in states]

    return np.linalg.norm(np.array(value_estimates_ref) - np.array(value_estimates_pred))

In [47]:
# --- Learning hyper-parameters -------------------------------------------
GAMMA = 0.95                        # discount factor shared by all agents
VALUE_ITERATION_TRAINING_N = 1000   # maximum number of value-iteration sweeps
IRL_TRAINING_N = 10                 # number of IRL refinement iterations

# --- Grid-world layout ---------------------------------------------------
GW_SIZE = (3, 3)
GW_SIZES = [GW_SIZE]  # [(x, x) for x in np.arange(5,11, 5)]
GW_GOALS = [(0, 0)]
GW_TRAPS = []

# --- Trajectory sampling, scaled with the number of grid cells -----------
NUMBER_OF_STATES = GW_SIZE[0] * GW_SIZE[1]
NUMBER_OF_TRAJECTORIES = 20 * NUMBER_OF_STATES     # 400
MAXIMUM_TRAJECTORY_LENGTH = 4 * NUMBER_OF_STATES   # 10


### Step-by-step code

In [48]:
# Deterministic board (no randomisation) built from the configured size,
# traps and goals.
environment = Grid_World(
    size=GW_SIZE,
    traps=GW_TRAPS,
    goals=GW_GOALS,
    randomize_board=False,
)


## *****************

In [49]:
# Expert policy via value iteration; verbose shows the value function and policy.
vi_greedy_policy = train_value_iteration(
    gw_env=environment,
    verbose=True,
)


Value function:
[[0.       1.       0.95    ]
 [1.       0.95     0.9025  ]
 [0.95     0.9025   0.857375]]
Policy:
[['x' '<' '<']
 ['^' '<' '<']
 ['^' '<' '<']]


## *****************

In [50]:
# Expert policy via Q-learning; verbose shows the Q-function and policy.
ql_greedy_policy = train_q_learning(
    gw_env=environment,
    verbose=True,
)

It took 636 episodes to converge to the optimal Q-function
Q function:
[[[0.         0.         0.         0.        ]
  [0.9025     0.9025     0.95       1.        ]
  [0.857375   0.9025     0.9025     0.95      ]]

 [[0.9025     0.9025     1.         0.95      ]
  [0.857375   0.857375   0.95       0.95      ]
  [0.81450625 0.857375   0.9025     0.9025    ]]

 [[0.9025     0.857375   0.95       0.9025    ]
  [0.857375   0.81450625 0.9025     0.9025    ]
  [0.81450625 0.81450625 0.857375   0.857375  ]]]
Policy:
[['x' '<' '<']
 ['^' '^' '^']
 ['^' '^' '^']]


In [51]:
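# Uncomment the code below to test repeated Q-Learning runs against a reference policy.
# NOTE: `optimal_policy` is not defined in this notebook; set it first (e.g. optimal_policy = vi_greedy_policy).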

#counter = 0
#for _ in range(100):
#    ql_greedy_policy = train_q_learning(gw_env=environment, verbose=False)
#    if ql_greedy_policy != optimal_policy:
        #print("ql_greedy_policy \n", ql_greedy_policy)
        #print("optimal_policy \n", optimal_policy)
        #environment.display_policy(policy=ql_greedy_policy)
        #print("-----------------------------")
#        counter += 1
#print(f"policy was wrong {counter} times." )
        

In [52]:
print(f"Generating {NUMBER_OF_TRAJECTORIES} trajectories...")

# Expert used for sampling demonstrations: the value-iteration policy.
# Swap in the Q-learning policy by using the commented line instead.
greedy_policy = vi_greedy_policy
# greedy_policy = ql_greedy_policy

trajectories = environment.generate_trajectories(
    policy=greedy_policy,
    n_traj=NUMBER_OF_TRAJECTORIES,
    max_traj_length=MAXIMUM_TRAJECTORY_LENGTH,
)


Generating 180 trajectories...


#### Generate data for IRL

In [None]:
print("IRL from samples...")

# Init configuration

GAMMA = 0.95
VALUE_ITERATION_TRAINING_N = 1000
IRL_TRAINING_N = 2

GW_SIZES = [(3, 3), (4,5), (7,7), (10,10)]  # [(x, x) for x in np.arange(5,11, 5)]
GW_TRAPS = []
GW_GOALS = [(0, 0)]

# File-name prefix of stored IRL results (the variable name is kept because
# the loading cell further down refers to it).
IRL_file_suffix = "IRL_"

# Make sure the output folder exists before it is listed / written to.
os.makedirs("data/IRL", exist_ok=True)

for GW_SIZE in GW_SIZES:

    NUMBER_OF_STATES = GW_SIZE[0] * GW_SIZE[1]

    NUMBER_OF_TRAJECTORIES = NUMBER_OF_STATES * 20 #400
    MAXIMUM_TRAJECTORY_LENGTH = NUMBER_OF_STATES * 4 #10

    # restart the environment
    environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

    target_reward = deepcopy(environment.get_board())

    vi_greedy_policy = train_value_iteration(gw_env=environment, verbose=False)
    vi_greedy_value_function = perform_value_evaluation(gw_env=environment, policy=vi_greedy_policy, verbose=False)

    # Expert demonstrations must come from THIS environment. Reusing the
    # `trajectories` sampled in an earlier cell would feed 3x3 demonstrations
    # to every grid size and contradict the metadata stored below.
    trajectories = environment.generate_trajectories(policy=vi_greedy_policy,
                                                     n_traj=NUMBER_OF_TRAJECTORIES,
                                                     max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)

    # train_func = train_value_iteration
    train_func = train_q_learning

    estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)

    # NOTE(review): irl_reward_estimation installs the estimated reward on the
    # environment, so this evaluates the predicted policy under the estimated
    # reward, not the target reward — confirm this is intended.
    vi_predicted_val_func = perform_value_evaluation(gw_env=estimated_rewards['environment'], policy=estimated_rewards['policy_pred'], verbose=False)

    # STORING

    # add data to dictionary for storing
    dict_to_store = {
        "metadata": {
            "algorithm": "IRL - Q Learning",
            "environment": "Grid_World",
            "env_n_of_states": NUMBER_OF_STATES,
            "env_size": GW_SIZE,
            "env_traps": GW_TRAPS,
            "env_goals": GW_GOALS,
            "gamma": GAMMA,
            "expert_n_of_trajs": NUMBER_OF_TRAJECTORIES,
            "expert_max_traj_length": MAXIMUM_TRAJECTORY_LENGTH
        },
        "data": {
            "target_reward": target_reward,
            "predicted_rewards": estimated_rewards['predicted_reward_func'],
            "expert_greedy_policy": vi_greedy_policy,
            "expert_greedy_val_func" : vi_greedy_value_function,
            "predicted_policy": estimated_rewards['policy_pred'],
            "predicted_policy_val_func": vi_predicted_val_func
        }
    }

    # find largest file number of the IRL data files in the "data/IRL" folder
    largest_file_number = 0

    for file_name in os.listdir("data/IRL"):
        # Only names that START with the prefix carry a file number.
        if not file_name.startswith(IRL_file_suffix):
            continue
        number_part = file_name[len(IRL_file_suffix):].split(".")[0]
        # Skip unrelated files such as "IRL_backup.pkl" instead of crashing.
        if not number_part.isdigit():
            continue
        largest_file_number = max(largest_file_number, int(number_part))

    print(f"Found largest file number of IRL data file: {largest_file_number}")

    print(f"Writing next IRL data file with number: {largest_file_number + 1}")
    # write dict to next file
    with open(os.path.join("data/IRL", f"{IRL_file_suffix}{largest_file_number + 1}.pkl"), "wb") as out_file:
        pickle.dump(dict_to_store, out_file)



IRL from samples...
Reference reward function:
 [[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Computing value estimates for optimal trajectories...


In [59]:
# Load one stored IRL result (file number 7) and show its contents.
# NOTE: pickle.load executes arbitrary code; only open files written by this notebook.
with open(os.path.join("data/IRL", f"{IRL_file_suffix}{7}.pkl"), "rb") as file:
    loaded_dict = pickle.load(file)
    print(loaded_dict)

{'metadata': {'algorithm': 'IRL - Q Learning', 'environment': 'Grid_World', 'env_n_of_states': 4, 'env_size': (2, 2), 'env_traps': [], 'env_goals': [(0, 0)], 'gamma': 0.95, 'expert_n_of_trajs': 80, 'expert_max_traj_length': 16}, 'data': {'target_reward': array([[1., 0.],
       [0., 0.]]), 'predicted_rewards': array([[-1.34821153, -1.44839011],
       [-1.44839011, -1.55262414]]), 'expert_greedy_policy': {(0, 0): (1, 0), (0, 1): (0, -1), (1, 0): (-1, 0), (1, 1): (0, -1)}, 'expert_greedy_val_func': {(0, 0): 0.0, (0, 1): 1.0, (1, 0): 1.0, (1, 1): 0.95}, 'predicted_policy': {(0, 0): (1, 0), (0, 1): (0, -1), (1, 0): (-1, 0), (1, 1): (-1, 0)}, 'predicted_policy_val_func': {(0, 0): 0.0, (0, 1): -1.3482115255684346, (1, 0): -1.3482115255684346, (1, 1): -2.7291910562390473}}}


#### Original IRL

In [10]:
print("IRL from samples...")

# Keep the trajectory settings in sync with the current GW_SIZE.
NUMBER_OF_STATES = GW_SIZE[0] * GW_SIZE[1]
NUMBER_OF_TRAJECTORIES = NUMBER_OF_STATES * 20
MAXIMUM_TRAJECTORY_LENGTH = NUMBER_OF_STATES * 4

# restart the environment
environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

# Expert policy and demonstrations for THIS environment; relying on
# `greedy_policy` / `trajectories` left over from earlier cells can pair the
# environment with demonstrations from a different grid size.
greedy_policy = train_value_iteration(gw_env=environment, verbose=False)
trajectories = environment.generate_trajectories(policy=greedy_policy,
                                                 n_traj=NUMBER_OF_TRAJECTORIES,
                                                 max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)

train_func = train_value_iteration
# train_func = train_q_learning

# Result accumulators (previously only defined in a later cell).
ref_reward_funcs = []
avg_pred_reward_funcs = []
reward_loss = []
policy_loss = []

estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)
ref_reward_funcs.append(estimated_rewards['reference_reward_func'])
avg_pred_reward_funcs.append(estimated_rewards['avg_predicted_reward_func'])
# Using default value for reward loss -> Frobenius for matrices and L2-loss for vectors
reward_loss.append(np.linalg.norm(estimated_rewards['reference_reward_func'] - estimated_rewards['avg_predicted_reward_func']))

# Using L1-Loss for policy loss as described by Ng and Russel in 2000.
# Both policies are dicts (state -> action), so they are aligned by state and
# converted to arrays first; subtracting an array from a dict raises TypeError.
# Assumes the predicted policy covers every state of the expert policy.
policy_states = list(greedy_policy)
expert_actions = np.array([greedy_policy[state] for state in policy_states], dtype=float)
predicted_actions = np.array([estimated_rewards['policy_pred'][state] for state in policy_states], dtype=float)
policy_loss.append(np.abs(predicted_actions - expert_actions).sum())

print('**********************************************')
print('*****************REWARD LOSS******************')
print(reward_loss)
print('**********************************************')
print('*****************POLICY LOSS*************************')
print(policy_loss)
print('**********************************************')


IRL from samples...
Reference reward function:
 [[1. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
Computing value estimates for optimal trajectories...
Iteration 0...
Generating trajectories for the candidate policy...
Computing value estimates for condidate trajectories...


KeyboardInterrupt: 

### End-to-end loop

In [None]:
# Pipeline switches.
vi = True
ql = False
gt = True
irl = True
# Named `make_plots` rather than `plt`: assigning to `plt` would replace the
# matplotlib.pyplot module and break the plotting calls at the end.
make_plots = False

print("configuration is:")
if vi:
    print("training: value iteration")
if ql:
    print("training: Q-Learning")
if gt:
    print("generating trajectories")
if irl:
    print("inverse reinforcment learning")
if make_plots:
    print("creating plots")

print("")

ref_reward_funcs = []
avg_pred_reward_funcs = []
reward_loss = []
policy_loss = []

for GW_SIZE in GW_SIZES:
    # Scale the trajectory settings with the current grid; otherwise the
    # values left over from an earlier cell are used for every size.
    NUMBER_OF_STATES = GW_SIZE[0] * GW_SIZE[1]
    NUMBER_OF_TRAJECTORIES = NUMBER_OF_STATES * 20
    MAXIMUM_TRAJECTORY_LENGTH = NUMBER_OF_STATES * 4

    environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

    train_func = train_value_iteration

    if vi:
        print("Training via value iteration...")
        greedy_policy = train_value_iteration(gw_env=environment, verbose=True)
    elif ql:
        print("Training via q-learning...")
        greedy_policy = train_q_learning(gw_env=environment, verbose=True)
        train_func = train_q_learning
    else:
        # load from file (?)
        greedy_policy = {}

    if gt:
        print(f"Generating {NUMBER_OF_TRAJECTORIES} trajectories...")
        trajectories = environment.generate_trajectories(policy=greedy_policy,
                                                         n_traj=NUMBER_OF_TRAJECTORIES,
                                                         max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)

    if irl:
        print("IRL from samples...")
        estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)
        ref_reward_funcs.append(estimated_rewards['reference_reward_func'])
        avg_pred_reward_funcs.append(estimated_rewards['avg_predicted_reward_func'])
        # Using default value for reward loss -> Frobenius for matrices and L2-loss for vectors
        reward_loss.append(np.linalg.norm(estimated_rewards['reference_reward_func'] - estimated_rewards['avg_predicted_reward_func']))

        # Using L1-Loss for policy loss as described by Ng and Russel in 2000.
        # Both policies are dicts (state -> action): align them by state and
        # convert to arrays; subtracting an array from a dict raises TypeError.
        # Assumes the predicted policy covers every state of the expert policy.
        policy_states = list(greedy_policy)
        expert_actions = np.array([greedy_policy[state] for state in policy_states], dtype=float)
        predicted_actions = np.array([estimated_rewards['policy_pred'][state] for state in policy_states], dtype=float)
        policy_loss.append(np.abs(predicted_actions - expert_actions).sum())

        print('**********************************************')
        print('*****************REWARD LOSS******************')
        print(reward_loss)
        print('**********************************************')
        print('*****************POLICY LOSS*************************')
        print(policy_loss)
        print('**********************************************')

print('reward_loss \n', reward_loss)
if make_plots:
    fig, ax = plt.subplots()
    ax.plot(reward_loss)
    ax.set(title="Reward loss per grid size", xlabel="grid index in GW_SIZES", ylabel="reward loss")
    fig.savefig('reward_loss.png')

print("Closing up the arena...")