In [1]:
import argparse
from typing import Any, List, Callable, Dict

from GridWorld_environments import Grid_World
from RL_agents import ValueIterationAgent, QLearningAgent
from IRL_agents import IRL_from_sampled_trajectories

from sklearn.preprocessing import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt

from copy import deepcopy

### Definitions

#### Train Value Iteration function

In [2]:
def train_value_iteration(gw_env: Grid_World, verbose=False):
    """Run value iteration on a grid world and return the resulting greedy policy.

    Every sweep visits all non-terminal states and backs up each state's value
    from the successor reached by the agent's currently optimal action. Sweeping
    stops once the agent reports convergence or after
    ``VALUE_ITERATION_TRAINING_N`` sweeps, whichever comes first.

    The notebook-level constants ``GAMMA`` and ``VALUE_ITERATION_TRAINING_N`` are
    read at call time, so they must be defined before this function is called.

    Args:
        gw_env: Grid world supplying states, actions, transitions and rewards.
        verbose: If True, display the final value function and the policy.

    Returns:
        The greedy policy as returned by the agent's ``get_policy()``.
    """
    agent = ValueIterationAgent(states=gw_env.get_state_space(),
                                terminal_states=gw_env.get_terminal_states(),
                                reward_function=gw_env.get_reward_func(),
                                actions=gw_env.get_action_space(),
                                gamma=GAMMA)

    sweep = 0
    while not (sweep >= VALUE_ITERATION_TRAINING_N or agent.value_converged):

        for current in gw_env.get_state_space():

            # Terminal states keep their value; only non-terminal states are backed up.
            if current not in gw_env.get_terminal_states():
                candidates = gw_env.get_action_state_pairs(state=current)
                best_action = agent.get_optimal_action(action_state_pairs=candidates)
                successor = gw_env.get_new_state_on_action(old_state=current, action=best_action)
                successor_value = agent.get_state_value(state=successor)

                # Backup: reward collected on entering the successor plus its discounted value.
                backed_up = gw_env.get_state_reward(state=successor) + GAMMA * successor_value
                agent.set_state_value(state=current, new_value=backed_up)

        sweep += 1

    if verbose:
        gw_env.display_value_function(value_func=agent.get_value_function())

    # The agent derives its greedy policy from the environment's action/state lookup.
    agent.construct_greedy_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=agent.get_policy())

    return agent.get_policy()


#### Train Q Learning function

In [3]:
def train_q_learning(gw_env: Grid_World, n_episodes=5000, verbose=False, policy="eps_greedy", eps=0.2, max_episode_len=100, gamma=0.95, convergence_factor=30):
    """Train a tabular Q-learning agent on a grid world and return its greedy policy.

    Episodes start in a non-terminal state, preferring states whose visit budget
    is not yet used up, and follow an epsilon-greedy behaviour policy. Training
    stops early once the Q-function has stayed numerically unchanged for
    ``total_states * convergence_factor`` consecutive episodes.

    Args:
        gw_env: Grid world supplying states, actions, transitions and rewards.
        n_episodes: Maximum number of training episodes.
        verbose: If True, print the convergence status and display the learned
            Q-function and policy.
        policy: Behaviour policy; only ``"eps_greedy"`` is implemented.
        eps: Probability of taking a uniformly random action instead of the
            greedy one.
        max_episode_len: Maximum number of steps per episode.
        gamma: Discount factor handed to the Q-learning agent.
        convergence_factor: Multiplier on the number of grid cells that gives the
            number of consecutive unchanged episodes required for convergence.

    Returns:
        The greedy policy as returned by the agent's ``get_policy()``.

    Raises:
        ValueError: If ``policy`` is not supported, or if the grid world has no
            non-terminal state to start an episode from.
    """
    # Fail fast: previously an unknown policy left `chosen_action` unbound and
    # surfaced as a NameError deep inside the episode loop.
    if policy != "eps_greedy":
        raise ValueError(f"Unsupported policy {policy!r}; only 'eps_greedy' is implemented")

    ql_agent = QLearningAgent(states=gw_env.get_state_space(),
                              size=gw_env.get_board_shape(),
                              terminal_states=gw_env.get_terminal_states(),
                              reward_function=gw_env.get_reward_func(),
                              actions=gw_env.get_action_space(),
                              gamma=gamma)

    board_size = gw_env.get_board_shape()
    total_states = board_size[0] * board_size[1]

    # The factor is empirical. The original note reasoned that a 3x3 grid
    # (9 states) needs about 100 stable episodes, i.e. a factor of at least ~11,
    # plus a safety buffer. The default of 30 is the value the code actually used.
    convergence_criterion = total_states * convergence_factor

    # Start states are all states except the terminal (absorbing) ones.
    terminal_states = gw_env.get_terminal_states()
    state_space = deepcopy(gw_env.get_state_space())
    for terminal_state in terminal_states:
        state_space.remove(terminal_state)
    if len(state_space) == 0:
        raise ValueError("Grid world has no non-terminal state to start an episode from")

    # Remaining visit budget per state; states with budget left are preferred as
    # start states, so states the current policy rarely reaches still get explored.
    state_visited = {state: 4 for state in state_space}

    convergence_counter = 0
    episodes_run = 0
    converged = False

    for _ in range(n_episodes):

        # Refill the budgets once every state has used up its own.
        if all(remaining <= 0 for remaining in state_visited.values()):
            state_visited = {state: 4 for state in state_space}

        # After the refill above at least one state always has budget left.
        states_not_visited = [state for state, remaining in state_visited.items() if remaining > 0]
        start = states_not_visited[np.random.choice(len(states_not_visited))]
        state_visited[start] -= 1

        current_state = start
        steps = 0
        terminal = False

        # Snapshot for the convergence check below. The copy guards against the
        # agent handing out a live view of its table - NOTE(review): whether
        # get_Q_function returns a copy is not visible here.
        old_q_val_func = deepcopy(ql_agent.get_Q_function(mat_repr=True))

        while steps < max_episode_len and not terminal:
            steps += 1

            if np.random.uniform() < (1 - eps):
                # Greedy action: highest Q-value in the current state.
                chosen_action = ql_agent.get_greedy_action(current_state)
            else:
                # Exploration: uniformly random action from the action space.
                action_space = gw_env.get_action_space()
                chosen_action = action_space[np.random.choice(len(action_space))]

            new_state = gw_env.get_new_state_on_action(current_state, chosen_action)

            # The agent was given the environment's reward function, so it looks
            # up the reward for this transition itself.
            ql_agent.update_Q_value(current_state, new_state, chosen_action)

            current_state = new_state

            if new_state in terminal_states:
                terminal = True
            else:
                # Only non-terminal states carry a visit budget.
                state_visited[new_state] -= 1

        episodes_run += 1

        # Count consecutive episodes that left the Q-function unchanged.
        if np.isclose(old_q_val_func, ql_agent.get_Q_function(mat_repr=True), atol=1e-08).all():
            convergence_counter += 1
            if convergence_counter >= convergence_criterion:
                converged = True
                break
        else:
            convergence_counter = 0

    if verbose:
        # An explicit flag is needed: the loop index is always below n_episodes,
        # so comparing it with n_episodes could never report non-convergence.
        if converged:
            print(f"It took {episodes_run} episodes to converge to the optimal Q-function")
        else:
            print(f"Did not converge to optimal Q-function in {n_episodes} episodes")

        gw_env.display_q_function(q_func=ql_agent.get_Q_function())

    ql_agent.construct_greedy_policy(gw_env.get_action_state_pairs)

    if verbose:
        gw_env.display_policy(policy=ql_agent.get_policy())

    return ql_agent.get_policy()


#### Perform value evaluation function (state-value evaluation of a given policy, e.g. the Q-Learning policy)

In [4]:
def perform_value_evaluation(gw_env: Grid_World, policy: Dict[Any, Any], verbose=False):
    """Evaluate a fixed policy by repeated state-value backups until convergence.

    For every non-terminal state the action prescribed by ``policy`` is followed
    and the state's value is set to the successor's reward plus its discounted
    value. Sweeps repeat until the agent reports ``value_converged``; there is no
    cap on the number of sweeps. The notebook-level constant ``GAMMA`` is read at
    call time.

    Args:
        gw_env: Grid world supplying states, transitions and rewards.
        policy: Mapping from state to the action to take in that state.
        verbose: If True, display the resulting value function.

    Returns:
        The value function as returned by the agent's ``get_value_function()``.
    """
    evaluator = ValueIterationAgent(states=gw_env.get_state_space(),
                                    terminal_states=gw_env.get_terminal_states(),
                                    reward_function=gw_env.get_reward_func(),
                                    actions=gw_env.get_action_space(),
                                    gamma=GAMMA)

    while True:
        if evaluator.value_converged:
            break

        for current in gw_env.get_state_space():
            # Terminal states are never backed up.
            if current not in gw_env.get_terminal_states():
                prescribed_action = policy[current]
                successor = gw_env.get_new_state_on_action(old_state=current, action=prescribed_action)
                successor_value = evaluator.get_state_value(state=successor)

                backed_up = gw_env.get_state_reward(state=successor) + GAMMA * successor_value
                evaluator.set_state_value(state=current, new_value=backed_up)

    if verbose:
        gw_env.display_value_function(value_func=evaluator.get_value_function())

    return evaluator.get_value_function()
            

#### IRL Reward estimation function

In [5]:
def irl_reward_estimation(env: Grid_World, optimal_trajectories: List[List[Any]], train_func: Callable):
    """Estimate a reward function from expert trajectories with iterative IRL.

    Starting from a random candidate policy, each of the ``IRL_TRAINING_N``
    iterations samples trajectories for the latest candidate policy, solves a
    linear program for new reward weights, installs the resulting reward
    function in ``env`` and trains a new candidate policy on it.

    Side effects: ``env``'s reward function is overwritten in every iteration and
    is not restored afterwards. Progress is printed on every iteration. The
    notebook-level constants ``GW_SIZE``, ``GAMMA``, ``IRL_TRAINING_N``,
    ``NUMBER_OF_TRAJECTORIES`` and ``MAXIMUM_TRAJECTORY_LENGTH`` are read at call
    time.

    Args:
        env: Grid world whose current board is taken as the reference reward.
        optimal_trajectories: Trajectories sampled from the expert policy.
        train_func: Callable invoked as ``train_func(gw_env=env, verbose=True)``
            that returns a policy for the environment's current reward function.

    Returns:
        A dict with the keys ``'reference_reward_func'`` (board before IRL),
        ``'policy_pred'`` (element-wise mean over all candidate policies'
        values - assumes actions are numeric, TODO confirm) and
        ``'avg_predicted_reward_func'`` (mean over the per-iteration boards).
    """
    # store reference reward function
    reward_func_ref = deepcopy(env.get_board())
    print('Reference reward function:\n', reward_func_ref)

    irl_agent = IRL_from_sampled_trajectories(d=(GW_SIZE[0] * 4, GW_SIZE[1] * 4),
                                              env_ranges=((0, GW_SIZE[0]), (0, GW_SIZE[1])),
                                              env_discrete_size=GW_SIZE,
                                              penalty_factor=2,
                                              gamma=GAMMA)

    # step 2: given optimal trajectories, compute the value estimate
    print("Computing value estimates for optimal trajectories...")
    optimal_value_estimate = irl_agent.compute_value_estimate(trajs=optimal_trajectories)

    candidate_policies = [env.construct_random_policy()]
    candidate_value_estimates = []
    reward_func_estimates = []

    for i in range(IRL_TRAINING_N):
        print(f"Iteration {i}...")

        # step 3: generate trajectories and compute the value estimate for the
        # latest candidate policy (a random policy in the first iteration)
        print("Generating trajectories for the candidate policy...")
        candidate_trajectories = env.generate_trajectories(policy=candidate_policies[-1],
                                                           n_traj=NUMBER_OF_TRAJECTORIES,
                                                           max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)
        print("Computing value estimates for condidate trajectories...")
        candidate_value_estimates.append(irl_agent.compute_value_estimate(trajs=candidate_trajectories))

        # step 4: obtain new alphas
        print("Solving linear programming...")
        irl_agent.solve_lp(optimal_value_estimate, candidate_value_estimates)

        # step 5: construct new reward function from the alphas
        reward_func = irl_agent.construct_reward_function(alphas=irl_agent.get_alphas())

        # step 6: find optimal policy under new reward function and add to 'candidate_policies' list
        env.set_reward_func(reward_func)
        candidate_policies.append(train_func(gw_env=env, verbose=True))

        # Store a snapshot of the new reward function. The copy mirrors how the
        # reference board is stored above and keeps later set_reward_func calls
        # from changing earlier estimates, should get_board return the live board.
        reward_func_estimates.append(deepcopy(env.get_board()))

        print("Latest estimated reward function:\n", reward_func_estimates[-1])
        env.display_policy(policy=candidate_policies[-1])
        print("============================================================\n" * 2)

    policy_pred = np.mean(np.array([list(pol.values()) for pol in candidate_policies]), axis=0)
    avg_predicted_reward_func = np.mean(np.array(reward_func_estimates), axis=0)

    return {'reference_reward_func': reward_func_ref,
            'policy_pred': policy_pred,
            'avg_predicted_reward_func': avg_predicted_reward_func}


def calc_value_distance(value_estimates_ref, value_estimates_pred):
    """Return the norm of the difference between two value estimates.

    Both arguments are converted to NumPy arrays and subtracted element-wise;
    the result is ``np.linalg.norm`` of that difference with default settings
    (Euclidean norm for vectors, Frobenius norm for matrices).
    """
    reference = np.array(value_estimates_ref)
    predicted = np.array(value_estimates_pred)
    difference = reference - predicted
    return np.linalg.norm(difference)

# Bayesian Inverse Reinforcement Learning

## Agenda

1. Motivation
2. Reference work
    1. Inverse Reinforcement Learning (IRL)
    2. Bayesian Inverse Reinforcement Learning (BIRL)
3. Implementation
4. Results
5. Outlook
    - Add Priors
        - Gaussian, ...
    - Add different Policies
        - Boltzmann
        - Epsilon Greedy
        - Greedy

## 1. Motivation

The problem of Inverse Reinforcement Learning was defined in *Learning agents for uncertain environments* (Russell, 1998):

- "**Determine**: reward function being optimized.
- **Given**: 1) Measurements of an agent's behavior over time, in a variety of circumstances; 2) Measurements of the sensory inputs to that agent; 3) a model of the environment." (Ng and Russell, 2000)

Reproduce the results of the case for Bayesian Inverse Reinforcement Learning.

The motivation for this problem arises, for example, from animal and human learning.

Example: bee foraging
- A bee searches for nectar.
- Reward: the nectar in a flower.
- The reward is possibly multi-attribute: nectar ingestion has to be weighed against flight distance, time, and risk from wind and predators.

Conclusion: it is hard to determine the relative weights here.

### The Bee hive
<img src="presentation/Bee_Hive_IRL.png" alt="bee hive" width="100%"/>
source: self made

The problem here: How do we actually properly set rewards for different attributes?

#### Second idea: 
Often we want to learn a policy by observing an expert. Reinforcement learning, however, assumes that the reward function is known, which is often not the case. So we take the expert's policy as given and infer the reward function from it.

### Hyper parameters

In [6]:
# Discount factor passed as `gamma` to the value-iteration and IRL agents
# (train_q_learning has its own `gamma` parameter and does not read this).
GAMMA = 0.95
# Upper bound on the number of sweeps in train_value_iteration.
VALUE_ITERATION_TRAINING_N = 25
# Number of iterations of the IRL loop in irl_reward_estimation.
IRL_TRAINING_N = 10

# Trajectories sampled per policy (`n_traj`) and their maximum length (`max_traj_length`).
NUMBER_OF_TRAJECTORIES = 400
MAXIMUM_TRAJECTORY_LENGTH = 10

# Grid world size; also used to configure the IRL agent in irl_reward_estimation.
GW_SIZE = (6, 6)
# NOTE(review): not referenced by any code visible in this notebook section.
GW_SIZES = [(3, 3)]  # [(x, x) for x in np.arange(5,11, 5)]
# Trap and goal coordinates passed to Grid_World.
GW_TRAPS = [(3,3)]
GW_GOALS = [(0, 0)]

In [7]:
# Grid world the expert policy is trained on, configured from the hyper parameters above.
# randomize_board=False presumably keeps the board layout deterministic - TODO confirm.
environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

In [8]:
# Train the expert with value iteration; verbose=True also displays the value function and the greedy policy.
vi_greedy_policy = train_value_iteration(gw_env=environment, verbose=True)

Value function:
[[0.         1.         0.95       0.9025     0.857375   0.81450625]
 [1.         0.95       0.9025     0.857375   0.81450625 0.77378094]
 [0.95       0.9025     0.857375   0.81450625 0.77378094 0.73509189]
 [0.9025     0.857375   0.81450625 0.         0.73509189 0.6983373 ]
 [0.857375   0.81450625 0.77378094 0.73509189 0.6983373  0.66342043]
 [0.81450625 0.77378094 0.73509189 0.6983373  0.66342043 0.63024941]]
Policy:
[['x' '<' '<' '<' '<' '<']
 ['^' '<' '<' '<' '<' '<']
 ['^' '<' '<' '<' '<' '<']
 ['^' '<' '<' 'x' '^' '<']
 ['^' '<' '<' '<' '<' '<']
 ['^' '<' '<' '<' '<' '<']]


## 2.1 Inverse Reinforcement Learning (IRL)

Given 
- finite State space $\mathcal{S}$
- a set of $k$ actions $A = \{a_1, \cdots , a_k\}$
- transition probabilities $\{P_{sa}\}$
- discount factor $\gamma$ 
- policy $\pi$ in Markov Decision Process $(S, A, \{P_{sa}\}, \gamma, R)$

Constraint:
- only finite-state MDPs

Required "tools"
- value iteration
- Q-learning

### Linear function approximation

- A general solution for $R$ would have to consider all functions mapping $\mathbb{R}^n \rightarrow \mathbb{R}$
    - This class is hard to work with, therefore we choose a linear approximation
    
$ \quad R(s) = \alpha_1 \phi_1(s) + \alpha_2 \phi_2(s) + \cdots + \alpha_d \phi_d(s) $

with the $\phi_i$s being fixed basis functions mapping from the state space $S$ into $\mathbb{R}$, and the $\alpha_i$s being the parameters that are fit to obtain the reward function.

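Let $V_i^\pi$ denote the value function of the policy $\pi$ when the reward function is $R = \phi_i$. By linearity of expectation, the value function $V^\pi$ for the policy $\pi$ is then given by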

$ \quad V^\pi = \alpha_1 V_1^\pi + \cdots + \alpha_d V_d^\pi$

### IRL from Sampled Trajectories

- Access to policy $\pi$ only given by a set of trajectories.
- Assume that an optimal policy can be found for any chosen reward function
- No explicit model of MDP required


The idea:
- sample trajectories
<img src="presentation/IRL_trajectories.png" width="50%" height="50%"/>

Execute $m$ Monte Carlo trajectories under $\pi$.
Then, for each $i = 1, \cdots , d$, define $\hat{V}_i^\pi(s_0)$ to be what the average empirical return would have been on these $m$ trajectories if the reward $R$ had been $\phi_i$.
For any setting of the $\alpha_i$s, the natural estimate of $V^\pi(s_0)$ is:

$\hat{V}^\pi(s_0) = \alpha_1 \hat{V}_1^\pi(s_0) + \cdots + \alpha_d \hat{V}_d^\pi(s_0)$

In [None]:
print("IRL from samples...")

# restart the environment so the IRL run starts from the reference reward function
environment = Grid_World(size=GW_SIZE, traps=GW_TRAPS, goals=GW_GOALS, randomize_board=False)

# Expert demonstrations: roll out the value-iteration policy trained earlier.
# `trajectories` was previously never defined, so this cell failed on a fresh kernel.
trajectories = environment.generate_trajectories(policy=vi_greedy_policy,
                                                 n_traj=NUMBER_OF_TRAJECTORIES,
                                                 max_traj_length=MAXIMUM_TRAJECTORY_LENGTH)

train_func = train_value_iteration
# train_func = train_q_learning

# The IRL loop lives in irl_reward_estimation. The copy of its body that used to
# follow this call was removed: it referenced `env` and `optimal_trajectories`,
# which exist only as parameters of that function, and it would have rerun the
# whole procedure on an environment whose reward function was already replaced.
estimated_rewards = irl_reward_estimation(env=environment, optimal_trajectories=trajectories, train_func=train_func)