## Inverse Reinforcement Learning for Financial Cliff Walking


This notebook contains implementations of three IRL algorithms for the Financial Cliff Walking (FCW) problem:

- Max Causal Entropy IRL
- IRL from Failure (IRLF)
- T-REX

In [28]:
import matplotlib.pyplot as plt 
%matplotlib inline

import numpy as np
import time

import warnings
warnings.filterwarnings('ignore')

In [30]:
# Global configuration
# Grid size: N (world height) and T (world width).
WORLD_HEIGHT, WORLD_WIDTH = 4, 12

# Epsilon: probability of taking an exploratory action.
EPSILON = 0.1
# Alpha: step size.
ALPHA = 0.001
# Gamma: discount factor, used for Q-Learning, Sarsa and Expected Sarsa.
GAMMA = 0.9

# Action codes:
#   ACTION_UP   -> a+ (adding a deposit)
#   ACTION_DOWN -> a- (redeeming a deposit)
#   ACTION_ZERO -> a0 (leaving the account as it is)
ACTION_UP, ACTION_DOWN, ACTION_ZERO = range(3)
ACTIONS = [ACTION_UP, ACTION_DOWN, ACTION_ZERO]

# Initial state and goal state, each given as a two-element list.
START = [1, 0]
GOAL = [0, WORLD_WIDTH - 1]

### Functions determining the step

In [33]:
# Step function that describes how the next state is obtained from the current state and the action
# taken. The function returns the next state and the reward obtained.
def step(state, action):
    """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

    ``state`` is a two-element sequence ``[i, j]``. The second coordinate
    ``j`` is the time index and advances by one per step, capped at
    ``WORLD_WIDTH - 1``. The first coordinate ``i`` moves up or down by one
    (or stays) depending on the action, and is clamped to
    ``[0, WORLD_HEIGHT - 1]``.

    Rules, in the order they are applied:

    * Bankrupt state (``i == 0`` and ``j > 0``): the action is ignored, the
      state stays in row 0 while time advances, and the reward is 0.
    * Final time (``j == WORLD_WIDTH - 1``): the state is unchanged and the
      reward is 0.
    * Otherwise the base reward is -1 for ``ACTION_UP``/``ACTION_DOWN`` and 0
      for ``ACTION_ZERO``, with these overrides:

      - -100 for ``ACTION_DOWN`` from row 1 at ``1 <= j < WORLD_WIDTH - 2``
        (bankruptcy before the final step);
      - -100 for ``ACTION_ZERO`` or ``ACTION_DOWN`` taken at ``START``;
      - when the move lands on the final column from a non-zero row: +10 if
        the resulting row is 0, else -10. This override takes precedence
        over the ones above.

    Args:
        state: current ``[i, j]`` pair (list, tuple or similar).
        action: one of ``ACTION_UP``, ``ACTION_DOWN``, ``ACTION_ZERO``.

    Returns:
        Tuple ``(next_state, reward)`` where ``next_state`` is a new
        two-element list.

    Raises:
        AssertionError: if ``action`` is not a known action and the state is
            neither bankrupt nor at the final time.
    """
    i, j = state
    last_col = WORLD_WIDTH - 1

    # Bankrupt: no action has any effect, only time moves forward.
    if i == 0 and j > 0:
        return [0, min(j + 1, last_col)], 0

    # At the final time the state no longer changes and nothing is earned.
    if j == last_col:
        return [i, j], 0

    if action == ACTION_UP:
        next_row = min(i + 1, WORLD_HEIGHT - 1)
    elif action == ACTION_DOWN:
        next_row = max(i - 1, 0)
    elif action == ACTION_ZERO:
        next_row = i
    else:
        # Raised explicitly so the check survives `python -O`.
        raise AssertionError("unknown action: %r" % (action,))
    next_state = [next_row, min(j + 1, last_col)]

    # Transactions cost -1 to keep them to a minimum; doing nothing is free.
    reward = 0 if action == ACTION_ZERO else -1

    # Compare coordinates rather than the container so tuple states also match.
    at_start = [i, j] == list(START)
    # Bound derived from WORLD_WIDTH: the move from column WORLD_WIDTH - 2 is
    # scored by the final-step rule below instead.
    early_bankruptcy = (action == ACTION_DOWN and i == 1
                        and 1 <= j < WORLD_WIDTH - 2)
    # At START only a deposit is acceptable.
    bad_first_move = at_start and action in (ACTION_ZERO, ACTION_DOWN)
    if early_bankruptcy or bad_first_move:
        reward = -100

    # Reaching the final time step: ending with zero balance is rewarded,
    # anything else is penalised. This overrides the rewards set above.
    if i != 0 and next_state[1] == last_col:
        reward = 10 if next_state[0] == 0 else -10

    return next_state, reward

# Choose an action based on epsilon greedy algorithm
def choose_action(state, q_value, eps=EPSILON):
    """Select an action for ``state`` using an epsilon-greedy rule.

    With probability ``eps`` an action is drawn uniformly from ``ACTIONS``.
    Otherwise one of the actions with the largest value in
    ``q_value[state[0], state[1], :]`` is drawn uniformly, so ties are broken
    at random. For a bankrupt state (``state[0] == 0`` and ``state[1] > 0``)
    the result is always ``ACTION_ZERO``.

    Args:
        state: indexable pair ``[i, j]``.
        q_value: array indexed as ``q_value[i, j, action]``.
        eps: exploration probability; defaults to ``EPSILON``.

    Returns:
        The chosen action index.
    """
    if np.random.binomial(1, eps) == 1:
        action = np.random.choice(ACTIONS)
    else:
        values_ = q_value[state[0], state[1], :]
        # Compute the maximum once rather than once per candidate action.
        best_value = np.max(values_)
        greedy_actions = [action_ for action_, value_ in enumerate(values_)
                          if value_ == best_value]
        action = np.random.choice(greedy_actions)
    # From bankrupt state there is no meaningful action, so we will assign 'Z' by convention.
    # The random draws above are still made for such states; only the result is replaced.
    if state[0] == 0 and state[1] > 0:
        action = ACTION_ZERO
    return action