In [1]:
import numpy as np
import pandas as pd
import time
from IPython.display import clear_output as cls


This demo trains a tabular Q-learning agent in a simple example, a 1-D world:

S--------O

where 'S' is the start position and 'O' is the treasure.


In [2]:
# Simulation and Q-learning hyperparameters (tune these to change the experiment)
N_STATES = 6                # number of positions in the 1-D world
ACTIONS = ['left', 'right'] # names of the available actions
EPSILON = 0.9               # epsilon-greedy policy: a uniform draw above this value triggers a random action
ALPHA = 0.1                 # learning rate
GAMMA = 0.9                 # discount factor
MAX_EPISODES = 20           # number of training episodes
FRESH_TIME = 0.01            # pause, in seconds, after each move so the display can refresh

In [3]:
def build_Q_table(n_states, actions):
    """Create a Q-table with every state/action value initialised to zero.

    Parameters
    ----------
    n_states : int
        Number of rows (one per state).
    actions : list of str
        Action names, used as the column labels.

    Returns
    -------
    pandas.DataFrame
        Frame of zeros with ``n_states`` rows and one column per action.
    """
    shape = (n_states, len(actions))
    initial_values = np.zeros(shape)
    return pd.DataFrame(initial_values, columns=actions)

# Preview a freshly built table; as the cell's last expression it is displayed automatically.
build_Q_table(N_STATES, ACTIONS)

Unnamed: 0,left,right
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
5,0.0,0.0


In [4]:
def choose_action(state, Q_table, epsilon=None, actions=None):
    """Pick the next action for ``state`` using an epsilon-greedy policy.

    Parameters
    ----------
    state : int
        Row position of the current state in ``Q_table``.
    Q_table : pandas.DataFrame
        One row per state and one column per action name.
    epsilon : float, optional
        Threshold for the uniform draw; a draw above it triggers a random
        action. Defaults to the module-level ``EPSILON``.
    actions : sequence of str, optional
        Action names to sample from when acting randomly. Defaults to the
        module-level ``ACTIONS``.

    Returns
    -------
    str
        Name of the chosen action.
    """
    if epsilon is None:
        epsilon = EPSILON
    if actions is None:
        actions = ACTIONS

    state_actions = Q_table.iloc[state, :]
    # A state counts as unexplored only when *every* action value is still 0.
    # (`state_actions.all() == 0` would also fire when just one value is 0.)
    unexplored = (state_actions == 0).all()
    if np.random.uniform() > epsilon or unexplored:
        action_name = np.random.choice(actions)
    else:  # act greedy
        # idxmax returns the column label; argmax returns an integer position
        # in current pandas, which would not match the action names.
        action_name = state_actions.idxmax()
    return action_name

In [5]:
def get_env_feedback(S, A):
    """Apply action ``A`` in state ``S`` and report the environment's response.

    Any action other than 'right' is treated as a move to the left.

    Returns a ``(next_state, reward)`` tuple. ``next_state`` is the string
    'terminal' (with reward 1) when moving right from position
    ``N_STATES - 2``; every other transition yields reward 0.
    """
    if A != 'right':
        # Moving left: position 0 is the wall, so the agent stays where it is.
        next_state = S if S == 0 else S - 1
        return next_state, 0
    if S == N_STATES - 2:
        # Stepping right from the second-to-last position ends the episode.
        return 'terminal', 1
    return S + 1, 0

In [6]:
def display_env(S):
    """Clear the cell output and redraw the 1-D world for state ``S``.

    ``S`` is either an integer position, drawn as the rabbit at that index,
    or the string 'terminal', in which case the last cell is replaced by the
    success marker.
    """
    # Blank track cells followed by the treasure in the last slot.
    cells = ['  '] * (N_STATES - 1) + ['🥑']
    if S == 'terminal':
        cells[-1] = '🤞'
    else:
        cells[S] = '🐰'
    cls()
    print(''.join(cells))

# Try out the renderer: draw the marker at every index in turn, one frame per second.
for i in range(N_STATES): 
    display_env(i)
    time.sleep(1)

          🐰


In [7]:
def rl():
    """Run the Q-learning training loop and return the learned Q-table.

    Each of the ``MAX_EPISODES`` episodes starts at position 0 and runs until
    the environment reports the 'terminal' state. After every move the world
    and the current Q-table are redrawn, followed by a ``FRESH_TIME`` pause.
    """
    Q_table = build_Q_table(N_STATES, ACTIONS)
    for _ in range(MAX_EPISODES):
        state = 0
        display_env(state)
        # The episode ends once the agent has stepped into the terminal state.
        while state != 'terminal':
            action = choose_action(state, Q_table)
            new_state, reward = get_env_feedback(state, action)
            current_estimate = Q_table.loc[state, action]
            if new_state == 'terminal':
                # Nothing to bootstrap from after the episode ends.
                td_target = reward
            else:
                best_next_value = Q_table.iloc[new_state, :].max()
                td_target = reward + GAMMA * best_next_value
            # Move the estimate a fraction ALPHA towards the target.
            Q_table.loc[state, action] += ALPHA * (td_target - current_estimate)
            state = new_state
            display_env(state)
            print(Q_table)
            time.sleep(FRESH_TIME)
    return Q_table

In [8]:
# Run the full training loop and keep the resulting Q-table for inspection.
learned_Q_table = rl()

          🤞
           left     right
0  1.121931e-06  0.038931
1  5.904900e-07  0.122046
2  9.767361e-05  0.298117
3  1.430240e-02  0.571415
4  4.818853e-02  0.878423
5  0.000000e+00  0.000000
