# Imports

In [1]:
import cs7641assn4 as a4
import numpy as np
import pandas as pd

# Establish Environment

In [2]:
env_id = 'Deterministic-4x4-FrozenLake-v0' # string identifier for environment, arbitrary label
rH = -1 #-5 # reward for H(ole)
rG = 1 # 10 # reward for G(oal)
rF = -0.2# reward includes S(tart) and F(rozen)
size = 4 # height and width of square gridworld
p = 0.8 # if generating a random map probability that a grid will be F(rozen)
desc = None # frozen_lake.generate_random_map(size=size, p=p)
map_name = 'x'.join([str(size)]*2) # None
is_slippery = False


epsilon = 1e-8 # convergence threshold for policy/value iteration
gamma = 0.8 # discount parameter for past policy/value iterations
max_iter = 10000 # maximum iterations for slowly converging policy/value iteration 

# Qlearning(env, rH=0, rG=1, rF=0, qepsilon=0.1, lr=0.8, gamma=0.95, episodes=10000)
qepsilon = 0.1 # epsilon value for the Q-learning epsilon greedy strategy
lr = 0.8 # Q-learning rate
qgamma = 0.95 # Q-Learning discount factor
episodes = 10000 # number of Q-learning episodes
initial = 0 # value to initialize the Q grid

# Create Environment
env = a4.getEnv(env_id=env_id, rH=rH, rG=rG, rF=rF, 
                desc=desc, map_name=map_name, 
                is_slippery=is_slippery,render_initial=True)

# Store a representation of the map
env_desc = env.desc.astype('<U8')

# Store a representation of the state rewards
env_rs = a4.getStateReward(env)

# Display reward at each state
print('\n--Reward Values at Each State--')
a4.matprint(a4.print_value(env_rs,width=size,height=size))

--Board--

[41mS[0m  F  F  F
F  H  F  H
F  F  F  H
H  F  F  G

--Actions for Position to the Left of the Goal--
{0: [(1.0, 13, -0.2, False)],
 1: [(1.0, 14, -0.2, False)],
 2: [(1.0, 15, 1, True)],
 3: [(1.0, 10, -0.2, False)]}

--Reward Values at Each State--
-0.2  -0.2  -0.2  -0.2  
-0.2    -1  -0.2    -1  
-0.2  -0.2  -0.2    -1  
  -1  -0.2  -0.2     1  


# Policy Iteration

In [3]:
pi_time = %timeit -o a4.policy_iteration(env, epsilon=epsilon, gamma=gamma, max_iter=max_iter, report=False)

951 µs ± 95.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
pi_V, pi_policy, pi_epochs = a4.policy_iteration(env, epsilon=epsilon, gamma=gamma, max_iter=max_iter, report=True)

# Display values
a4.matprint(a4.print_value(pi_V))

pi_policy_arrows = a4.print_policy(pi_policy, width=size, height=size)

# Display policy
a4.matprint(pi_policy_arrows)

Policy iteration converged after  7 epochs
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  


# Value Iteration

In [5]:
vi_time = %timeit -o a4.valueIteration(env, epsilon=epsilon, gamma=gamma, max_iter=max_iter, report=False)

6.38 ms ± 481 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
vi_V, vi_epochs = a4.valueIteration(env, epsilon=epsilon, gamma=gamma, max_iter=max_iter, report=True)

# display value function:
a4.matprint(a4.print_value(vi_V))

vi_policy = a4.value_to_policy(env, V=vi_V, gamma=gamma)

vi_policy_arrows = a4.print_policy(vi_policy, width=size, height=size)
# display policy
a4.matprint(vi_policy_arrows)

Value iteration converged after  91 epochs
0.9661  1.4576  2.072  1.4576  
1.4576      -5   2.84      -5  
 2.072    2.84    3.8      -5  
    -5     3.8      5       5  
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  


# Q-Learning

In [10]:
Q_time = %timeit -o a4.Qlearning(env, qepsilon, lr, qgamma, episodes)

1.43 s ± 98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
Q = a4.Qlearning(env, qepsilon, lr, qgamma, episodes)
print('--Q with all options--')
a4.matprint(Q)

maxQ = np.max(Q,axis=1)
print('\n--argmax(Q) in grid order--')
a4.matprint(a4.print_value(maxQ))

Q_policy = a4.Q_to_policy(Q)

Q_policy_arrows = a4.print_policy(Q_policy, width=size, height=size)
print('\n--Policy Matrix--')
a4.matprint(Q_policy_arrows)

--Q with all options--
0.373797  0.603997   0.603997   0.373797  
0.364615     -1.95   0.846312   0.601849  
0.603982   1.10138   0.603471   0.846216  
0.846293    -1.872  -0.738509  -0.738509  
0.603997  0.846312      -1.95   0.373797  
      -1        -1         -1         -1  
   -1.95   1.36987      -1.95   0.845892  
      -1        -1         -1         -1  
0.846312     -1.95    1.10138   0.603997  
0.846312   1.36987    1.36987      -1.95  
 1.10138    1.6525      -1.95    1.10138  
      -1        -1         -1         -1  
      -1        -1         -1         -1  
   -1.95   1.36987     1.6525    1.10138  
 1.36987    1.6525       1.95    1.36987  
       1         1          1          1  

--argmax(Q) in grid order--
 0.604  0.8463  1.1014  0.8463  
0.8463      -1  1.3699      -1  
1.1014  1.3699  1.6525      -1  
    -1  1.6525    1.95       1  

--Policy Matrix--
↓  →  ↓  ←  
↓  ←  ↓  ←  
→  ↓  ↓  ←  
←  →  →  ←  


In [8]:
Q_s, Q_steps = a4.Qlearning_trajectory(env, Q, render=False)


--Final position--
  (Right)
S  F  F  F
F  H  F  H
F  F  F  H
H  F  F  [41mG[0m
Agent ended up at state 15 after 6 steps


# Notes

Default rewards are 1 for the G(oal) and 0 for everything else.

Maps are drawn according to the following logic

```
if desc and map_name are None, 
   then a default random map is drawn with 8
        using frozen_lake.generate_random_map(size=8, p=0.8)
elif desc is None and a map_name is given
   then a map_name is either '4x4' or '8x8'
        and is drawn from the dict MAPS in frozen_lake.py
elif desc is given
   then it must be in the form of a list with 
```

Default action probabilities are 1/3 chosen action, 1/3 each for right angles to chosen action, and 0 for reverse of chosen action. This is set with `is_slippery=True`. If `is_slippery=False`, then P=1 for chosen action and 0 for all other actions.

|ACTION|Value|Symbol|
|------|-----|------|
|LEFT  | 0   | ←    |
|DOWN  | 1   | ↓    |
|RIGHT | 2   | →    |
|UP    | 3   | ↑    |

# Sources

- Code: <https://github.com/Twice22/HandsOnRL>
- Tutorial: <https://twice22.github.io/>

In [9]:
a4.policy_matrix(Q)

AttributeError: module 'cs7641assn4' has no attribute 'policy_matrix'

In [17]:
results = pd.DataFrame({'env_id': [env_id],
                        'rH': [rH], 
                        'rG': [rG], 
                        'rF': [rF], 
                        'size': [size], 
                        'p': [p], 
                        'desc': [desc], 
                        'map_name': [map_name],                        
                        'is_slippery': [is_slippery],
                        'epsilon': [epsilon],
                        'gamma': [gamma], 
                        'max_iter': [max_iter], 
                        'qepsilon': [qepsilon], 
                        'lr': [lr], 
                        'qgamma': [qgamma], 
                        'episodes': [episodes], 
                        'env_desc': [env_desc],
                        'env_rs': [env_rs],
                        'pi_time': [pi_time.average],
                        'pi_V': [pi_V],
                        'pi_epochs': [pi_epochs],
                        'pi_policy': [pi_policy],
                        'pi_policy_arrows': [pi_policy_arrows],
                        'vi_time': [vi_time.average],
                        'vi_V': [vi_V],
                        'vi_epochs': [vi_epochs],
                        'vi_policy': [vi_policy],
                        'vi_policy_arrows': [vi_policy_arrows],
                        'Q_time': [Q_time.average],
                        'Q': [Q],
                        'Q_V': [maxQ],
                        'Q_policy': [Q_policy],
                        'Q_policy_arrows': [Q_policy_arrows]})

In [18]:
display(results)

Unnamed: 0,env_id,rH,rG,rF,size,p,desc,map_name,is_slippery,epsilon,...,vi_time,vi_V,vi_epochs,vi_policy,vi_policy_arrows,Q_time,Q,Q_V,Q_policy,Q_policy_arrows
0,Deterministic-4x4-FrozenLake-v0,-1,1,-0.2,4,0.8,,4x4,False,1e-08,...,0.006382,"[0.9660799905143127, 1.4575999905143127, 2.071...",91,"[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,...",1.428201,"[[0.37379674921874956, 0.6039965781249996, 0.6...","[0.6039965781249996, 0.8463121874999997, 1.101...","[1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 0, 0, 2, 2, 0]","[[↓, →, ↓, ←], [↓, ←, ↓, ←], [→, ↓, ↓, ←], [←,..."
