In [1]:
import numpy as np
from maze import *

In [2]:
def policy_evaluation(Policy, Values, Q, env, Gamma, Theta):
    """Sweep a Bellman optimality backup over all states until it converges.

    Despite its name, every sweep takes the maximum over actions (i.e. this is
    value iteration) instead of evaluating ``Policy``. ``Policy`` is kept in
    the signature for the existing callers but is never read.

    Each action ``a`` is modelled as being executed as intended with
    probability 0.9 and being replaced by ``ACTMAP[a]`` (module-level mapping)
    with probability 0.1. Both outcomes are obtained by calling
    ``env.step(s, action)``.

    Parameters:
        Policy: unused.
        Values: 1-D array of state values, updated in place.
        Q: 2-D array indexed ``[state, action]``, updated in place. Its shape
            determines how many states and actions are swept.
        env: object exposing ``step(state, action)`` that returns
            ``(reward, next_state, done)``.
        Gamma: discount factor applied to the successor state's value.
        Theta: the sweep loop stops once the largest per-state change in
            ``Values`` during a sweep is below this threshold.

    Returns:
        ``(Values, Q)`` -- the same objects that were passed in.
    """
    prob = 0.9        # probability that the chosen action is executed
    slip_prob = 0.1   # probability that ACTMAP[a] is executed instead

    # Take the problem size from the table itself so the function does not
    # depend on module-level `States` / `Actions` (or on a literal 4) being
    # defined and kept in sync with the arrays.
    n_states, n_actions = Q.shape

    while True:
        delta = 0
        for s in range(n_states):
            v = Values[s]
            future_values = np.zeros(n_actions)

            for a in range(n_actions):
                # NOTE(review): the `done` flag is ignored, so a terminal
                # transition still bootstraps from Values[next_state] --
                # confirm that this is intended for this environment.
                reward, next_state, done = env.step(s, a)
                future_values[a] = prob * (reward + Gamma * Values[next_state])

                a1 = ACTMAP[a]  # action taken when the agent slips

                reward_slip, next_state_slip, done = env.step(s, a1)
                future_values[a] += slip_prob * (reward_slip + Gamma * Values[next_state_slip])
                Q[s, a] = future_values[a]

            # In-place update: later states in the same sweep already see
            # the refreshed value of this state.
            Values[s] = np.max(future_values)

            delta = max(delta, np.abs(v - Values[s]))

        if delta < Theta:
            break

    return Values, Q
        
    

In [3]:
def policy_improvement(Policy, Values, Q, env, Gamma, Theta):
    """Alternate value sweeps and greedy policy extraction until stable.

    Each round calls ``policy_evaluation`` to refresh ``Values`` and ``Q``,
    then sets every state's action to the arg-max of its row in ``Q``. The
    loop ends after the first round in which no state's action changed.

    Parameters:
        Policy: per-state action indices; overwritten in place.
        Values, Q, env, Gamma, Theta: forwarded unchanged to
            ``policy_evaluation``.

    Returns:
        ``(Values, Policy, Q)`` where ``Values`` and ``Q`` are whatever the
        last ``policy_evaluation`` call returned and ``Policy`` is the
        (mutated) input object.
    """
    policy_stable = False

    while not policy_stable:
        Values, Q = policy_evaluation(Policy, Values, Q, env, Gamma, Theta)

        # Use the table's own row count rather than a module-level `States`,
        # so the function works regardless of which cells ran before it.
        n_states = Q.shape[0]

        # Row-wise arg-max; ties resolve to the lowest action index, the same
        # as calling np.argmax on each row separately.
        greedy_actions = np.argmax(Q, axis=1)

        policy_stable = bool(np.array_equal(greedy_actions, Policy[:n_states]))
        Policy[:n_states] = greedy_actions

    return Values, Policy, Q
                
            

In [4]:
# --- Problem setup ---------------------------------------------------------
env = Maze()
States = 112          # number of rows in Values / Policy / Q
Actions = 4           # number of columns in Q
Gamma = 0.9           # discount factor passed to the solver
# Action substituted for each intended action when the agent slips
# (read as a module-level name inside policy_evaluation).
ACTMAP = {0:3, 1:2, 2:0, 3:1}
# Convergence threshold. NOTE(review): 1e-200 is far below double-precision
# resolution for values of the magnitude printed below, so the sweep loop in
# effect only stops once a full sweep changes no value at all -- confirm that
# this is intended.
Theta = 1e-200
Values = np.zeros(States)
# Random initial policy, sized from the constants above instead of repeating
# the literals 4 and 112 so the arrays cannot drift out of sync.
Policy = np.random.randint(0, Actions, size=States)
Q = np.zeros((States, Actions))
# NOTE(review): presumably turns off the environment's own slipping, since
# policy_evaluation models the slip explicitly -- verify against maze.py.
env.slip = 0


# --- Solve -----------------------------------------------------------------
Values, Policy, Q = policy_improvement(Policy, Values, Q, env, Gamma, Theta)

# Scale the action-value table by its largest entry.
NormVal = np.max(Q)
NormQ = Q/NormVal

#print("Q:")
#print(Q)
#print("")

# --- Report ----------------------------------------------------------------
print("Normilized Q:")
print(NormQ)
print("")

print("Value Function:")
print(Values)
print("")

print("Policy:")
print(Policy)
print("")

# Persist the scaled table (np.save appends ".npy" to the file name).
np.save('Norm_Q_Table' ,NormQ)
#np.save('Q_Table' ,Q)

Normilized Q:
[[0.15875732 0.17639702 0.15875732 0.16071728]
 [0.20644776 0.2293864  0.20644776 0.2089965 ]
 [0.25893158 0.28770175 0.25893158 0.26212826]
 [0.25209153 0.2801017  0.25209153 0.25520377]
 [0.33411207 0.37123563 0.33411207 0.33823691]
 [0.32202634 0.35780704 0.32202634 0.32600197]
 [0.38070466 0.42300518 0.38070466 0.38540472]
 [0.48254785 0.53616428 0.48254785 0.48850523]
 [0.16291922 0.17835699 0.17639702 0.19817443]
 [0.2118599  0.23193514 0.2293864  0.25770571]
 [0.2657196  0.29089844 0.28770175 0.32322049]
 [0.25870024 0.28321394 0.2801017  0.31468216]
 [0.34287099 0.37536047 0.37123563 0.41706719]
 [0.33046842 0.36178268 0.35780704 0.40198075]
 [0.39068503 0.42770524 0.42300518 0.47522804]
 [0.49519809 0.54212166 0.53616428 0.6023574 ]
 [0.20969539 0.23349159 0.23115667 0.20845403]
 [0.30515204 0.33955409 0.33615855 0.30497441]
 [0.2363742  0.23349159 0.23349159 0.2594351 ]
 [0.30804948 0.34284199 0.33941357 0.30741143]
 [0.34374612 0.33955409 0.33955409 0.37728232]

In [45]:
# Replay the computed policy on a fresh maze with env.slip set to 0,
# drawing the board and printing the state index after every step.
env = Maze()
env.slip = 0
state = initial_state = env.reset()
done = False

while not done:
    action = Policy[state]
    reward, state, done = env.step(state, action)
    env.plot(state, action)
    print(state)

action:  DOWN
SWFWG
[1;32mO[0mOOOO
WOOOW
FOWFW
8
action:  RIGHT
SWFWG
O[1;32mO[0mOOO
WOOOW
FOWFW
24
action:  RIGHT
SWFWG
OO[1;32mO[0mOO
WOOOW
FOWFW
56
action:  UP
SW[1;32mF[0mWG
OOOOO
WOOOW
FOWFW
49
action:  DOWN
SWFWG
OO[1;32mO[0mOO
WOOOW
FOWFW
57
action:  RIGHT
SWFWG
OOO[1;32mO[0mO
WOOOW
FOWFW
73
action:  DOWN
SWFWG
OOOOO
WOO[1;32mO[0mW
FOWFW
81
action:  DOWN
SWFWG
OOOOO
WOOOW
FOW[1;32mF[0mW
93
action:  UP
SWFWG
OOOOO
WOO[1;32mO[0mW
FOWFW
85
action:  UP
SWFWG
OOO[1;32mO[0mO
WOOOW
FOWFW
77
action:  RIGHT
SWFWG
OOOO[1;32mO[0m
WOOOW
FOWFW
109
action:  UP
SWFW[1;32mG[0m
OOOOO
WOOOW
FOWFW
101
