In [86]:
import numpy as np
from scipy.special import softmax

In [291]:
class GridWorld:
    
    def __init__(self, rows, cols, start, goal):
        
        self.env = np.zeros((rows,cols))
        
        self.rows = rows
        
        self.cols = cols
        
        self.start = start 
        
        self.goal = goal
        
        self.env[goal] = 100
        
        self.weights = np.random.normal(2,3,2)
        
        # return a 2d array for transpose
        self.feature = lambda s,a: np.array([self.nextState(s,a)])
        
        self.qvalue = lambda s,a: ((self.feature(s,a))).dot(self.weights.T)
        
        self.beta = np.random.uniform(0,1)
        
    def heuristic(self, state, action):
        
        curState = [state[0],state[1]]
        
        nextState = self.nextState(state,action)
        
        dist = np.linalg.norm(nextState - curState)
        
        
        return dist
        
    def reward(self, state, action):
        
        if(action == 0):
            
            if(state[0] == 0):
                return -1000
            
            return self.env[state[0] - 1, state[1]]
        
        elif(action == 1):
            
            if(state[0] == self.cols - 1):
                return -1000
            
            return self.env[state[0] + 1, state[1]]
        
        elif(action == 2):
            
            if(state[1] == 0):
                return -1000
            
            return self.env[state[0], state[1] - 1]
        
        elif(action == 3):
        
            if(state[1] == self.rows - 1):
                return -1000
        
            return self.env[state[0], state[1] + 1]
        
        elif(action == 4):
            return self.env[state[0], state[1]]
        
    def nextState(self, state, action):
        
        if(action == 0):
            
            if(state[0] == 0):
                return [state[0], state[1]]
            
            return [state[0] - 1, state[1]]
        
        elif(action == 1):
            
            if(state[0] == self.cols - 1):
                return [state[0],state[1]]
            
            return [state[0] + 1, state[1]]
        
        elif(action == 2):
            
            if(state[1] == 0):
                return [state[0],state[1]]
            
            return [state[0], state[1] - 1]
        
        elif(action == 3):
        
            if(state[1] == self.cols - 1):
                return [state[0], state[1]]
        
            return [state[0], state[1] + 1]
        
        elif(action == 4):
            return [state[0], state[1]]
        

In [292]:
class Agent:
    
    def __init__(self, rows, cols, start):
        
        self.parameter = np.random.normal(2,3,2)
        
        self.rows = rows
        
        self.cols = cols
        
        self.actions = ['up','down','left','right','stay']
        
        self.feature = lambda s,a: np.array([self.nextState(s,a)])    
               
        self.sample_action = lambda s: np.random.choice(self.actions,p=self.policy_model(s))
       
        self.gamma = np.random.uniform(0,1)
        
        self.alpha = np.random.uniform(0,1)
        
        self.start = start
        
        
    def heuristic(self, state, action):
        
        curState = [state[0],state[1]]
        
        nextState = self.nextState(state,action)
        
        dist = np.linalg.norm(nextState - curState)
        
        return dist
        
        
    def policy_model(self, s):
        
        probs = np.array([])
        
        for i in range(len(self.actions)):
            
            probs = np.hstack((probs, ((self.feature(s,i))).dot(self.parameter.T)[0]))
            
    
        probs -= max(probs)
        
        
        n = np.exp(probs)
        
        probs = n/sum(n)
        
        print(probs)
        
        return probs
            
        
    def score_fn(self, state, action):
            
        avg = np.zeros((1,2))    
        
        probs = self.policy_model(state)
        
        for i in range(len(probs)):
                
                avg += self.feature(state,i) * probs[i]
        
        return self.feature(state,action) - avg
    
    
    def nextState(self, state, action):
        
        if(action == 0):
            
            if(state[0] == 0):
                return [state[0], state[1]]
            
            return [state[0] - 1, state[1]]
        
        elif(action == 1):
            
            if(state[0] == self.cols - 1):
                return [state[0],state[1]]
            
            return [state[0] + 1, state[1]]
        
        elif(action == 2):
            
            if(state[1] == 0):
                return [state[0],state[1]]
            
            return [state[0], state[1] - 1]
        
        elif(action == 3):
        
            if(state[1] == self.rows - 1):
                return [state[0], state[1]]
        
            return [state[0], state[1] + 1]
        
        elif(action == 4):
            return [state[0], state[1]]

        
        

In [293]:
def QAC():
    
    s = [0,0]
    
    a = agent.sample_action(s)
    
    print(a)
    
    a = agent.actions.index(a)
    
    print(a)
    
    count = 0
    
#     for i in range(1000):
        
#         count = 0
#         s = environment.start
    
    while True:

        
        if(s[0] == 2):
            if(s[1] == 3):
                break
        
        if(count == 100):
            break

        r = environment.reward(s,a)

#         print(r)

        s1 = environment.nextState(s, a)

        print(s1)

        a1 = agent.sample_action(s1)

        print(a1)
        
        a1 = agent.actions.index(a1)

        

        td_error = r + (agent.gamma * (environment.qvalue(s1,a1))) - environment.qvalue(s,a)


#         print(td_error)

        agent.parameter += agent.alpha * (agent.score_fn(s,a)[0]) * (environment.qvalue(s,a)[0])

#         print(agent.parameter)

        environment.weights += environment.beta * (td_error) * (environment.feature(s,a)[0])

#         print(environment.weights)

        a = a1

        s = s1

        count += 1
        

In [294]:
def final_policy():
    
    out = np.zeros((environment.rows, environment.cols))

    for i in range(environment.rows):
        
        for j in range(environment.cols):

#             print(agent.policy_model([i,j]))
                
            out[i][j] = np.argmax(agent.policy_model([i,j]))
    
    print(out)
            

In [301]:
environment = GridWorld(5,5,(0,0),(2,2))

In [302]:
agent = Agent(5,5,(0,0))

In [305]:
QAC()

[1.86331030e-22 2.30441868e-94 1.86331030e-22 1.00000000e+00
 1.86331030e-22]
right
3
[0, 1]
[1.86331030e-22 2.30441868e-94 3.47192528e-44 1.00000000e+00
 1.86331030e-22]
right
[1.86331030e-22 2.30441868e-94 1.86331030e-22 1.00000000e+00
 1.86331030e-22]
[0, 2]
[1.86331030e-22 2.30441868e-94 3.47192528e-44 1.00000000e+00
 1.86331030e-22]
right
[1.86331030e-22 2.30441868e-94 3.47192528e-44 1.00000000e+00
 1.86331030e-22]
[0, 3]
[1.86331030e-22 2.30441868e-94 3.47192528e-44 1.00000000e+00
 1.86331030e-22]
right
[1.86331030e-22 2.30441868e-94 3.47192528e-44 1.00000000e+00
 1.86331030e-22]
[0, 4]
[3.33333333e-01 4.12244573e-73 6.21103434e-23 3.33333333e-01
 3.33333333e-01]
up
[1.86331030e-22 2.30441868e-94 3.47192528e-44 1.00000000e+00
 1.86331030e-22]
[0, 4]
[3.33333333e-01 4.12244573e-73 6.21103434e-23 3.33333333e-01
 3.33333333e-01]
up
[3.33333333e-01 4.12244573e-73 6.21103434e-23 3.33333333e-01
 3.33333333e-01]
[0, 4]
[3.33333333e-01 4.12244573e-73 6.21103434e-23 3.33333333e-01
 3.3333

In [306]:
final_policy()

[1.86331030e-022 6.60622402e-130 1.86331030e-022 1.00000000e+000
 1.86331030e-022]
[1.86331030e-022 6.60622402e-130 3.47192528e-044 1.00000000e+000
 1.86331030e-022]
[1.86331030e-022 6.60622402e-130 3.47192528e-044 1.00000000e+000
 1.86331030e-022]
[1.86331030e-022 6.60622402e-130 3.47192528e-044 1.00000000e+000
 1.86331030e-022]
[3.33333333e-001 1.18180781e-108 6.21103434e-023 3.33333333e-001
 3.33333333e-001]
[1.00000000e+000 1.25700274e-215 3.54542344e-108 1.90275524e-086
 3.54542344e-108]
[1.00000000e+000 1.25700274e-215 6.60622402e-130 1.90275524e-086
 3.54542344e-108]
[1.00000000e+000 1.25700274e-215 6.60622402e-130 1.90275524e-086
 3.54542344e-108]
[1.00000000e+000 1.25700274e-215 6.60622402e-130 1.90275524e-086
 3.54542344e-108]
[1.00000000e+000 1.25700274e-215 6.60622402e-130 3.54542344e-108
 3.54542344e-108]
[1.00000000e+000 1.25700274e-215 3.54542344e-108 1.90275524e-086
 3.54542344e-108]
[1.00000000e+000 1.25700274e-215 6.60622402e-130 1.90275524e-086
 3.54542344e-108]
[1.0

In [166]:
probs = np.array([])

for i in range(len(agent.actions)):
    
    probs = np.hstack((probs, np.exp(((agent.feature([4,0],i))).dot(agent.parameter.T))[0]))

print(probs,np.argmax(probs))

[ 6.28390932 11.59595565 11.59595565 19.99821729 11.59595565] 3


In [167]:
environment.env

array([[  0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0., 100.,   0.],
       [  0.,   0.,   0.,   0.,   0.],
       [  0.,   0.,   0.,   0.,   0.]])

In [290]:
np.random.normal(2,3,2)

array([ 3.55313426, -1.07871902])