In [1]:
import numpy as np

In [2]:
class GridWorld:
    """Rectangular grid environment paired with a linear action-value critic.

    The grid is stored in ``self.env`` with shape ``(rows, cols)``; every cell
    holds 0 except the goal cell, which holds 100.  States are ``[row, col]``
    pairs and actions are the integers 0-4:

        0 -> row - 1, 1 -> row + 1, 2 -> col - 1, 3 -> col + 1, 4 -> no move.

    The critic is linear in the coordinates of the successor state:
    ``qvalue(s, a) = feature(s, a) . weights``.

    Attributes:
        env: ``(rows, cols)`` float array of cell values.
        rows, cols: grid dimensions.
        start, goal: ``(row, col)`` positions as passed to the constructor.
        weights: length-2 critic weight vector, drawn from U(1, 100).
        beta: critic step size, drawn from U(0, 1).
    """

    # (row delta, column delta) applied by each action index.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1), 4: (0, 0)}

    def __init__(self, rows, cols, start, goal):
        self.env = np.zeros((rows, cols))
        self.rows = rows
        self.cols = cols
        self.start = start
        self.goal = goal

        # Index element-wise so that a list-valued goal marks one cell
        # instead of triggering NumPy fancy indexing over whole rows.
        self.env[goal[0], goal[1]] = 100

        # Random draws are kept in the original order (weights, then beta)
        # so seeded runs reproduce the same values.
        self.weights = np.random.uniform(1, 100, 2)
        self.beta = np.random.uniform(0, 1)

    def feature(self, s, a):
        """Return the successor state of ``(s, a)`` as a ``(1, 2)`` array."""
        return np.array([self.nextState(s, a)])

    def qvalue(self, s, a):
        """Return the critic estimate for ``(s, a)`` as a length-1 array."""
        return self.feature(s, a).dot(self.weights.T)

    def reward(self, state, action):
        """Return the reward for taking ``action`` in ``state``.

        A legal move (or action 4) yields the value of the cell that is
        entered.  A move blocked by the grid edge yields the value of the
        current cell minus 1.  An unrecognised action yields ``None``.
        """
        target = self.nextState(state, action)
        if target is None:
            return None

        row, col = state[0], state[1]
        # A non-"stay" action that leaves the position unchanged hit an edge.
        if action != 4 and target == [row, col]:
            return self.env[row, col] - 1
        return self.env[target[0], target[1]]

    def nextState(self, state, action):
        """Return the ``[row, col]`` reached by taking ``action`` in ``state``.

        Moves that would leave the grid keep the position unchanged.  The row
        index is bounded by ``rows`` and the column index by ``cols``.  An
        unrecognised action yields ``None``.
        """
        if action not in self._MOVES:
            return None

        d_row, d_col = self._MOVES[action]
        row, col = state[0] + d_row, state[1] + d_col
        if 0 <= row < self.rows and 0 <= col < self.cols:
            return [row, col]
        return [state[0], state[1]]
        

In [3]:
class Agent:
    """Softmax policy (actor) over five grid moves.

    Action preferences are linear in the coordinates of the successor state:
    ``preference(s, a) = feature(s, a) . parameter``.  States are
    ``[row, col]`` pairs; action indices follow ``self.actions``.

    Attributes:
        parameter: length-2 policy parameter vector, drawn from U(1, 100).
        rows, cols: grid dimensions used for boundary checks.
        actions: action names; the list index is the action id.
        gamma: value drawn from U(0, 1).
        alpha: value drawn from U(0, 1).
        start: start position as passed to the constructor.
    """

    # (row delta, column delta) applied by each action index.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1), 4: (0, 0)}

    def __init__(self, rows, cols, start):
        # Random draws are kept in the original order (parameter, gamma,
        # alpha) so seeded runs reproduce the same values.
        self.parameter = np.random.uniform(1, 100, 2)
        self.rows = rows
        self.cols = cols
        self.actions = ['up', 'down', 'left', 'right', 'stay']
        self.gamma = np.random.uniform(0, 1)
        self.alpha = np.random.uniform(0, 1)
        self.start = start

    def feature(self, s, a):
        """Return the successor state of ``(s, a)`` as a ``(1, 2)`` array."""
        return np.array([self.nextState(s, a)])

    def sample_action(self, s):
        """Draw an action name from ``self.actions`` according to the policy at ``s``."""
        return np.random.choice(self.actions, p=self.policy_model(s))

    def policy_model(self, s):
        """Return the softmax action probabilities at state ``s``.

        Returns:
            1-D array with one probability per entry of ``self.actions``.
        """
        logits = np.array([
            self.feature(s, i).dot(self.parameter.T)[0]
            for i in range(len(self.actions))
        ])
        # Shift by the largest logit before exponentiating.  The result is
        # mathematically unchanged, but exp() can no longer overflow to inf
        # (which would turn the probabilities into NaN).
        weights = np.exp(logits - np.max(logits))
        return weights / np.sum(weights, axis=0)

    def score_fn(self, state, action):
        """Return ``feature(state, action)`` minus its expectation under the policy.

        Returns:
            ``(1, 2)`` array.
        """
        probs = self.policy_model(state)
        expected = np.zeros((1, 2))
        for i, p in enumerate(probs):
            expected += self.feature(state, i) * p
        return self.feature(state, action) - expected

    def nextState(self, state, action):
        """Return the ``[row, col]`` reached by taking ``action`` in ``state``.

        Moves that would leave the grid keep the position unchanged.  The row
        index is bounded by ``rows`` and the column index by ``cols``.  An
        unrecognised action yields ``None``.
        """
        if action not in self._MOVES:
            return None

        d_row, d_col = self._MOVES[action]
        row, col = state[0] + d_row, state[1] + d_col
        if 0 <= row < self.rows and 0 <= col < self.cols:
            return [row, col]
        return [state[0], state[1]]

        
        

In [8]:
# 5x5 grid: episodes start in the top-left cell, goal is the bottom-right cell.
environment = GridWorld(rows=5, cols=5, start=(0, 0), goal=(4, 4))

In [9]:
# Actor for the same 5x5 grid, starting in the top-left cell.
agent = Agent(rows=5, cols=5, start=(0, 0))

In [10]:
def QAC(env=None, actor=None, max_steps=None):
    """Run one episode of the actor-critic update loop until the goal is reached.

    Each step takes the current action, samples the next action from the
    actor's policy, and then applies two in-place updates:

        actor.parameter += alpha * score_fn(s, a) * Q(s, a)
        env.weights     += beta  * td_error       * feature(s, a)

    where ``td_error = r + gamma * Q(s', a') - Q(s, a)``.

    The sampled initial action (name, then index) and every successor state
    are printed as the episode progresses.

    Args:
        env: environment exposing ``start``, ``goal``, ``reward``,
            ``nextState``, ``feature``, ``qvalue``, ``weights`` and ``beta``.
            Defaults to the notebook-level ``environment``.
        actor: policy exposing ``actions``, ``sample_action``, ``score_fn``,
            ``parameter``, ``alpha`` and ``gamma``.  Defaults to the
            notebook-level ``agent``.
        max_steps: optional cap on the number of transitions.  ``None``
            (the default) runs until the goal is reached, however long
            the stochastic policy takes.
    """
    env = environment if env is None else env
    actor = agent if actor is None else actor

    # Terminate on the environment's own goal rather than a hard-coded cell.
    goal = (env.goal[0], env.goal[1])

    s = env.start
    a = actor.sample_action(s)
    print(a)
    a = actor.actions.index(a)
    print(a)

    steps = 0
    while (s[0], s[1]) != goal:
        if max_steps is not None and steps >= max_steps:
            break

        r = env.reward(s, a)
        s1 = env.nextState(s, a)
        print(s1)

        a1 = actor.actions.index(actor.sample_action(s1))

        # Q(s, a) is evaluated once, before either update touches the
        # critic weights; both the TD error and the actor step use it.
        q_sa = env.qvalue(s, a)
        td_error = r + (actor.gamma * env.qvalue(s1, a1)) - q_sa

        actor.parameter += actor.alpha * actor.score_fn(s, a)[0] * q_sa[0]
        env.weights += env.beta * td_error * env.feature(s, a)[0]

        a = a1
        s = s1
        steps += 1
        

In [11]:
# Run one episode; prints the sampled first action (name, then index)
# followed by each successor state visited.
QAC()

down
1
[1, 0]
[2, 0]
[3, 0]
[4, 0]
[4, 1]
[4, 2]
[4, 3]
[4, 4]


In [14]:
# Print the greedy action index for every grid cell.
# NOTE(review): this cell (In [14]) sits above the cell that defines
# final_policy (In [13]); on a fresh top-to-bottom run the name would not
# exist yet -- move the definition above this call.
final_policy()

[[1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1.]
 [3. 3. 3. 3. 1.]]


In [13]:
def final_policy():
    """Print a grid holding, for each cell, the index of the most probable action.

    Reads the notebook-level ``environment`` for the grid dimensions and the
    notebook-level ``agent`` for the policy.  The result is a float array of
    shape ``(rows, cols)`` whose entries index into ``agent.actions``.
    """
    greedy = np.zeros((environment.rows, environment.cols))

    # Visit every (row, col) pair in row-major order.
    for row, col in np.ndindex(greedy.shape):
        action_probs = agent.policy_model([row, col])
        greedy[row, col] = np.argmax(action_probs)

    print(greedy)
            

In [18]:
# Inspect the unnormalised policy weights exp(feature . parameter) at state
# [4, 0], one entry per action index, and show which action has the largest.
probs = np.empty(len(agent.actions))

for i in range(len(agent.actions)):
    preference = agent.feature([4, 0], i).dot(agent.parameter.T)
    probs[i] = np.exp(preference)[0]

print(probs, np.argmax(probs))

[3.52800876e+11 2.49290774e+15 2.49290774e+15 4.38479211e+17
 2.49290774e+15] 3
