In [50]:
import numpy as np
import random
import copy

In [7]:
# Action encoding used by myGridWorld.move / step:
# 0 = left
# 1 = down
# 2 = right
# 3 = up

# In PositionGrid the value 1.1 marks the agent's current cell; every other cell is 0.

In [8]:


class myGridWorld:
    """Square grid world: the agent starts bottom-left and must reach the
    rewarding goal cell in the top-right corner.

    States are numbered row-major (``state = row*size + column``). Actions are
    0 = left, 1 = down, 2 = right, 3 = up. When noise is enabled, each call to
    ``move``/``step`` replaces the requested action with a uniformly random
    one whenever ``random.random() <= noisyMoveChance``.
    """

    # Class-level defaults. Every instance rebinds all of these in __init__,
    # so the arrays below are never shared between live environments.
    size=5
    RewardGrid=np.zeros([5,5])
    RewardGrid[0][4]=1
    PositionGrid=np.zeros([5,5])
    PositionGrid[4][0]=1.1
    action_space=4
    noisyMoveChance=0.3
    currI=4
    currJ=0
    DoneStatus=False
    EnableNoise=True
    observation_spaces=size*size

    # (row offset, column offset) applied by each action index.
    _MOVES={0:(0,-1),1:(1,0),2:(0,1),3:(-1,0)}

    def __init__(self,size=5,noisyMoveChance=0.3,EnableNoise=True):
        """Create a ``size`` x ``size`` world.

        size: grid side length; values that are not positive after ``int()``
            fall back to the default 5.
        noisyMoveChance: chance of a random move; only values strictly
            between 0 and 1 are accepted, anything else falls back to 0.3.
        EnableNoise: whether random moves are applied at all.
        """
        self._configure(size,noisyMoveChance,EnableNoise)

    def _configure(self,size,noisyMoveChance,EnableNoise):
        """Apply defaults, then the validated settings (shared by __init__ and reset)."""
        self.basicReset()
        self.EnableNoise=EnableNoise
        # Convert before use: np.zeros and indexing both reject floats such as 4.0.
        size=int(size)
        if(0<size):
            self._buildGrid(size)
        if(0<noisyMoveChance and noisyMoveChance<1):
            self.noisyMoveChance=noisyMoveChance

    def _buildGrid(self,size):
        """Allocate fresh reward/position grids and put the agent on the start cell."""
        self.size=size
        self.RewardGrid=np.zeros([size,size])
        self.RewardGrid[0][size-1]=1          # goal: top-right corner
        self.PositionGrid=np.zeros([size,size])
        self.PositionGrid[size-1][0]=1.1      # start: bottom-left corner
        self.currI=size-1
        self.currJ=0
        self.observation_spaces=size*size

    def basicReset(self):
        """Restore the default noisy 5x5 world."""
        self._buildGrid(5)
        self.action_space=4
        self.noisyMoveChance=0.3
        self.DoneStatus=False
        self.EnableNoise=True

    def reset(self,size=None,noisyMoveChance=None,EnableNoise=None):
        """Start a new episode and return the start state index.

        Arguments left as ``None`` keep the environment's current setting, so
        a plain ``reset()`` no longer silently turns a custom world back into
        the default noisy 5x5 one. Passing explicit values reconfigures the
        world exactly as the constructor would.
        """
        if size is None:
            size=self.size
        if noisyMoveChance is None:
            noisyMoveChance=self.noisyMoveChance
        if EnableNoise is None:
            EnableNoise=self.EnableNoise
        self._configure(size,noisyMoveChance,EnableNoise)
        return self.currI*self.size+self.currJ

    def _printGrid(self,grid):
        """Print ``grid`` one row per line, each value followed by a space."""
        for row in grid:
            for value in row:
                print(value,end=' ')
            print()

    def printRewardGrid(self):
        """Print the reward of every cell."""
        self._printGrid(self.RewardGrid)

    def printPositionGrid(self):
        """Print the position grid (1.1 marks the agent)."""
        self._printGrid(self.PositionGrid)

    def getPositionGrid(self):
        """Return the position grid array (the live object, not a copy)."""
        return self.PositionGrid

    def render(self):
        """Show the agent's position by printing the position grid."""
        self.printPositionGrid()

    def getAvailableMoves(self):
        """Return the number of actions."""
        return self.action_space

    def getSize(self):
        """Return the grid side length."""
        return self.size

    def move(self,action):
        """Apply ``action`` (possibly replaced by a random one) and report the outcome.

        Returns ``(row, column, state, reward, done)`` where ``reward`` is the
        reward-grid value of the cell the agent is on after the move.
        """
        randNum=random.random()
        if(self.EnableNoise and randNum<=self.noisyMoveChance):
            self.makeNoisyMove(action)
        else:
            self.makeProperMove(action)
        state=self.currI*self.size+self.currJ
        return self.currI,self.currJ,state,self.RewardGrid[self.currI][self.currJ],self.DoneStatus

    def makeNoisyMove(self,action):
        """Ignore the requested ``action`` and take a uniformly random one instead."""
        self.makeProperMove(random.randint(0,self.action_space-1))

    def makeProperMove(self,action):
        """Move one cell in the direction of ``action``.

        Moves that would leave the grid, and unknown action values, leave the
        agent where it is. ``DoneStatus`` becomes True once the agent stands
        on the top-right goal cell.
        """
        delta=self._MOVES.get(action)
        if delta is not None:
            newI=self.currI+delta[0]
            newJ=self.currJ+delta[1]
            if(0<=newI<self.size and 0<=newJ<self.size):
                self.PositionGrid[self.currI][self.currJ]=0
                self.currI,self.currJ=newI,newJ
                self.PositionGrid[self.currI][self.currJ]=1.1

        if(self.currI==0 and self.currJ==self.size-1):
            self.DoneStatus=True

    def step(self,action):
        """Alias of ``move``."""
        return self.move(action)

In [43]:
class myGridWorldTrainer:
    """Tabular Q-learning on a grid-world environment, plus helpers that turn
    the learned Q-table into a greedy policy and sample trajectories from it.

    The environment must provide ``reset()`` returning a state index,
    ``step(action)`` returning ``(row, column, state, reward, done)``, and the
    attributes ``observation_spaces``, ``action_space`` and ``size``.
    """

    # Results of the most recent calls; each method rebinds (never mutates) these.
    env=[]
    Q=[]
    matrix=[]
    Trajectories=[]
    DirectionalMatrix=[]

    # Arrow glyph shown for each action index (0=left, 1=down, 2=right, 3=up).
    ARROWS={0:'\u2190',1:'\u2193',2:'\u2192',3:'\u2191'}

    def trainModel(self,model,alpha=0.6,gamma=0.9,numEpisodes=10000,exploreEpisodes=500):
        """Learn a Q-table for ``model`` and return it (also stored in ``self.Q``).

        model: the environment to train on; it is also stored as ``self.env``.
        alpha: learning rate of the Q-learning update.
        gamma: discount factor.
        numEpisodes: number of episodes to run.
        exploreEpisodes: episodes numbered below this value pick actions
            uniformly at random; later episodes act greedily on the Q-table.
        """
        # Train on the environment that was passed in rather than on whatever
        # self.env happens to hold, so this method also works without allInOne.
        self.env=model
        env=model
        # Reset once before sizing the table so that observation_spaces
        # describes the grid the episodes will actually run on.
        env.reset()
        Q = np.zeros([env.observation_spaces, env.action_space])
        for episode in range(1,numEpisodes+1):
            done = False
            state = env.reset()
            while not done:
                if(episode<exploreEpisodes):
                    action = random.randrange(env.action_space)
                else:
                    action=np.argmax(Q[state])
                i,j,state2, reward, done = env.step(action)
                Q[state,action] += alpha * (reward + gamma* np.max(Q[state2]) - Q[state,action])
                state = state2
        self.Q=Q
        return Q

    def getDirections(self,Q):
        """Return the greedy action of every cell as a ``size`` x ``size`` array.

        The grid side length is derived from the number of rows of ``Q``, which
        must therefore be a perfect square. The result is stored in
        ``self.matrix``; the same policy rendered as arrow glyphs is stored in
        ``self.DirectionalMatrix`` ('?' for an action index without a glyph).

        Raises ValueError if ``Q`` is not a 2-D table with a square row count.
        """
        Q=np.asarray(Q)
        if Q.ndim!=2:
            raise ValueError("Q must be a 2-D table of shape (states, actions)")
        numStates=Q.shape[0]
        size=int(round(numStates**0.5))
        if size*size!=numStates:
            raise ValueError("Q must have a square number of rows, got %d" % numStates)
        matrix=np.argmax(Q,axis=1).reshape(size,size)
        DirectionalMatrix=[[self.ARROWS.get(int(a),'?') for a in row] for row in matrix]
        self.DirectionalMatrix=DirectionalMatrix
        self.matrix=matrix
        return matrix

    def getTrajectories(self,matrix,numTrajectories,maxSteps=None):
        """Follow the policy ``matrix`` on ``self.env`` and record the visited states.

        matrix: action to take in each cell, indexed ``matrix[row][column]``.
        numTrajectories: number of episodes to sample.
        maxSteps: optional cap on the steps per episode; ``None`` (the default)
            runs until the environment reports done. Useful when the policy
            could walk into a wall forever on a noise-free environment.

        Returns a list of state-index lists, each starting with the reset
        state; the list is also stored in ``self.Trajectories``.
        """
        Trajectories=[]

        for _ in range(numTrajectories):
            done=False
            state = self.env.reset()
            path=[state]
            i,j=divmod(state,self.env.size)
            steps=0
            while not done and (maxSteps is None or steps<maxSteps):
                i,j,state, reward, done = self.env.step(matrix[i][j])
                path.append(state)
                steps+=1

            Trajectories.append(path)
        self.Trajectories=Trajectories
        return Trajectories

    def allInOne(self,model,numTrajectories):
        """Train on ``model``, derive the greedy policy and return ``numTrajectories`` sampled paths."""
        self.env=model
        Q=self.trainModel(model)
        matrix=self.getDirections(Q)
        return self.getTrajectories(matrix,numTrajectories)

In [48]:
# Seed the RNG used by the noisy moves and the random exploration phase so the
# learned policy is the same on every Restart & Run All.
random.seed(0)

sampleGrid=myGridWorld()
sampleGridTrainer=myGridWorldTrainer()
sampleTrajectories=sampleGridTrainer.allInOne(sampleGrid,20)

# Greedy action index per cell (0=left, 1=down, 2=right, 3=up).
for actionRow in sampleGridTrainer.matrix:
    print(actionRow)

# The same policy rendered as arrows.
for arrowRow in sampleGridTrainer.DirectionalMatrix:
    print(arrowRow)

# Learned Q-values: one row per state, one column per action.
for stateValues in sampleGridTrainer.Q:
    print(stateValues)

[2 2 2 2 0]
[3 2 3 3 3]
[2 2 2 2 3]
[2 2 2 2 3]
[3 2 3 2 3]
['→', '→', '→', '→', '←']
['↑', '→', '↑', '↑', '↑']
['→', '→', '→', '→', '↑']
['→', '→', '→', '→', '↑']
['↑', '→', '↑', '→', '↑']
[0.38274524 0.36598019 0.65387898 0.37655196]
[0.41567039 0.42248597 0.75874135 0.40843376]
[0.50268074 0.46382517 0.88340777 0.49198721]
[0.62071236 0.61763074 0.99992352 0.62112517]
[0. 0. 0. 0.]
[0.32287256 0.34391871 0.34516795 0.57580205]
[0.39497047 0.39441305 0.60157139 0.39350693]
[0.42869905 0.41287688 0.42362262 0.61699914]
[0.50701622 0.52028551 0.48270782 0.82158807]
[0.50151707 0.47680804 0.51743994 0.99709559]
[0.2909064  0.28469709 0.46487153 0.29473347]
[0.32388907 0.32521332 0.56538986 0.31626697]
[0.35804135 0.37699113 0.65008763 0.38301493]
[0.42554607 0.42233695 0.65248137 0.43228027]
[0.4675377  0.46549837 0.46176162 0.87143596]
[0.24912659 0.25328019 0.42151022 0.26181388]
[0.28846369 0.2941784  0.49150444 0.29665419]
[0.30911394 0.30383354 0.54969247 0.31833513]
[0.38081443 0.

In [None]:
#=========================================================================================================================

#Testing data below

In [181]:
# for i in sampleGridTrainer.DirectionalMatrix:
#     print (i)

['→', '→', '→', '→', '←']
['↑', '→', '→', '↑', '↑']
['→', '→', '→', '↑', '↑']
['↑', '↑', '↑', '↑', '↑']
['→', '→', '→', '→', '↑']


In [153]:
#print(matrix)

[[2 2 2 2 0]
 [2 3 2 2 3]
 [3 3 3 2 3]
 [2 3 3 2 3]
 [3 2 2 2 3]]


In [154]:
# for row in DirectionalMatrix:
#     print(row)

['→', '→', '→', '→', '←']
['→', '↑', '→', '→', '↑']
['↑', '↑', '↑', '→', '↑']
['→', '↑', '↑', '→', '↑']
['↑', '→', '→', '→', '↑']


In [156]:
# import pickle
# mydata = [Q,matrix,DirectionalMatrix]
# outputFile = 'model.data'
# fw = open(outputFile, 'wb')
# pickle.dump(mydata, fw)
# fw.close()

In [159]:
# import pickle
# inputFile = 'model.data'
# fd = open(inputFile, 'rb')
# dataset = pickle.load(fd)
# print (dataset)

[array([[0.43004339, 0.45212881, 0.61558091, 0.45310747],
       [0.46965204, 0.47690185, 0.76270971, 0.48129609],
       [0.52201239, 0.53598079, 0.88957005, 0.51801781],
       [0.64767243, 0.62176735, 0.99984226, 0.62657434],
       [0.        , 0.        , 0.        , 0.        ],
       [0.41831039, 0.40522724, 0.45084596, 0.42004852],
       [0.46369228, 0.43552376, 0.45160894, 0.63119409],
       [0.48420771, 0.48433573, 0.77782329, 0.46722925],
       [0.50262931, 0.49599035, 0.89440985, 0.51293198],
       [0.52018924, 0.52150701, 0.53149694, 0.99996926],
       [0.36048189, 0.37866988, 0.34214642, 0.43221122],
       [0.3736657 , 0.35860765, 0.37346519, 0.5332261 ],
       [0.40898293, 0.39336741, 0.39802421, 0.64443918],
       [0.4060106 , 0.39305184, 0.52710811, 0.41092932],
       [0.4550372 , 0.43258876, 0.45505562, 0.86363268],
       [0.29023405, 0.28830074, 0.3807284 , 0.30712898],
       [0.315256  , 0.3364679 , 0.33275335, 0.44741976],
       [0.37943963, 0.35920061

In [None]:
# \u2190 ←
# \u2191 ↑
# \u2192 →
# \u2193 ↓

In [None]:
#0 is left
#1 is down
#2 is right
#3 is Up

In [169]:
# for i in Trajectories:
#     print(i)

[20, 20, 15, 20, 15, 16, 11, 6, 11, 6, 1, 2, 3, 4]
[20, 20, 15, 16, 11, 6, 1, 2, 3, 4]
[20, 15, 10, 5, 6, 1, 2, 3, 4]
[20, 15, 16, 11, 16, 11, 6, 1, 2, 3, 4]
[20, 15, 20, 15, 16, 11, 6, 1, 2, 3, 4]
[20, 20, 15, 16, 21, 22, 23, 24, 19, 14, 9, 9, 8, 9, 4]
[20, 15, 16, 11, 6, 1, 2, 3, 4]
[20, 15, 20, 15, 10, 5, 6, 5, 10, 15, 16, 11, 6, 11, 6, 1, 2, 3, 4]
[20, 15, 15, 16, 11, 6, 1, 2, 3, 4]
[20, 15, 16, 11, 6, 1, 2, 3, 4]
