In [2]:
#Custom Environment
import numpy as np
from PIL import Image
import cv2
import matplotlib.pyplot as plt
from matplotlib import style
import time
import numpy as np
import random


style.use("ggplot")


class Blob():
    def __init__(self, SIZE = 10):
        self.size = SIZE
        self.x = np.random.randint(0, SIZE)
        self.y = np.random.randint(0, SIZE)

    def __str__(self):
        return f"{self.x}, {self.y}"

    def __sub__(self, other):
        return (self.x-other.x, self.y-other.y)

    def act(self, choice, diagonal = False):
        '''
        Gives us 4 total movement options. (0,1,2,3)
        '''
        if diagonal:

            if choice == 0:
                self.move(x=1, y=1)
            elif choice == 1:
                self.move(x=-1, y=-1)
            elif choice == 2:
                self.move(x=-1, y=1)
            elif choice == 3:
                self.move(x=1, y=-1)

        else:
            if choice == 0:
                self.move(x=0, y=1)
            elif choice == 1:
                self.move(x=0, y=-1)
            elif choice == 2:
                self.move(x=-1, y=0)
            elif choice == 3:
                self.move(x=1, y=0)


    def move(self, x=-100, y=-100):

        if x == -100:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        if y == -100:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        if self.x < 0:
            self.x = 0
        elif self.x > self.size-1:
            self.x = self.size-1
        if self.y < 0:
            self.y = 0
        elif self.y > self.size-1:
            self.y = self.size-1


class ENVIRONMENT():



    def __init__(self, num_player=1, num_enemy=1, num_food=1, size = 10, diagonal = False):
        self.size = size
        self.naction = 4
        self.diagonal = diagonal
        self.num_enemy = num_enemy
        self.num_food = num_food
        self.player = Blob(size)
        self.enemy = [Blob() for _ in range(self.num_enemy)]
        self.food = [Blob() for _ in range(self.num_food)]
        self.reward = 0
        self.colors = {1: (255, 0, 0),
         2: (0, 255, 0),
         3: (0, 0, 255)}
        self.px,self.py = self.player.x,self.player.y
        self.ex,self.ey = [self.enemy[iter].x for iter in range(self.num_enemy)], [self.enemy[iter].y for iter in range(self.num_enemy)]
        self.fx,self.fy = [self.food[iter].x for iter in range(self.num_food)], [self.food[iter].y for iter in range(self.num_food)]


    def startover(self, newpos=False):

        self.player.x, self.player.y = self.px, self.py
        for iter in range(self.num_enemy):
            self.enemy[iter].x, self.enemy[iter].y = self.ex[iter], self.ey[iter]
        for iter in range(self.num_food):
            self.food[iter].x, self.food[iter].y = self.fx[iter], self.fy[iter]
        if newpos == True:
            self.player = Blob(self.size)
        self.reward = 0

        return (self.player.x, self.player.y), self.reward, False

    def step(self, action):

        self.player.act(action, self.diagonal)
        self.reward = self.calculate_reward()
        return (self.player.x, self.player.y), self.reward

    def calculate_reward(self):

        if self.player.x in [self.enemy[iter].x for iter in range(self.num_enemy)] and self.player.y in [self.enemy[iter].y for iter in range(self.num_enemy)]:
            return -100, True

        if self.player.x in [self.food[iter].x for iter in range(self.num_food)] and self.player.y in [self.food[iter].y for iter in range(self.num_food)]:
            return 100, True

        else:
            return -1, False


    def render(self,renderTime=100):

        env = np.zeros((self.size, self.size, 3), dtype=np.uint8)
        for iter in range(self.num_food):
            env[self.food[iter].x][self.food[iter].y] = self.colors[2]
        for iter in range(self.num_enemy):
            env[self.enemy[iter].x][self.enemy[iter].y] = self.colors[3]
        env[self.player.x][self.player.y] = self.colors[1]
        img = Image.fromarray(env, 'RGB')
        img = img.resize((300, 300))
        cv2.imshow("image", np.array(img))
        cv2.waitKey(renderTime)
        # cv2.destroyAllWindows()

    def sample_action(self):
        return np.random.randint(0, self.naction)

In [3]:
env = ENVIRONMENT(diagonal=True, size=10, num_enemy = 3, num_food = 1)
episodes = 100000
nS = 100
nA = 4
learning_rate = 0.01
gamma = 0.9
epsilon = 0.95
"""
Actions 
diagonal = True
0 = down_right
1 = up_left
2 = up_right
3 = down_left
When space is not available action = action.split('_')[0]

Environment
player = Blue
enemy = red
goal = green

If a player is on 
an enemy reward at that time step = -100
the goal reward at that time step = 100
for every other time step reward is = -1
"""

"\nActions \ndiagonal = True\n0 = down_right\n1 = up_left\n2 = up_right\n3 = down_left\nWhen space is not available action = action.split('_')[0]\n\nEnvironment\nplayer = Blue\nenemy = red\ngoal = green\n\nIf a player is on \nan enemy reward at that time step = -100\nthe goal reward at that time step = 100\nfor every other time step reward is = -1\n"

In [4]:
def E_policy(q,s,epsilon):
    r = random.random()
    if r<epsilon:
        a = env.sample_action()
    else:
        a = np.argmax(q[s])
    return a

In [5]:
Q = np.zeros([nS,nA])
for e in range(episodes):
    pos_i,r0,term = env.startover(newpos=True)
    s_i = pos_i[0]*10 + pos_i[1]
    a_i = E_policy(Q,s_i,epsilon)
    while term == False:
        pos_f,(rf,term) = env.step(a_i)
        s_f = pos_f[0]*10 + pos_f[1]
        a_f = E_policy(Q,s_f,epsilon)
        Q[s_i,a_i] = (1-learning_rate)*Q[s_i,a_i] + learning_rate*(rf + gamma*Q[s_f,a_f])
        a_i = a_f
        s_i = s_f                                                           
    if (e+1)%500 == 0:
        print('current episode = ',e)
    epsilon = epsilon*0.9998 

current episode =  499
current episode =  999
current episode =  1499
current episode =  1999
current episode =  2499
current episode =  2999
current episode =  3499
current episode =  3999
current episode =  4499
current episode =  4999
current episode =  5499
current episode =  5999
current episode =  6499
current episode =  6999
current episode =  7499
current episode =  7999
current episode =  8499
current episode =  8999
current episode =  9499
current episode =  9999
current episode =  10499
current episode =  10999
current episode =  11499
current episode =  11999
current episode =  12499
current episode =  12999
current episode =  13499
current episode =  13999
current episode =  14499
current episode =  14999
current episode =  15499
current episode =  15999
current episode =  16499
current episode =  16999
current episode =  17499
current episode =  17999
current episode =  18499
current episode =  18999
current episode =  19499
current episode =  19999
current episode =  204

In [6]:
pol = np.zeros(nS)
for s in range(nS):
    pol[s] = np.argmax(Q[s])

In [7]:
def play():
    pos_i,k,ter = env.startover(newpos=True)
    env.render()
    T = False
    i = 0
    while T == False and i<=20:   
        s = pos_i[0]*10 + pos_i[1]
        print(pos_i)
        pos_i,R = env.step(pol[s])
        print(R)
        T=R[1]
        env.render(500)
        i = i+1
    cv2.destroyAllWindows()

In [8]:
cv2.destroyAllWindows()

In [9]:
for i in range(20):
    play()    

(3, 4)
(-1, False)
(4, 3)
(-1, False)
(5, 2)
(100, True)
(7, 3)
(-1, False)
(8, 4)
(-1, False)
(9, 3)
(-1, False)
(9, 2)
(-1, False)
(8, 1)
(-1, False)
(7, 2)
(-1, False)
(6, 1)
(-1, False)
(5, 0)
(100, True)
(1, 8)
(-1, False)
(2, 7)
(-1, False)
(3, 6)
(-1, False)
(4, 5)
(-1, False)
(5, 4)
(-1, False)
(6, 3)
(-1, False)
(5, 2)
(100, True)
(0, 4)
(-1, False)
(1, 3)
(-1, False)
(2, 2)
(-1, False)
(3, 1)
(-1, False)
(2, 0)
(-1, False)
(3, 0)
(100, True)
(0, 6)
(-1, False)
(1, 5)
(-1, False)
(2, 4)
(-1, False)
(1, 3)
(-1, False)
(2, 2)
(-1, False)
(3, 1)
(-1, False)
(2, 0)
(-1, False)
(3, 0)
(100, True)
(7, 1)
(-1, False)
(6, 0)
(-1, False)
(5, 0)
(100, True)
(7, 7)
(-1, False)
(8, 6)
(-1, False)
(9, 5)
(-1, False)


KeyboardInterrupt: 

In [18]:
Q

array([[-1.93410731, -1.905191  , -1.9123699 , -1.9       ],
       [-2.71      , -2.71      , -2.79956502, -2.71      ],
       [-1.9       , -2.03930847, -1.91726303, -1.9       ],
       [-1.        , -1.10148343, -1.09113265, -1.        ],
       [-0.1       , -0.1       , -0.19      ,  0.        ],
       [-1.03483322, -1.        , -1.09113265, -1.        ],
       [-1.92910261, -1.9       , -1.91108803, -1.9       ],
       [-2.80015521, -2.71      , -2.79905415, -2.71      ],
       [-3.55333442, -3.439     , -3.46513682, -3.439     ],
       [-4.17068954, -4.0951    , -4.18516687, -4.0951    ],
       [-1.13193416, -1.0901478 , -1.03322305, -1.        ],
       [-1.9       , -1.9       , -1.91528209, -1.9       ],
       [-1.        , -1.00482109, -1.12281908, -1.00371429],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [-1.        , -1.        , -1.        , -1.        ],
       [-0.1       ,  0.        ,  0.        ,  0.        ],
       [-1.12691526, -1.