In [17]:
import gym
from gym import spaces
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
plt.rcParams["figure.figsize"] = [6,6]

In [43]:
class GridEnvironment(gym.Env):
    
    def __init__(self):
        self.observation_space = spaces.Discrete(16)
        self.action_space = spaces.Discrete(4)
        self.max_timesteps = 10
        self.timestep = 0
        self.done = False
        
        self.pos = {'knight':[3,0],
                   'mon1':[1,2],
                   'mon2':[2,1],
                   'chest':[0,3],
                   'coin1':[3,1],
                   'coin2':[1,3]}
        
        self.mon1_death = False
        self.mon2_death = False
        self.coin1_collected = False
        self.coin2_collected = False
        
        # loading assets for rendering
        
        chars = mpimg.imread('Dungeon_Character.png')
        tiles = mpimg.imread('Dungeon_Tileset.png')
        assets = {}
        assets['knight'] = chars[:16,16*5:16*6]
        assets['mon1'] = chars[16:,16*6:]
        assets['mon2'] = chars[16:,16*5:16*6]
        assets['chest'] = tiles[16*8:16*9,16*4:16*5]
        assets['bones'] = tiles[16*7:16*8,16*7:16*8]
        torch = tiles[16*9:,:16]
        assets['coin1'] = tiles[16*8:16*9,16*6:16*7]
        assets['coin2'] = assets['coin1']
        stage = np.concatenate((tiles[:16*3,:16*6],
                                np.flip(tiles[16*2:16*3,:16*6],axis=1),
                                tiles[16*3:16*5,:16*6]),axis=0)
        stage[:16,16:16*2] = np.where(torch != 0,torch,stage[:16,16:16*2])
        stage[:16,16*4:16*5] = np.where(torch != 0,torch,stage[:16,16*4:16*5])
        self.stage = stage
        self.assets = assets
        self.archives = []
        
    def archive(self):
        instnace = [(self.pos['knight'])]
        self.archives.append(tuple(instance))

    def reset(self):
        self.timestep = 0
        self.pos = {'knight':[3,0],
                   'mon1':[1,2],
                   'mon2':[2,1],
                   'chest':[0,3],
                   'coin1':[3,1],
                   'coin2':[1,3]}
        self.archive = []
        self.done = False
        
        self.mon1_death = False
        self.mon2_death = False
        self.coin1_collected = False
        self.coin2_collected = False
        
        return self.pos['knight'][0]*4 + self.pos['knight'][1]
    
    def step(self, action, stochastic = False):
        '''
        0 - up
        1 - down
        2 - right
        3 - left
        '''
        if stochastic == True:
            action_prop = 0.6
            props = [0.1,0.1,0.1,0.1]
            props[action] += 0.6
            action = np.random.choice(4, 1, p=props)[0]
            
        if action == 0:
            self.pos['knight'][0] += 1
        elif action == 1:
            self.pos['knight'][0] -= 1
        elif action == 2:
            self.pos['knight'][1] += 1
        elif action == 3:
            self.pos['knight'][1] -= 1

        self.pos['knight'] = list(np.clip(self.pos['knight'], 0, 3))
        
        reward = 0
        if self.pos['knight'] == self.pos['chest']:
            reward = 10
            self.done = True
            
        elif self.pos['knight'] == self.pos['mon1']:
            if self.mon1_death:
                pass
            else:
                reward = -10
                self.mon1_death = True
                
        elif self.pos['knight'] == self.pos['mon2']:
            if self.mon2_death:
                pass
            else:
                reward = -10
                self.mon2_death = True
        elif self.pos['knight'] == self.pos['coin1']:
            if self.coin1_collected:
                pass
            else:
                reward = 2
                self.coin1_collected = True
        elif self.pos['knight'] == self.pos['coin2']:
            if self.coin2_collected:
                pass
            else:
                reward = 2
                self.coin2_collected = True
        
        self.timestep += 1
        if self.timestep == self.max_timesteps:
            self.done = True
            
        self.archive()
        
        return self.pos['knight'][0]*4 + self.pos['knight'][1], reward
    
    def project(self,pos):
        instance = np.copy(self.stage)
        for item in ['mon1','mon2','chest','coin1','coin2','knight']:
            item_asset = item
            if item == 'mon1' and self.mon1_death:
                item_asset = 'bones'
            elif item == 'mon2' and self.mon2_death:
                item_asset = 'bones'
            elif item == 'coin1' and self.coin1_collected:
                continue
            elif item == 'coin2' and self.coin2_collected:
                continue
            x = np.where(self.assets[item_asset] != 0, self.assets[item_asset], 
                       instance[16*(pos[item][0]+1):16*(pos[item][0]+2),
                                16*(pos[item][1]+1):16*(pos[item][1]+2)])
            instance[16*(pos[item][0]+1):16*(pos[item][0]+2),
                     16*(pos[item][1]+1):16*(pos[item][1]+2)] = x
        return instance
    
    def render(self):
        for instance in self.archives:
            instance = self.project(instance)
            plt.figure()
            plt.axis('off')
            plt.imshow(instance,interpolation='none',aspect='equal')
        

In [44]:
class RandomAgent:
    def __init__(self, env):
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space

    def step(self, pos):
        return np.random.choice(self.action_space.n)

class Qlearning:
    def __init__(self,env):
        self.env = env
        self.observation_space = env.observation_space.n
        self.action_space = env.action_space.n
        self.table = np.zeros((self.observation_space,self.action_space))
        
        self.num_epochs = 10000
        self.max_steps = 10
        self.learning_rate = 0.1
        self.discount_rate = 0.99
        
        self.exploration_rate = 1
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.01
        self.exploration_decay_rate = 0.001
        
    def step(self,state):
        if np.random.choice(2,1,p=[self.exploration_rate,1-self.exploration_rate])[0]:
            action = np.argmax(self.table[state,:])
        else:
            action = self.env.action_space.sample()
        
        return action

In [45]:
env = GridEnvironment()
agent = RandomAgent(env)

pos = env.reset()

while not env.done:
    action = agent.step(pos)
    obs, reward = env.step(action,stochastic=False)
    print(env.archives)

TypeError: 'list' object is not callable

In [21]:
env.archive


[[4, 1],
 [2, 1],
 [1, 1],
 [1, 0],
 [1, 1],
 [0, 1],
 [1, 1],
 [0, 1],
 [0, 2],
 [0, 2]]

In [181]:
env = GridEnvironment()
agent = Qlearning(env)

rewards = []
for epoch in range(agent.num_epochs):
    state = env.reset()
    total_reward = 0

    while not env.done:
        action = agent.step(state)
        new_state, reward = env.step(action)
        agent.table[state,action] = agent.table[state,action] * (1 - agent.learning_rate) + agent.learning_rate * (reward + agent.discount_rate * np.max(agent.table[new_state,:]))
        state = new_state
        total_reward += reward
    rewards.append(total_reward)
    agent.exploration_rate = agent.min_exploration_rate + (agent.max_exploration_rate - agent.min_exploration_rate) * np.exp(-agent.exploration_decay_rate*epoch)

In [182]:
rewards_average = np.split(np.array(rewards),agent.num_epochs/1000)

In [183]:
count = 1000
for r in rewards_average:
    print(count, ": " , str(sum(r/1000)))
    count += 1000

1000 :  -3.3240000000000673
2000 :  -0.7480000000000006
3000 :  0.9820000000000009
4000 :  1.5960000000000014
5000 :  1.6060000000000012
6000 :  1.8640000000000014
7000 :  1.8660000000000014
8000 :  1.8700000000000014
9000 :  1.9280000000000015
10000 :  1.8600000000000017


In [184]:
agent.table

array([[22.07935514,  3.49331469,  1.03415737,  2.36444952],
       [ 1.34483048,  0.25404876,  4.68753699,  0.25304081],
       [-2.1868512 ,  1.23579928,  6.86189404,  0.29817734],
       [ 0.        ,  0.        ,  0.        ,  0.        ],
       [20.08567741, 16.60187318, 20.09008949, 20.10274538],
       [19.06507249,  2.17908886, -3.81280307, 19.99360818],
       [ 6.82745377,  1.88534997,  8.55765023,  7.65527854],
       [ 1.86805141,  8.33228183,  1.53842236,  3.04234139],
       [20.11313845, 20.10326646, 17.12346994, 20.10208826],
       [20.70009692, 20.70899773, 20.76923978, 20.78664823],
       [19.9248404 , -3.79446896, 18.98068661, 13.96379678],
       [ 8.80743951,  1.46397859,  2.94051492, 23.07369438],
       [20.13164214, 19.96136615, 20.23525258, 20.10598943],
       [19.91786677, 10.79083283, 19.80951589, 20.10473615],
       [19.96388338, 19.95018932, 19.95722711, 19.96920141],
       [16.40134336, 14.88552372, 19.1947246 , 19.97760594]])

In [185]:
rewards_average[9]

array([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, -8,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2, -8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2, -8,  2,  2,  2, -8,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2, -8,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2

In [14]:
a = tuple([2,1])

In [15]:
a

(2, 1)