# 1.1 Deterministic Environment
---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import spaces

In [None]:
from google.colab import widgets
import time

# Smoke test for the Colab widget grid: once per second, redraw a 4x4 image
# with a single randomly chosen cell switched on.
output_grid = widgets.Grid(1, 1)
for _ in range(5):
    # Row is drawn before column, matching the index order used below.
    row = np.random.randint(4)
    col = np.random.randint(4)
    grid = np.zeros((4, 4))
    grid[row, col] = 1
    with output_grid.output_to(0, 0):
        # Wipe the previous frame so the images do not stack up.
        output_grid.clear_cell()
        plt.imshow(grid)
    time.sleep(1)

In [None]:
class GridEnvironment(gym.Env):
    """Deterministic 4x4 grid world.

    The agent starts at (0, 0). Positions are ``[row, col]`` pairs.

    Actions: 0 = down (row + 1), 1 = up (row - 1), 2 = right (col + 1),
    3 = left (col - 1). Moves that would leave the grid are clipped to the
    border.

    Rewards, granted for the cell the agent occupies after a move:
    (0, 3) and (3, 0) give -0.5, (1, 2) gives +0.5, the goal (3, 3) gives
    +1.0, every other cell gives 0.

    An episode ends when the agent reaches the goal or after
    ``max_timesteps`` steps.
    """

    metadata = { 'render.modes': [] }

    # (row delta, col delta) for each action id.
    _MOVES = {
        0: (1, 0),    # down
        1: (-1, 0),   # up
        2: (0, 1),    # right
        3: (0, -1),   # left
    }

    def __init__(self):
        self.observation_space = spaces.Discrete(16)
        self.action_space = spaces.Discrete(4)
        self.max_timesteps = 15
        # Episode log owned by this instance: timestep -> (position, reward).
        # Previously step() wrote to a module-level ``info`` that had to be
        # created by the caller.
        self.info = {}

    def reset(self):
        """Start a new episode and return the flattened 16-element grid."""
        self.timestep = 0
        self.info = {}
        self.agent_pos = [0, 0]
        self.pos_1 = [0, 3]
        self.pos_2 = [3, 0]
        self.pos_3 = [1, 2]
        self.goal_pos = [3, 3]
        self.state = self._build_state()
        observation = self.state.flatten()
        return observation

    def _build_state(self):
        """Return a fresh 4x4 grid encoding the agent and the special cells."""
        state = np.zeros((4, 4))
        state[tuple(self.agent_pos)] = 1
        # Special cells are painted after the agent, so the agent's marker is
        # overwritten while it stands on one of them.
        state[tuple(self.pos_1)] = 0.2
        state[tuple(self.pos_2)] = 0.2
        state[tuple(self.pos_3)] = 0.5
        state[tuple(self.goal_pos)] = 0.7
        return state

    def _reward_at_agent(self):
        """Return the reward attached to the agent's current cell."""
        reward = 0
        if (self.agent_pos == self.pos_1).all():
            reward = -0.5
        if (self.agent_pos == self.pos_2).all():
            reward = -0.5
        if (self.agent_pos == self.pos_3).all():
            reward = 0.5
        if (self.agent_pos == self.goal_pos).all():
            reward = 1.0
        return reward

    def step(self, action):
        """Apply ``action`` and advance the episode by one timestep.

        Args:
            action: action id 0-3 (see class docstring). Any other value
                leaves the agent in place but still consumes a timestep.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            flattened grid and ``info`` is this episode's log mapping each
            timestep to ``(agent position, reward)``. The same dict object is
            returned on every step and grows as the episode progresses.
        """
        d_row, d_col = self._MOVES.get(action, (0, 0))
        moved = [self.agent_pos[0] + d_row, self.agent_pos[1] + d_col]
        # Clipping keeps the agent on the grid and yields a new ndarray, so
        # positions already stored in the log are never mutated.
        self.agent_pos = np.clip(moved, 0, 3)

        self.state = self._build_state()
        observation = self.state.flatten()

        reward = self._reward_at_agent()

        self.timestep += 1
        reached_goal = bool((self.agent_pos == self.goal_pos).all())
        done = self.timestep >= self.max_timesteps or reached_goal

        self.info[self.timestep] = (self.agent_pos.copy(), reward)

        return observation, reward, done, self.info

    def render(self):
        """Draw the current grid with matplotlib."""
        plt.imshow(self.state)

In [None]:
class RandomAgent:
    """Agent that moves randomly at first, then heads down/right.

    Policy:
      * before the environment has taken any step: a uniformly random action;
      * afterwards: if the agent is in the last row (row >= 3) move right (2);
        otherwise, if it is in the last column (col >= 3) move down (0);
        otherwise pick right or down with equal probability.
    """

    def __init__(self, env):
        """Remember ``env`` and mirror its observation/action spaces."""
        self.env = env
        self.observation_space = env.observation_space
        self.action_space = env.action_space

    def step(self, observation):
        """Return the next action id.

        The decision is based on the attached environment's ``timestep`` and
        ``agent_pos`` rather than on module-level ``counter``/``info``
        variables, so the agent works without any globals being defined.

        Args:
            observation: latest observation; accepted for interface
                compatibility but not used by this policy.
        """
        position = getattr(self.env, 'agent_pos', None)
        steps_taken = getattr(self.env, 'timestep', 0)
        if position is None or steps_taken == 0:
            # Nothing has happened yet in this episode: act uniformly at random.
            return np.random.choice(self.action_space.n)

        row, col = position[0], position[1]
        if row >= 3:
            return 2  # bottom row reached: go right
        if col >= 3:
            return 0  # right column reached: go down
        # Interior cell: coin flip between right (2) and down (0).
        return 2 if np.random.random() > 0.5 else 0

In [None]:
# Run one episode of the deterministic environment, animating each step in a
# Colab widget cell, then print a per-step summary and the total reward.
env = GridEnvironment()
agent = RandomAgent(env)
info = {}     # episode log: timestep -> (position, reward); rebound by env.step()
counter = 0   # number of steps taken so far

obs = env.reset()
done = False

output_grid = widgets.Grid(1, 1)
with output_grid.output_to(0, 0):
    env.render()


while not done:
    action = agent.step(obs)
    obs, reward, done, info = env.step(action)
    counter += 1
    with output_grid.output_to(0, 0):
        # Replace the previous frame instead of stacking images.
        output_grid.clear_cell()
        env.render()
    time.sleep(1)

temp = 0
for step, (position, step_reward) in info.items():
    # The step key is an int, so it is printed as-is (wrapping it in tuple()
    # raised TypeError); the position array is shown as a tuple instead.
    print(f'In the step {step}, the location is {tuple(position)} and the reward is {step_reward}.')
    temp += step_reward
print(f'The total reward is {temp}.')

# 1.2 Stochastic Environment
---

In [None]:
class GridEnvironment_Stochastic(gym.Env):
    """Stochastic 4x4 grid world.

    Same layout and rewards as the deterministic grid: the agent starts at
    (0, 0); (0, 3) and (3, 0) give -0.5, (1, 2) gives +0.5 and the goal
    (3, 3) gives +1.0. Positions are ``[row, col]`` pairs.

    Actions: 0 = down, 1 = up, 2 = right, 3 = left. The requested move is
    executed with probability 0.85; with probability 0.05 each, one of the
    three other directions is taken instead. Moves that would leave the grid
    are clipped to the border.

    An episode ends when the agent reaches the goal or after
    ``max_timesteps`` steps.
    """

    metadata = { 'render.modes': [] }

    _DOWN, _UP, _RIGHT, _LEFT = (1, 0), (-1, 0), (0, 1), (0, -1)

    # For each action: the (row delta, col delta) applied when the random
    # draw p falls in [0, 0.85), [0.85, 0.9), [0.9, 0.95), [0.95, 1).
    _OUTCOMES = {
        0: (_DOWN, _UP, _RIGHT, _LEFT),
        1: (_UP, _DOWN, _RIGHT, _LEFT),
        2: (_RIGHT, _DOWN, _UP, _LEFT),
        # The second outcome (slip down) used to be a no-op because the code
        # compared with ``==`` instead of incrementing the row.
        3: (_LEFT, _DOWN, _UP, _RIGHT),
    }

    def __init__(self):
        self.observation_space = spaces.Discrete(16)
        self.action_space = spaces.Discrete(4)
        self.max_timesteps = 15
        # Episode log owned by this instance: timestep -> (position, reward).
        # Previously step() wrote to a module-level ``info`` that had to be
        # created by the caller.
        self.info = {}

    def reset(self):
        """Start a new episode and return the flattened 16-element grid."""
        self.timestep = 0
        self.info = {}
        self.agent_pos = [0, 0]
        self.pos_1 = [0, 3]
        self.pos_2 = [3, 0]
        self.pos_3 = [1, 2]
        self.goal_pos = [3, 3]
        self.state = self._build_state()
        observation = self.state.flatten()
        return observation

    def _build_state(self):
        """Return a fresh 4x4 grid encoding the agent and the special cells."""
        state = np.zeros((4, 4))
        state[tuple(self.agent_pos)] = 1
        # Special cells are painted after the agent, so the agent's marker is
        # overwritten while it stands on one of them.
        state[tuple(self.pos_1)] = 0.2
        state[tuple(self.pos_2)] = 0.2
        state[tuple(self.pos_3)] = 0.5
        state[tuple(self.goal_pos)] = 0.7
        return state

    def _reward_at_agent(self):
        """Return the reward attached to the agent's current cell."""
        reward = 0
        if (self.agent_pos == self.pos_1).all():
            reward = -0.5
        if (self.agent_pos == self.pos_2).all():
            reward = -0.5
        if (self.agent_pos == self.pos_3).all():
            reward = 0.5
        if (self.agent_pos == self.goal_pos).all():
            reward = 1.0
        return reward

    def step(self, action):
        """Apply ``action`` (subject to random slip) and advance one timestep.

        Args:
            action: action id 0-3 (see class docstring). Any other value
                leaves the agent in place but still consumes a timestep.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            flattened grid and ``info`` is this episode's log mapping each
            timestep to ``(agent position, reward)``. The same dict object is
            returned on every step and grows as the episode progresses.
        """
        # One random draw per step, taken even for unknown actions.
        p = np.random.random()
        if p < 0.85:
            outcome = 0
        elif p < 0.9:
            outcome = 1
        elif p < 0.95:
            outcome = 2
        else:
            outcome = 3

        outcomes = self._OUTCOMES.get(action)
        d_row, d_col = outcomes[outcome] if outcomes is not None else (0, 0)
        moved = [self.agent_pos[0] + d_row, self.agent_pos[1] + d_col]
        # Clipping keeps the agent on the grid and yields a new ndarray, so
        # positions already stored in the log are never mutated.
        self.agent_pos = np.clip(moved, 0, 3)

        self.state = self._build_state()
        observation = self.state.flatten()

        reward = self._reward_at_agent()

        self.timestep += 1
        reached_goal = bool((self.agent_pos == self.goal_pos).all())
        done = self.timestep >= self.max_timesteps or reached_goal

        self.info[self.timestep] = (self.agent_pos.copy(), reward)

        return observation, reward, done, self.info

    def render(self):
        """Draw the current grid with matplotlib."""
        plt.imshow(self.state)

In [None]:
# Play a single episode in the stochastic grid, redrawing the board in a
# Colab widget after every move, then report each step and the total reward.
env = GridEnvironment_Stochastic()
agent = RandomAgent(env)
info, counter = {}, 0

obs, done = env.reset(), False

output_grid = widgets.Grid(1, 1)
with output_grid.output_to(0, 0):
    env.render()

# ``done`` starts out False, so the body always runs at least once.
while True:
    action = agent.step(obs)
    obs, reward, done, info = env.step(action)
    counter += 1
    with output_grid.output_to(0, 0):
        output_grid.clear_cell()
        env.render()
    time.sleep(1)
    if done:
        break

# Episode summary: one line per logged step, accumulating the rewards with
# plain left-to-right addition.
temp = 0
for step, (position, step_reward) in info.items():
    print(f'In the step {step}, the location is {tuple(position)} and the reward is {step_reward}.')
    temp += step_reward
print(f'The total reward is {temp}.')