In [3]:
import copy
import time
import numpy as np
import random

In [None]:
class GridWorld:
    """Grid-world environment with walls and a single goal cell.

    The map is a ``(y_max, x_max)`` array indexed as ``map[y, x]``. Every
    even row is filled with walls, every even column is then cleared again,
    and the cell at ``(x=0, y=0)`` is the goal.

    Agent positions are ``(x, y)`` tuples, whereas ``zero_list`` holds
    ``(row, column)`` pairs, i.e. ``(y, x)``, as produced by ``np.where``.
    """

    def __init__(self, x_max, y_max, start_x=None, start_y=None):
        """Build the map and remember the start position.

        Args:
            x_max: Number of columns of the grid.
            y_max: Number of rows of the grid.
            start_x: Start column of the agent. Defaults to the last column.
            start_y: Start row of the agent. Defaults to the last row.
        """
        self.x_max = x_max
        self.y_max = y_max
        self.filed_type = {
            "N": 0,  # normal cell
            "G": 1,  # goal
            "W": 2,  # wall
            "H": 3,  # another human
        }
        self.actions = {
            "UP": 0,
            "DOWN": 1,
            "LEFT": 2,
            "RIGHT": 3
        }
        self.map = np.zeros((y_max, x_max))
        self.map[::2] = self.filed_type["W"]       # wall off every even row ...
        self.map[:, ::2] = self.filed_type["N"]    # ... then reopen even columns
        self.map[0, 0] = self.filed_type["G"]

        # (row, column) indices of every normal cell.
        self.zero_list = list(zip(*np.where(self.map < 1)))

        # NOTE(review): the start cell is not validated; depending on the
        # grid size the default corner may coincide with a wall.
        self.start_pos = (
            x_max - 1 if start_x is None else start_x,
            y_max - 1 if start_y is None else start_y,
        )
        # Place the agent right away so step() works even before reset().
        self.agent_pos = self.start_pos

    def step(self, action):
        """Execute ``action`` for the agent.

        Returns:
            A ``(position, reward, is_goal)`` tuple. When the move is not
            possible the position is left unchanged and the reward is -10.
        """
        x, y = self.agent_pos

        if not self._is_possible_action(x, y, action):
            return self.agent_pos, -10, False

        to_x, to_y = self._next_position(x, y, action)
        is_goal = self._is_end_episode(to_x, to_y)
        reward = self._compute_reward(to_x, to_y)
        self.agent_pos = to_x, to_y
        return self.agent_pos, reward, is_goal

    def _next_position(self, x, y, action):
        """Return the cell reached from ``(x, y)`` by ``action``.

        An action that is not one of ``self.actions`` leaves the position
        unchanged.
        """
        if action == self.actions["UP"]:
            return x, y - 1
        if action == self.actions["DOWN"]:
            return x, y + 1
        if action == self.actions["LEFT"]:
            return x - 1, y
        if action == self.actions["RIGHT"]:
            return x + 1, y
        return x, y

    def _is_end_episode(self, x, y):
        """Return True when ``(x, y)`` is the goal cell."""
        return bool(self.map[y, x] == self.filed_type["G"])

    def _is_wall(self, x, y):
        """Return True when ``(x, y)`` holds a wall or another human."""
        blocked = (self.filed_type["W"], self.filed_type["H"])
        return bool(self.map[y, x] in blocked)

    def _is_possible_action(self, x, y, action):
        """Return True when ``action`` taken from ``(x, y)`` is allowed.

        A move is rejected when it leaves the grid or ends on a wall/human.
        """
        to_x, to_y = self._next_position(x, y, action)

        rows, cols = self.map.shape
        # Bounds are checked first so the map is never indexed out of range
        # (a negative index would silently wrap around).
        if not 0 <= to_y < rows:
            return False
        if not 0 <= to_x < cols:
            return False
        return not self._is_wall(to_x, to_y)

    def _compute_reward(self, x, y):
        """Return the reward for entering ``(x, y)``: 100 on the goal, else 0."""
        if self.map[y, x] == self.filed_type["G"]:
            return 100
        # Normal cells give 0. Any other cell type also yields 0 so that the
        # reward is always numeric (step() never enters walls/humans).
        return 0

    def reset(self):
        """Move the agent back to the start position and return it."""
        self.agent_pos = self.start_pos
        return self.start_pos


: 

In [22]:
# Grid dimensions.
X_MAX, Y_MAX = 15, 18
# Start the agent at the far edge of the grid.
START_X, START_Y = X_MAX - 1, Y_MAX - 1
POPULATION = 2

# Learning constants.
NB_EPISODE = 1000        # number of episodes
EPSILON = 0.1            # exploration rate
ALPHA = 0.1              # learning rate
GAMMA = 0.90             # discount factor
ACTIONS = np.arange(0, 4)   # set of available actions

class Summon:
    """Create and hold a population of Q-learning agents.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, zero_list, population=2):
        """Spawn ``population`` agents at random entries of ``zero_list``.

        Args:
            zero_list: Non-empty sequence of candidate start states; one is
                drawn at random (with replacement) for every agent.
            population: Number of agents to create.
        """
        # Whether the agents have reached the goal.
        self.is_end_episode = False
        # The generated QLearningAgent instances.
        self.agents = self.__generate_agents(zero_list, population)

    def __generate_agents(self, zero_list, population):
        """Return ``population`` new agents, each with a random start state."""
        # NOTE(review): if zero_list holds (row, column) pairs, i.e. (y, x),
        # the observation order must match what the environment reports for
        # agent positions -- confirm against the caller.
        return [
            QLearningAgent(
                alpha=ALPHA,
                gamma=GAMMA,
                epsilon=EPSILON,
                actions=ACTIONS,
                observation=random.choice(zero_list),  # initial state
            )
            for _ in range(population)
        ]


# Initialise the grid-world environment.
# NOTE(review): this call requires GridWorld.__init__ to accept the
# start_x / start_y keyword arguments -- confirm against the class.
grid_env = GridWorld(x_max=X_MAX, y_max=Y_MAX, start_x=START_X, start_y=START_Y)

# Spawn the agents on cells taken from the environment's zero_list.
summon = Summon(zero_list=grid_env.zero_list, population=POPULATION)

print(grid_env.zero_list)

[(0, 2), (0, 4), (0, 6), (0, 8), (0, 10), (0, 12), (0, 14), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (2, 0), (2, 2), (2, 4), (2, 6), (2, 8), (2, 10), (2, 12), (2, 14), (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (4, 0), (4, 2), (4, 4), (4, 6), (4, 8), (4, 10), (4, 12), (4, 14), (5, 0), (5, 1), (5, 2), (5, 3), (5, 4), (5, 5), (5, 6), (5, 7), (5, 8), (5, 9), (5, 10), (5, 11), (5, 12), (5, 13), (5, 14), (6, 0), (6, 2), (6, 4), (6, 6), (6, 8), (6, 10), (6, 12), (6, 14), (7, 0), (7, 1), (7, 2), (7, 3), (7, 4), (7, 5), (7, 6), (7, 7), (7, 8), (7, 9), (7, 10), (7, 11), (7, 12), (7, 13), (7, 14), (8, 0), (8, 2), (8, 4), (8, 6), (8, 8), (8, 10), (8, 12), (8, 14), (9, 0), (9, 1), (9, 2), (9, 3), (9, 4), (9, 5), (9, 6), (9, 7), (9, 8), (9, 9), (9, 10), (9, 11), (9, 12), (9, 13), (9, 14), (10, 0), (10, 2), (10, 4), (10, 6), (10, 8), (1

In [19]:
class QLearningAgent:
    """Tabular Q-learning agent with epsilon-greedy action selection.

    The Q table is a dict keyed by ``str(observation)``; each entry is an
    array holding one value per action.
    """

    def __init__(
            self,
            alpha=.2,
            epsilon=.1,
            gamma=.99,
            actions=None,
            observation=None):
        start_key = str(observation)

        # Hyper-parameters: learning rate, discount factor, exploration rate.
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

        self.actions = actions
        self.observation = observation
        self.reward_history = []

        # Current / initial state keys and the last transition taken.
        self.state = start_key
        self.ini_state = start_key
        self.previous_state = None
        self.previous_action = None

        self.q_values = self._init_q_values()

    def _init_q_values(self):
        """Create the Q table containing only the current state, all zeros."""
        return {self.state: np.zeros(len(self.actions))}

    def init_state(self):
        """Reset current and previous state to the initial state; return it."""
        # State keys are immutable strings, so plain assignment is enough.
        self.previous_state = self.ini_state
        self.state = self.ini_state
        return self.state

    def act(self):
        """Pick an action epsilon-greedily, remember it and return it."""
        state_values = self.q_values[self.state]
        explore = np.random.uniform() < self.epsilon
        if explore:
            chosen = np.random.randint(0, len(state_values))  # random action
        else:
            chosen = np.argmax(state_values)                  # greedy action

        self.previous_action = chosen
        return chosen

    def observe(self, next_state, reward=None):
        """Record the next state and, when a reward is given, learn from it."""
        state_key = str(next_state)
        if state_key not in self.q_values:
            # First visit to this state: start with all-zero action values.
            self.q_values[state_key] = np.zeros(len(self.actions))

        self.previous_state, self.state = self.state, state_key

        if reward is None:
            return
        self.reward_history.append(reward)
        self.learn(reward)

    def learn(self, reward):
        """Apply the Q-learning update to the previous state/action pair.

        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s') - Q(s, a))
        """
        prev_row = self.q_values[self.previous_state]
        current = prev_row[self.previous_action]           # Q(s, a)
        best_next = max(self.q_values[self.state])         # max Q(s')
        td_error = reward + (self.gamma * best_next) - current
        prev_row[self.previous_action] = current + (self.alpha * td_error)


In [24]:
# Scratch cell: build a list of ten False flags and display it.
a = [False] * 10
print(a)

[False, False, False, False, False, False, False, False, False, False]
