In [4]:
import torch
x = torch.rand(5, 3)
print(x)

tensor([[0.5584, 0.3591, 0.2997],
        [0.4296, 0.9893, 0.0460],
        [0.8746, 0.3794, 0.8933],
        [0.5430, 0.7232, 0.7249],
        [0.3851, 0.9902, 0.8339]])


In [5]:
import random
import numpy as np
import matplotlib.pyplot as plt

In [23]:
# Q-Learning test: 1 agent start on a 2D 30 by 30 grid, The End
# with 4 actions (up, down, left, right)
# and 1 goal (bottom right corner)
# starting from the top left corner (0, 0)
# with a reward of 1 for reaching the goal
# and a reward of -1 for hitting the walls
# and a reward of -0.01 for each step taken
# and a discount factor of 0.9

class TheEnd:
    def __init__(self, size=30, goal=(29, 29), walls=None):
        self.size = size
        self.goal_xy = goal
        self.walls = set(walls) if walls else set()
        self.player_xy = (0, 0)
        self.goal_reached = False

    def reset(self):
        """ゲームをリセットして、スタート地点 (0,0) に戻す。"""
        self.player_xy = (0, 0)
        self.goal_reached = False
        return self.player_xy 

    def is_valid(self, nx, ny):
        """範囲内かつ壁でなければ True を返す。"""
        if nx < 0 or nx >= self.size or ny < 0 or ny >= self.size:
            return False
        if (nx, ny) in self.walls:
            return False
        return True

    def step(self, action):
        """
        action: 0=up, 1=down, 2=left, 3=right
        Returns: (next_state, reward, done, info)
        """
        x, y = self.player_xy
        if action == 0:   # up
            nx, ny = x, y - 1
        elif action == 1: # down
            nx, ny = x, y + 1
        elif action == 2: # left
            nx, ny = x - 1, y
        elif action == 3: # right
            nx, ny = x + 1, y
        else:
            return self.player_xy, 0.0, False, {}

        reward = -0.01 
        
        # 移動先が有効かチェック（壁や外枠に当たる場合は移動させず罰則）
        if not self.is_valid(nx, ny):
            reward = -1.0
            # プレイヤー座標は更新しない（動かない）
            next_state = self.player_xy
        else:
            # 有効なら座標を更新
            self.player_xy = (nx, ny)
            next_state = (nx, ny)
        
        # ゴール判定
        if next_state == self.goal_xy:
            reward = 1.0
            self.goal_reached = True
            done = True
        else:
            done = False

        return next_state, reward, done, {}


def train_q_learning(env, num_episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1):
    # Q テーブルの初期化
    Q = np.zeros((env.size, env.size, 4))
    all_episode_rewards = []
    
    # 追加: 各エピソードの軌跡を保存するリスト
    episode_paths = []

    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0.0
        
        # 今のエピソードで訪れた (x, y) のリスト
        path = []

        while not done:
            path.append(state)  # 現在位置を記録
            
            x, y = state
            # ε-greedy で行動を選択
            if random.random() < epsilon:
                action = random.randint(0, 3)
            else:
                action = np.argmax(Q[x, y, :])
            
            next_state, reward, done, _ = env.step(action)
            nx, ny = next_state
            total_reward += reward

            # Q 値の更新
            current_q = Q[x, y, action]
            max_next_q = np.max(Q[nx, ny, :])
            new_q = current_q + alpha * (reward + gamma * max_next_q - current_q)
            Q[x, y, action] = new_q

            state = next_state

        # エピソード終了時の最終位置も記録しておく
        path.append(state)
        all_episode_rewards.append(total_reward)
        episode_paths.append(path)

    # 学習後、各エピソードの軌跡を画像として保存
    for episode_index, path in enumerate(episode_paths):
        episode_total_reward = all_episode_rewards[episode_index]
        plt.figure()  # 新しい Figure を作成
        # path の x 座標、y 座標をそれぞれ取り出し
        xs = [p[0] for p in path]
        ys = [p[1] for p in path]

        # 壁の描画（散布図）: 壁が多い場合は描画に時間がかかるので注意
        wall_xs = [w[0] for w in env.walls]
        wall_ys = [w[1] for w in env.walls]
        plt.scatter(wall_xs, wall_ys, marker='s', color='black')  # 黒色で壁を描画

        # エージェントの通ったルートを描画
        plt.plot(xs, ys, marker='o', linestyle='-')
        
        # ゴールをわかりやすく描画
        # (デフォルトの色・スタイルに任せるために明示設定はしない)
        gx, gy = env.goal_xy
        plt.scatter(gx, gy, marker='X', s=200)  # ゴール

        # スタート位置を強調表示（(0,0) なら最初の要素 path[0] など）
        sx, sy = path[0]
        plt.scatter(sx, sy, marker='s', s=100)  # スタート

        # 軸の範囲（少し余裕を持たせる）
        plt.xlim(-1, env.size)
        plt.ylim(-1, env.size)

        plt.title(f"Episode {episode_index} Path, Reward: {episode_total_reward}")
        plt.xlabel("X")
        plt.ylabel("Y")
        plt.grid(True)
        plt.xticks(range(env.size))
        plt.yticks(range(env.size))
        plt.savefig(f"./pic/episode_{episode_index}.png")  # ファイル名を指定
        plt.close()  # Figure を閉じる

    return Q, all_episode_rewards


In [28]:
# delete everything in the pic directory
import os
import shutil
if os.path.exists("./pic"):
    shutil.rmtree("./pic")
os.makedirs("./pic", exist_ok=True)

walls_pig = [(9, 3), (10, 3), (21, 3), (9, 4), (10, 4), (11, 4), (21, 4), (8, 5), (9, 5), (10, 5), (11, 5), (12, 5), (19, 5), (20, 5), (21, 5), (22, 5), (8, 6), (9, 6), (10, 6), (11, 6), (12, 6), (19, 6), (20, 6), (21, 6), (22, 6), (8, 7), (9, 7), (10, 7), (11, 7), (12, 7), (13, 7), (14, 7), (15, 7), (16, 7), (17, 7), (18, 7), (19, 7), (20, 7), (21, 7), (22, 7), (8, 8), (9, 8), (10, 8), (18, 8), (19, 8), (20, 8), (21, 8), (22, 8), (7, 9), (8, 9), (21, 9), (22, 9), (23, 9), (5, 10), (6, 10), (7, 10), (23, 10), (24, 10), (4, 11), (5, 11), (6, 11), (7, 11), (17, 11), (18, 11), (19, 11), (20, 11), (21, 11), (22, 11), (23, 11), (24, 11), (25, 11), (4, 12), (5, 12), (7, 12), (17, 12), (19, 12), (21, 12), (23, 12), (25, 12), (4, 13), (17, 13), (18, 13), (19, 13), (20, 13), (21, 13), (22, 13), (23, 13), (25, 13), (3, 14), (4, 14), (17, 14), (25, 14), (2, 15), (3, 15), (24, 15), (25, 15), (2, 16), (3, 16), (20, 16), (22, 16), (24, 16), (2, 17), (3, 17), (24, 17), (2, 18), (3, 18), (23, 18), (2, 19), (4, 19), (5, 19), (22, 19), (23, 19), (2, 20), (5, 20), (6, 20), (7, 20), (20, 20), (21, 20), (22, 20), (2, 21), (7, 21), (8, 21), (9, 21), (10, 21), (11, 21), (12, 21), (13, 21), (14, 21), (15, 21), (16, 21), (17, 21), (18, 21), (19, 21), (20, 21), (21, 21), (22, 21), (23, 21), (2, 22), (19, 22), (21, 22), (22, 22), (23, 22), (2, 23), (19, 23), (21, 23), (22, 23), (23, 23), (2, 24), (22, 24), (23, 24), (2, 25)]
walls_medium_difficulty = [(11, 0), (13, 0), (20, 0), (21, 0), (22, 0), (23, 0), (11, 1), (13, 1), (22, 1), (9, 2), (10, 2), (11, 2), (12, 2), (13, 2), (0, 3), (1, 3), (2, 3), (3, 3), (9, 3), (11, 3), (16, 3), (17, 3), (18, 3), (19, 3), (26, 3), (3, 4), (4, 4), (9, 4), (11, 4), (16, 4), (19, 4), (20, 4), (21, 4), (26, 4), (3, 5), (4, 5), (9, 5), (10, 5), (11, 5), (15, 5), (16, 5), (21, 5), (26, 5), (0, 6), (1, 6), (2, 6), (3, 6), (15, 6), (21, 6), (26, 6), (15, 7), (25, 7), (26, 7), (15, 8), (24, 8), (25, 8), (5, 9), (6, 9), (7, 9), (8, 9), (15, 9), (16, 9), (3, 10), (4, 10), (5, 10), (8, 10), (12, 10), (13, 10), (14, 10), (15, 10), (16, 10), (3, 11), (8, 11), (11, 11), (12, 11), (15, 11), (16, 11), (3, 12), (8, 12), (11, 12), (16, 12), (24, 12), (25, 12), (26, 12), (27, 12), (3, 13), (8, 13), (11, 13), (18, 13), (23, 13), (24, 13), (27, 13), (28, 13), (3, 14), (8, 14), (11, 14), (18, 14), (22, 14), (23, 14), (28, 14), (29, 14), (3, 15), (8, 15), (11, 15), (12, 15), (18, 15), (22, 15), (29, 15), (12, 16), (18, 16), (22, 16), (23, 16), (13, 17), (18, 17), (23, 17), (17, 18), (18, 18), (23, 18), (17, 19), (23, 19), (24, 19), (6, 21), (7, 21), (8, 21), (9, 21), (3, 22), (4, 22), (5, 22), (6, 22), (9, 22), (10, 22), (2, 23), (3, 23), (10, 23), (18, 23), (19, 23), (20, 23), (21, 23), (22, 23), (10, 24), (18, 24), (22, 24), (23, 24), (9, 25), (10, 25), (17, 25), (23, 25), (5, 26), (8, 26), (9, 26), (17, 26), (23, 26), (5, 27), (6, 27), (7, 27), (17, 27), (23, 27), (17, 28), (18, 28), (22, 28), (23, 28), (18, 29), (21, 29), (22, 29)]
env = TheEnd(size=30, goal=(29, 29), walls=walls_medium_difficulty)
learned_Q, rewards_history = train_q_learning(env, num_episodes=800)
print(rewards_history)

[-462.80999999996345, -126.25000000000499, -109.25000000000722, -75.15000000000096, -93.19000000000567, -114.1500000000107, -145.17000000000405, -79.05000000000263, -23.010000000000513, -89.69000000000325, -67.64999999999777, -68.88999999999967, -55.12999999999893, -58.7899999999979, -66.3699999999989, -15.989999999999963, -39.23000000000064, -63.46999999999811, -31.07000000000135, -80.49000000000352, -29.230000000001137, -33.29000000000142, -54.30999999999837, -38.87000000000071, -58.309999999998205, -21.89000000000015, -25.350000000000666, -38.85000000000057, -45.48999999999955, -35.59000000000043, -24.89000000000072, -18.830000000000148, -29.950000000001246, -31.750000000000497, -30.490000000001228, -42.78999999999975, -13.649999999999876, -57.76999999999869, -29.550000000000853, -18.3500000000002, -27.190000000000698, -30.830000000000958, -29.350000000000843, -18.11000000000038, -12.4299999999998, -50.86999999999842, -36.17000000000025, -14.249999999999844, -32.49000000000069, -20.

In [29]:
import imageio
import os
from glob import glob

# Create a GIF from the images in the ./pic directory
def create_gif_from_images(image_folder, output_gif_path, duration=100):
    images = []
    # Sort filenames numerically by extracting the episode number
    filenames = sorted(
        glob(os.path.join(image_folder, '*.png')),
        key=lambda x: int(os.path.basename(x).split('_')[1].split('.')[0])
    )
    for filename in filenames:
        images.append(imageio.imread(filename))
    imageio.mimsave(output_gif_path, images, duration=duration)

# Create the GIF
create_gif_from_images('./pic', 'q_learning_path.gif', duration=0.3)
# The GIF will be saved as 'q_learning_path.gif' in the current directory.

  images.append(imageio.imread(filename))
