Solving Package delivery using single-agent PPO with a naive feature representation learning: concatenante all the feature in to a single state vector, and multiple robot actions as a multi discrete distribution.

In [None]:
# %%capture
# !git clone https://github.com/cuongtv312/marl-delivery.git
# %cd marl-delivery
# !pip install -r requirements.txt

In [2]:
%%capture
!pip install stable-baselines3

In [3]:
from env import Environment
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [1]:
import numpy as np
import torch
from env import Environment
from ppo_agent import PPOAgent
from tqdm import trange

In [10]:
num_episodes = 500
n_robots = 5
n_packages = 20
max_steps = 2000
map_file = "map3.txt"

# Thông số PPO
obs_dim = 454  # 3 (robot) + 5*10 (gói) + 1 (time) + 400 (map 20x20)
act_dim = 15   # 5 hướng * 3 hành động
agent = PPOAgent(obs_dim, act_dim)


In [3]:
# TODO: Modify this one to add more information to the Agents
def convert_state(state, robot_id, max_pkg=10):
    robot = state["robots"][robot_id]
    rx, ry, carrying = robot[0] / 20.0, robot[1] / 20.0, robot[2] / 100.0
    time = np.array([state["time_step"] / 1000.0], dtype=np.float32)

    map_flat = np.array(state["map"], dtype=np.float32).flatten() / 1.0

    packages = state["packages"][:max_pkg]
    pkg_feats = []
    for p in packages:
        sx, sy, dx, dy = p[1]/20.0, p[2]/20.0, p[3]/20.0, p[4]/20.0
        deadline = p[5] / 1000.0
        pkg_feats.extend([sx, sy, dx, dy, deadline])
    while len(pkg_feats) < max_pkg * 5:
        pkg_feats.append(0.0)

    return np.concatenate([[rx, ry, carrying], pkg_feats, time, map_flat])



In [4]:
# TODO: Modify this one to make the agent learn faster

def reward_shaping(r, action):
    shaped = r
    if action[1] == '1':
        shaped += 0.2  # thưởng nhặt
    elif action[1] == '2':
        shaped += 0.5  # thưởng giao
    if action[0] == 'S':
        shaped -= 0.01  # phạt đứng yên
    return shaped



In [5]:
def reward_shaping(r, action):
    shaped = r
    if action[1] == '1':
        shaped += 0.2  # thưởng nhặt
    elif action[1] == '2':
        shaped += 0.5  # thưởng giao
    if action[0] == 'S':
        shaped -= 0.01  # phạt đứng yên
    return shaped


In [6]:
# Avoid to modify the Env class,
# If it is neccessary, you should describe those changes clearly in report and code
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)


        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)
        # Define observation space as a dictionary

        self.observation_space = spaces.Box(low=0, high=100, shape=first_state.shape, dtype=np.float32)


        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

NameError: name 'gym' is not defined

In [7]:
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback


# Parallel environments
vec_env = make_vec_env(lambda: Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10), n_envs=10)
eval_env = Monitor(Env('map2.txt', 100, 5, 20, -0.01, 10., 1., 10), "ppo_delivery")

eval_callback = EvalCallback(eval_env, best_model_save_path="./best_model/",
                             log_path="./logs/", eval_freq=5000,
                             deterministic=True, render=False)

model = PPO("MlpPolicy", vec_env, verbose=1)
model.learn(total_timesteps=10000, callback=eval_callback)
model.save("ppo_delivery")


NameError: name 'Env' is not defined

In [8]:
from ppo_agent import PPOAgent
agent = PPOAgent(obs_dim=203, act_dim=15)  # sửa số chiều phù hợp

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        actions = []
        log_probs = []
        entropies = []
        obs_list = []

        for i in range(len(state["robots"])):
            obs = convert_state(state, i)
            action, log_prob, entropy = agent.select_action(obs)
            obs_list.append(obs)
            actions.append((action // 3, str(action % 3)))  # giải mã action
            log_probs.append(log_prob)
            entropies.append(entropy)

        next_state, rewards, done, info = env.step(actions)

        for i in range(len(state["robots"])):
            next_obs = convert_state(next_state, i)
            r = rewards[i]
            agent.store(obs_list[i], actions[i][0] * 3 + int(actions[i][1]), r, next_obs, done, log_probs[i])
            total_reward += r

        state = next_state

    agent.train()
    print(f"Episode {episode} | Reward: {total_reward}")


NameError: name 'episodes' is not defined

In [9]:
obs,_ = eval_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, _, info = eval_env.step(action)
    #print('='*10)
    #eval_env.unwrapped.env.render()
    if dones:
        break

print(info)

{'total_reward': -3.299999999999999, 'total_time_steps': 100, 'episode': {'r': -3.3, 'l': 100, 't': 168.352014}}


In [10]:
!pip freeze | grep stable_baselines3

'grep' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
for ep in trange(num_episodes):
    env = Environment(map_file=map_file, max_time_steps=max_steps,
                      n_robots=n_robots, n_packages=n_packages)
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        actions = []
        log_probs = []
        obs_list = []

        for i in range(n_robots):
            obs = convert_state(state, i)
            act_flat, log_prob, _ = agent.select_action(obs)
            move = ['S', 'L', 'R', 'U', 'D'][act_flat // 3]
            act = str(act_flat % 3)
            actions.append((move, act))
            obs_list.append(obs)
            log_probs.append(log_prob)

        next_state, reward, done, infos = env.step(actions)

        for i in range(n_robots):
            shaped_reward = reward_shaping(reward, actions[i])  
            next_obs = convert_state(next_state, i)
            flat_action = ['S', 'L', 'R', 'U', 'D'].index(actions[i][0]) * 3 + int(actions[i][1])
            agent.store(obs_list[i], flat_action, shaped_reward, next_obs, done, log_probs[i])
            total_reward += shaped_reward

        state = next_state

    agent.train()
    print(f"Episode {ep} | Total reward: {total_reward:.2f}")


  obs = torch.FloatTensor(obs)
  0%|          | 1/500 [00:08<1:07:44,  8.15s/it]

Episode 0 | Total reward: 2117.28


  0%|          | 2/500 [00:15<1:05:09,  7.85s/it]

Episode 1 | Total reward: 2120.58


  1%|          | 3/500 [00:24<1:08:01,  8.21s/it]

Episode 2 | Total reward: 2133.44


  1%|          | 4/500 [00:32<1:08:42,  8.31s/it]

Episode 3 | Total reward: 2108.02


  1%|          | 5/500 [00:42<1:11:32,  8.67s/it]

Episode 4 | Total reward: 2048.57


  1%|          | 6/500 [00:51<1:12:20,  8.79s/it]

Episode 5 | Total reward: 2035.84


  1%|▏         | 7/500 [00:59<1:11:22,  8.69s/it]

Episode 6 | Total reward: 2037.35


  2%|▏         | 8/500 [01:08<1:10:25,  8.59s/it]

Episode 7 | Total reward: 2056.96


  2%|▏         | 9/500 [01:16<1:09:06,  8.45s/it]

Episode 8 | Total reward: 2011.99


  2%|▏         | 10/500 [01:24<1:08:44,  8.42s/it]

Episode 9 | Total reward: 2068.21


  2%|▏         | 11/500 [01:32<1:07:53,  8.33s/it]

Episode 10 | Total reward: 2010.59


  2%|▏         | 12/500 [01:41<1:08:11,  8.38s/it]

Episode 11 | Total reward: 2122.77


  3%|▎         | 13/500 [01:48<1:05:53,  8.12s/it]

Episode 12 | Total reward: 2115.40


  3%|▎         | 14/500 [01:56<1:04:56,  8.02s/it]

Episode 13 | Total reward: 2096.59


  3%|▎         | 15/500 [02:04<1:03:40,  7.88s/it]

Episode 14 | Total reward: 2090.88


  3%|▎         | 16/500 [02:11<1:03:22,  7.86s/it]

Episode 15 | Total reward: 2064.34


  3%|▎         | 17/500 [02:19<1:02:20,  7.74s/it]

Episode 16 | Total reward: 2071.03


  4%|▎         | 18/500 [02:26<1:01:41,  7.68s/it]

Episode 17 | Total reward: 2077.92


  4%|▍         | 19/500 [02:34<1:01:43,  7.70s/it]

Episode 18 | Total reward: 2079.03


  4%|▍         | 20/500 [02:42<1:01:04,  7.63s/it]

Episode 19 | Total reward: 2056.01


  4%|▍         | 21/500 [02:49<1:01:00,  7.64s/it]

Episode 20 | Total reward: 2029.15


  4%|▍         | 22/500 [02:57<1:00:22,  7.58s/it]

Episode 21 | Total reward: 2060.31


  5%|▍         | 23/500 [03:04<1:00:44,  7.64s/it]

Episode 22 | Total reward: 2045.58


  5%|▍         | 24/500 [03:12<1:00:24,  7.61s/it]

Episode 23 | Total reward: 2074.88


  5%|▌         | 25/500 [03:20<1:00:32,  7.65s/it]

Episode 24 | Total reward: 2059.09


  5%|▌         | 26/500 [03:27<1:00:23,  7.65s/it]

Episode 25 | Total reward: 2086.08


  5%|▌         | 27/500 [03:35<1:00:30,  7.68s/it]

Episode 26 | Total reward: 2087.61


  6%|▌         | 28/500 [03:43<59:51,  7.61s/it]  

Episode 27 | Total reward: 2028.88


  6%|▌         | 29/500 [03:50<59:21,  7.56s/it]

Episode 28 | Total reward: 2089.44


  6%|▌         | 30/500 [03:58<59:32,  7.60s/it]

Episode 29 | Total reward: 2058.58


  6%|▌         | 31/500 [04:05<59:16,  7.58s/it]

Episode 30 | Total reward: 2008.25


  6%|▋         | 32/500 [04:13<59:33,  7.64s/it]

Episode 31 | Total reward: 2033.67


  7%|▋         | 33/500 [04:21<59:23,  7.63s/it]

Episode 32 | Total reward: 2065.18


  7%|▋         | 34/500 [04:29<1:01:00,  7.85s/it]

Episode 33 | Total reward: 2062.68


  7%|▋         | 35/500 [04:37<1:01:50,  7.98s/it]

Episode 34 | Total reward: 2121.44


  7%|▋         | 36/500 [04:45<1:01:32,  7.96s/it]

Episode 35 | Total reward: 2044.79


  7%|▋         | 37/500 [04:53<1:01:00,  7.91s/it]

Episode 36 | Total reward: 2086.05


  8%|▊         | 38/500 [05:01<1:01:05,  7.93s/it]

Episode 37 | Total reward: 2155.96


  8%|▊         | 39/500 [05:09<1:00:19,  7.85s/it]

Episode 38 | Total reward: 1991.93


  8%|▊         | 40/500 [05:16<59:51,  7.81s/it]  

Episode 39 | Total reward: 2065.34


  8%|▊         | 41/500 [05:25<1:01:10,  8.00s/it]

Episode 40 | Total reward: 2064.51


  8%|▊         | 42/500 [05:32<1:00:20,  7.90s/it]

Episode 41 | Total reward: 2040.16


  9%|▊         | 43/500 [05:41<1:00:30,  7.94s/it]

Episode 42 | Total reward: 2043.60


  9%|▉         | 44/500 [05:48<59:39,  7.85s/it]  

Episode 43 | Total reward: 2081.26


  9%|▉         | 45/500 [05:56<59:35,  7.86s/it]

Episode 44 | Total reward: 2043.37


  9%|▉         | 46/500 [06:04<59:00,  7.80s/it]

Episode 45 | Total reward: 2027.02


  9%|▉         | 47/500 [06:12<59:06,  7.83s/it]

Episode 46 | Total reward: 2028.60


 10%|▉         | 48/500 [06:19<58:40,  7.79s/it]

Episode 47 | Total reward: 2121.16


 10%|▉         | 49/500 [06:27<59:00,  7.85s/it]

Episode 48 | Total reward: 2085.38


 10%|█         | 50/500 [06:35<59:12,  7.90s/it]

Episode 49 | Total reward: 2030.30


 10%|█         | 51/500 [06:43<58:37,  7.83s/it]

Episode 50 | Total reward: 2043.14


 10%|█         | 52/500 [06:51<58:48,  7.88s/it]

Episode 51 | Total reward: 2030.76


 11%|█         | 53/500 [06:59<1:00:01,  8.06s/it]

Episode 52 | Total reward: 2095.62


 11%|█         | 54/500 [07:07<59:36,  8.02s/it]  

Episode 53 | Total reward: 2010.86


 11%|█         | 55/500 [07:15<58:41,  7.91s/it]

Episode 54 | Total reward: 2056.83


 11%|█         | 56/500 [07:23<58:36,  7.92s/it]

Episode 55 | Total reward: 2042.01


 11%|█▏        | 57/500 [07:31<58:02,  7.86s/it]

Episode 56 | Total reward: 2090.46


 12%|█▏        | 58/500 [07:39<58:43,  7.97s/it]

Episode 57 | Total reward: 2078.76


 12%|█▏        | 59/500 [07:47<57:54,  7.88s/it]

Episode 58 | Total reward: 2048.12


 12%|█▏        | 60/500 [07:54<57:24,  7.83s/it]

Episode 59 | Total reward: 2034.74


 12%|█▏        | 61/500 [08:02<57:22,  7.84s/it]

Episode 60 | Total reward: 2045.96


 12%|█▏        | 62/500 [08:10<56:53,  7.79s/it]

Episode 61 | Total reward: 2084.29


 13%|█▎        | 63/500 [08:18<56:57,  7.82s/it]

Episode 62 | Total reward: 2070.06


 13%|█▎        | 64/500 [08:25<56:27,  7.77s/it]

Episode 63 | Total reward: 2045.42


 13%|█▎        | 65/500 [08:33<56:33,  7.80s/it]

Episode 64 | Total reward: 2090.40


 13%|█▎        | 66/500 [08:41<56:48,  7.85s/it]

Episode 65 | Total reward: 2062.34


 13%|█▎        | 67/500 [08:49<56:48,  7.87s/it]

Episode 66 | Total reward: 1982.62


 14%|█▎        | 68/500 [08:57<56:12,  7.81s/it]

Episode 67 | Total reward: 2054.59


 14%|█▍        | 69/500 [09:05<56:21,  7.84s/it]

Episode 68 | Total reward: 2050.97


 14%|█▍        | 70/500 [09:12<55:51,  7.79s/it]

Episode 69 | Total reward: 2046.96


 14%|█▍        | 71/500 [09:20<55:28,  7.76s/it]

Episode 70 | Total reward: 2052.47


 14%|█▍        | 72/500 [09:28<55:37,  7.80s/it]

Episode 71 | Total reward: 2081.06


 15%|█▍        | 73/500 [09:36<55:15,  7.77s/it]

Episode 72 | Total reward: 2061.12


 15%|█▍        | 74/500 [09:44<55:51,  7.87s/it]

Episode 73 | Total reward: 2043.74


 15%|█▌        | 75/500 [09:52<55:40,  7.86s/it]

Episode 74 | Total reward: 2034.44


 15%|█▌        | 76/500 [09:59<55:32,  7.86s/it]

Episode 75 | Total reward: 2028.64


 15%|█▌        | 77/500 [10:07<55:08,  7.82s/it]

Episode 76 | Total reward: 2092.39


 16%|█▌        | 78/500 [10:15<55:12,  7.85s/it]

Episode 77 | Total reward: 2042.67


 16%|█▌        | 79/500 [10:23<54:40,  7.79s/it]

Episode 78 | Total reward: 2067.91


 16%|█▌        | 80/500 [10:31<54:43,  7.82s/it]

Episode 79 | Total reward: 2083.85


 16%|█▌        | 81/500 [10:38<54:15,  7.77s/it]

Episode 80 | Total reward: 2047.05


 16%|█▋        | 82/500 [10:46<54:28,  7.82s/it]

Episode 81 | Total reward: 2045.19


 17%|█▋        | 83/500 [10:54<54:27,  7.84s/it]

Episode 82 | Total reward: 2107.82


 17%|█▋        | 84/500 [11:02<54:26,  7.85s/it]

Episode 83 | Total reward: 2020.75


 17%|█▋        | 85/500 [11:10<54:24,  7.87s/it]

Episode 84 | Total reward: 2035.33


 17%|█▋        | 86/500 [11:18<53:48,  7.80s/it]

Episode 85 | Total reward: 2140.79


 17%|█▋        | 87/500 [11:25<53:51,  7.82s/it]

Episode 86 | Total reward: 2076.85


 18%|█▊        | 88/500 [11:33<53:32,  7.80s/it]

Episode 87 | Total reward: 2039.52


 18%|█▊        | 89/500 [11:41<53:45,  7.85s/it]

Episode 88 | Total reward: 2096.29


 18%|█▊        | 90/500 [11:49<53:04,  7.77s/it]

Episode 89 | Total reward: 2069.02


 18%|█▊        | 91/500 [11:57<53:52,  7.90s/it]

Episode 90 | Total reward: 2011.56


 18%|█▊        | 92/500 [12:05<53:20,  7.84s/it]

Episode 91 | Total reward: 2132.56


 19%|█▊        | 93/500 [12:12<52:52,  7.79s/it]

Episode 92 | Total reward: 2026.53


 19%|█▉        | 94/500 [12:20<53:00,  7.83s/it]

Episode 93 | Total reward: 2054.57


 19%|█▉        | 95/500 [12:28<52:30,  7.78s/it]

Episode 94 | Total reward: 2073.09


 19%|█▉        | 96/500 [12:36<52:38,  7.82s/it]

Episode 95 | Total reward: 2055.30


 19%|█▉        | 97/500 [12:43<52:07,  7.76s/it]

Episode 96 | Total reward: 2066.56


 20%|█▉        | 98/500 [12:51<52:16,  7.80s/it]

Episode 97 | Total reward: 2074.36


 20%|█▉        | 99/500 [12:59<51:52,  7.76s/it]

Episode 98 | Total reward: 2048.83


 20%|██        | 100/500 [13:07<52:56,  7.94s/it]

Episode 99 | Total reward: 2069.45


 20%|██        | 101/500 [13:15<52:31,  7.90s/it]

Episode 100 | Total reward: 2114.54


 20%|██        | 102/500 [13:23<53:01,  7.99s/it]

Episode 101 | Total reward: 2058.49


 21%|██        | 103/500 [13:31<52:24,  7.92s/it]

Episode 102 | Total reward: 2053.29


 21%|██        | 104/500 [13:39<51:47,  7.85s/it]

Episode 103 | Total reward: 2010.53


 21%|██        | 105/500 [13:47<51:50,  7.87s/it]

Episode 104 | Total reward: 2058.02


 21%|██        | 106/500 [13:55<51:38,  7.86s/it]

Episode 105 | Total reward: 2071.29


 21%|██▏       | 107/500 [14:03<52:06,  7.96s/it]

Episode 106 | Total reward: 2037.91


 22%|██▏       | 108/500 [14:11<52:45,  8.07s/it]

Episode 107 | Total reward: 2059.95


 22%|██▏       | 109/500 [14:19<53:15,  8.17s/it]

Episode 108 | Total reward: 2028.07


 22%|██▏       | 110/500 [14:27<52:44,  8.11s/it]

Episode 109 | Total reward: 2034.41


 22%|██▏       | 111/500 [14:36<53:08,  8.20s/it]

Episode 110 | Total reward: 2067.93


 22%|██▏       | 112/500 [14:44<52:32,  8.13s/it]

Episode 111 | Total reward: 2031.11


 23%|██▎       | 113/500 [14:52<52:06,  8.08s/it]

Episode 112 | Total reward: 2041.67


 23%|██▎       | 114/500 [15:00<52:03,  8.09s/it]

Episode 113 | Total reward: 2075.67


 23%|██▎       | 115/500 [15:09<52:56,  8.25s/it]

Episode 114 | Total reward: 2028.06


 23%|██▎       | 116/500 [15:17<53:20,  8.33s/it]

Episode 115 | Total reward: 2103.68


 23%|██▎       | 117/500 [15:25<52:40,  8.25s/it]

Episode 116 | Total reward: 2028.09


 24%|██▎       | 118/500 [15:34<53:24,  8.39s/it]

Episode 117 | Total reward: 2067.95


 24%|██▍       | 119/500 [15:43<54:41,  8.61s/it]

Episode 118 | Total reward: 2135.44


 24%|██▍       | 120/500 [15:52<56:09,  8.87s/it]

Episode 119 | Total reward: 2065.75


 24%|██▍       | 121/500 [16:02<57:11,  9.05s/it]

Episode 120 | Total reward: 2051.64


 24%|██▍       | 122/500 [16:11<58:02,  9.21s/it]

Episode 121 | Total reward: 2029.77


 25%|██▍       | 123/500 [16:22<59:26,  9.46s/it]

Episode 122 | Total reward: 2043.25


 25%|██▍       | 124/500 [16:30<57:34,  9.19s/it]

Episode 123 | Total reward: 2032.04


 25%|██▌       | 125/500 [16:38<55:40,  8.91s/it]

Episode 124 | Total reward: 2060.20


 25%|██▌       | 126/500 [16:46<53:51,  8.64s/it]

Episode 125 | Total reward: 2061.70


 25%|██▌       | 127/500 [16:55<52:59,  8.52s/it]

Episode 126 | Total reward: 2126.33
