In [None]:

from gymnasium import Env
from gymnasium.spaces import Box,MultiDiscrete
import numpy as np
import os
from stable_baselines3 import PPO
import sys

In [None]:

from stable_baselines3.common.monitor import Monitor

In [None]:
import pygame


In [None]:
class Agent:
    """Per-agent bookkeeping: position, short move history and carried package."""

    def __init__(self, start_pos):
        # Position is kept as an int32 array built from the given start position.
        self._pos = np.array(start_pos, dtype=np.int32)
        # Nothing is being carried when the agent is created.
        self._carrying = None
        # The last two positions are unknown until the agent has moved.
        self._prev_state = self._prev_prev_state = None
        # History of distances-to-goal; starts out empty.
        self._prev_goal_dists = list()
    

In [None]:
class Package:
    """Base package: stores a position plus picked/delivered/assigned status.

    The priority and reward hooks return neutral values here; subclasses may
    override them.
    """

    def __init__(self, pos):
        self._pos = pos
        self._picked = False
        self._delivered = False
        self._assigned = None

    def copy(self):
        """Return the package state as a plain dictionary."""
        snapshot = dict(
            pos=self._pos,
            picked=self._picked,
            assigned_to=self._assigned,
            delivered=self._delivered,
        )
        return snapshot

    def get_priority(self):
        """Priority of this package type (0 for the base type)."""
        return 0

    def on_pickup(self, pickup_reward):
        """Reward granted on pickup; the base type passes it through unchanged."""
        return pickup_reward

    def on_deliver(self, deliver_reward):
        """Reward granted on delivery; the base type passes it through unchanged."""
        return deliver_reward


In [None]:
class ExpressPackage(Package):
    """Package variant with priority 1 and scaled pickup/delivery rewards."""

    _PRIORITY = 1
    _PICKUP_SCALE = 1.5
    _DELIVER_SCALE = 1.3

    def get_priority(self):
        """Return this type's priority (1)."""
        return self._PRIORITY

    def on_pickup(self, pickup_reward):
        """Return the pickup reward multiplied by 1.5."""
        return pickup_reward * self._PICKUP_SCALE

    def on_deliver(self, deliver_reward):
        """Return the delivery reward multiplied by 1.3."""
        return deliver_reward * self._DELIVER_SCALE
    

In [None]:
class HeavyPackage(Package):
    """Package variant with priority -1 and scaled pickup/delivery rewards."""

    _PRIORITY = -1
    _PICKUP_SCALE = 1.2
    _DELIVER_SCALE = 1.5

    def get_priority(self):
        """Return this type's priority (-1)."""
        return self._PRIORITY

    def on_pickup(self, pickup_reward):
        """Return the pickup reward multiplied by 1.2."""
        return pickup_reward * self._PICKUP_SCALE

    def on_deliver(self, deliver_reward):
        """Return the delivery reward multiplied by 1.5."""
        return deliver_reward * self._DELIVER_SCALE
    

In [None]:
def init_map():
    """Build the 15 x 28 warehouse grid as a list of rows of one-character tiles.

    Tiles used: '#' border, '.' floor, '@' shelf, 'O' offload tile.
    Every row is a freshly created list.
    """
    width = 28

    def border_row():
        return ['#'] * width

    def aisle_row():
        return ['#'] + ['.'] * (width - 2) + ['#']

    def shelf_row():
        # Two floor tiles, then four 4-wide shelf blocks each followed by one
        # floor tile, then four more floor tiles before the right border.
        row = ['#', '.', '.']
        for _ in range(4):
            row += ['@'] * 4 + ['.']
        return row + ['.'] * 4 + ['#']

    def offload_row():
        # Same as an aisle, except the last inner tile is the offload tile.
        return ['#'] + ['.'] * (width - 3) + ['O', '#']

    builders = {
        'b': border_row,
        'a': aisle_row,
        's': shelf_row,
        'o': offload_row,
    }
    # Top-to-bottom row kinds of the warehouse.
    layout = "bassassossassab"

    tok_map = [builders[kind]() for kind in layout]
    return tok_map

In [None]:

def manhattan(a, b):
    """Manhattan (L1) distance between two 2-element iterables (x, y)."""
    (x1, y1), (x2, y2) = a, b
    dx = abs(x1 - x2)
    dy = abs(y1 - y2)
    return dx + dy



In [None]:
## Warehouse environment class (multi-agent; two agents by default)



class WareHouseEnv(Env):
    """Grid-world warehouse in which several agents fetch and deliver packages.

    Each free agent is assigned one package, walks next to it to pick it up
    and then walks next to the offload tile to deliver it.  One joint action
    (one discrete move per agent) is applied per step and the per-agent
    rewards are summed into a single scalar.

    Actions (per agent): 0 = row - 1, 1 = row + 1, 2 = col + 1, 3 = col - 1.

    Observation (per agent, concatenated over all agents):
        row, col, goal_row, goal_col, carrying_flag,
        blocked_up, blocked_down, blocked_left, blocked_right,
        then (d_row, d_col) to every other agent.

    Randomness comes from gymnasium's ``self.np_random`` generator, so
    ``reset(seed=...)`` makes spawn positions and package placement
    reproducible.
    """

    def __init__(
        self,
        start_pos=(4, 24),
        move_length=200,
        step_cost=-0.01,
        collision_penalty=-1.0,
        pickup_reward=+20.0,
        deliver_reward=+30.0,
        completed_reward=+60.0,
        max_num_packages=4,
        agent_num=2,
        num_of_express_pkg = 1,
        num_of_heavy_pkg=1,
    ):
        """Create the environment.

        Args:
            start_pos: (row, col) centre of the 3x3 area in which agents spawn.
            move_length: number of steps before an episode is truncated.
            step_cost: reward added to every agent on every step.
            collision_penalty: reward added when an agent walks into an
                obstacle or the map edge (doubled for walking into an agent).
            pickup_reward: base reward for picking up a package.
            deliver_reward: base reward for delivering a package.
            completed_reward: stored on the instance; NOTE it is not applied
                anywhere in ``step`` at the moment.
            max_num_packages: upper bound on packages per episode.
            agent_num: number of agents.
            num_of_express_pkg: how many of the packages are express.
            num_of_heavy_pkg: how many of the packages are heavy.
        """
        # MAP (loaded first so the spaces can be sized from it)
        self._map = init_map()
        self._n_rows = len(self._map)
        self._n_cols = len(self._map[0])
        self._obstacles = {"#", "@", "O", "%"}

        # ACTION SPACE: one of four moves per agent
        self.action_space = MultiDiscrete([4] * agent_num)

        # OBS SPACE: nine own features per agent plus one (d_row, d_col) pair
        # for each *other* agent, so the size matches what _obs() produces for
        # any agent_num (for two agents this is the original 11-per-agent).
        max_row = self._n_rows - 1
        max_col = self._n_cols - 1
        n_others = max(agent_num - 1, 0)
        low_one = np.array(
            [
                0, 0,        # ax, ay
                0, 0,        # gx, gy
                0,           # carrying_flag
                0, 0, 0, 0,  # blocked flags
            ] + [-max_row, -max_col] * n_others,
            dtype=np.float32,
        )
        high_one = np.array(
            [
                max_row, max_col,
                max_row, max_col,
                1,
                1, 1, 1, 1,
            ] + [max_row, max_col] * n_others,
            dtype=np.float32,
        )
        self.observation_space = Box(
            low=np.tile(low_one, agent_num),
            high=np.tile(high_one, agent_num),
            dtype=np.float32,
        )

        # Agents
        self._agent_num = agent_num
        self._start_pos = np.array(start_pos, dtype=np.int32)
        self._agents = [Agent(pos) for pos in self._sample_spawn_positions()]

        # Rewards
        self._step_cost = step_cost
        self._collision_penalty = collision_penalty
        self._pickup_reward = pickup_reward
        self._deliver_reward = deliver_reward
        self._completed_reward = completed_reward

        # Episode state
        self._max_move_length = move_length
        self._move_length = move_length
        self._packages = []
        self._max_num_packages = max_num_packages
        self._num_of_express_pkg = num_of_express_pkg
        self._num_of_heavy_pkg = num_of_heavy_pkg

        # Locations
        self._offload_positions = self._find_tiles("O")
        self._shelf_pos = self._find_tiles("@")
        self._drop_zone = self._offload_positions[0]

        # Rendering surface; created by init_pygame() (render() calls it lazily)
        self._screen = None

        # spawn initial packages
        self._spawn_initial_packages()

    # ------------------------------------------------------
    @staticmethod
    def _grid_distance(a, b):
        """Manhattan distance between two (row, col) positions as a Python int."""
        return int(abs(a[0] - b[0]) + abs(a[1] - b[1]))

    # ------------------------------------------------------
    def _sample_spawn_positions(self):
        """Pick one spawn cell per agent from the 3x3 block around the start.

        Only in-bounds, non-obstacle cells are used, and agents get distinct
        cells whenever enough cells are available, so two agents never start
        on top of each other.
        """
        start_row, start_col = (int(v) for v in self._start_pos)
        candidates = [
            (r, c)
            for r in range(start_row - 1, start_row + 2)
            for c in range(start_col - 1, start_col + 2)
            if 0 <= r < self._n_rows and 0 <= c < self._n_cols
            and self._map[r][c] not in self._obstacles
        ]
        if not candidates:
            # Nothing walkable around the start: fall back to the start itself.
            candidates = [(start_row, start_col)]

        distinct = len(candidates) >= self._agent_num
        picks = self.np_random.choice(
            len(candidates), size=self._agent_num, replace=not distinct
        )
        return [np.array(candidates[int(k)], dtype=np.int32) for k in picks]

    # ------------------------------------------------------
    def _spawn_initial_packages(self):
        """Reset the package list and spawn the normal/express/heavy packages."""
        self._packages = []
        n_normal = (
            self._max_num_packages
            - self._num_of_express_pkg
            - self._num_of_heavy_pkg
        )
        for _ in range(n_normal):
            self._spawn_goals()
        for _ in range(self._num_of_express_pkg):
            self._spawn_goals("express")
        for _ in range(self._num_of_heavy_pkg):
            self._spawn_goals("heavy")

    # ------------------------------------------------------
    def init_pygame(self):
        """Initialise pygame, the colour table and the display window."""
        pygame.init()
        self._tile_size = 24
        self._colors = {
            "empty": (240, 240, 240),
            "wall": (80, 80, 80),
            "shelf": (139, 69, 19),
            "offload": (0, 120, 255),
            "normal_package": (255, 165, 0),
            "express_package": (255,0,0),
            "heavy_package": (128,128,128),
            "agent": (0, 200, 0),
        }
        self._screen = pygame.display.set_mode(
            (self._tile_size * self._n_cols, self._tile_size * self._n_rows)
        )
        pygame.display.set_caption("Warehouse Environment")

    # ------------------------------------------------------
    def _find_tiles(self, tile_char):
        """Return the (row, col) of every map tile equal to ``tile_char``."""
        return [
            (i, j)
            for i, row in enumerate(self._map)
            for j, c in enumerate(row)
            if c == tile_char
        ]

    # ------------------------------------------------------
    def _assign_packages(self):
        """
        Assign each free agent exactly one package.
        Deterministic tie-breaking:
          - higher priority first
          - if tie, closer distance first
          - stable agent ordering
        """
        # An agent is free when it carries nothing and has no pending assignment.
        free_agents = [
            i for i, agent in enumerate(self._agents)
            if agent._carrying is None and not any(
                p._assigned == i and not p._picked and not p._delivered
                for p in self._packages
            )
        ]
        if not free_agents:
            return

        unassigned_pkgs = [
            idx for idx, p in enumerate(self._packages)
            if (not p._picked) and (not p._delivered) and (p._assigned is None)
        ]

        # For each agent in index order, assign the best remaining package.
        for a_idx in free_agents:
            if not unassigned_pkgs:
                return

            agent_pos = self._agents[a_idx]._pos

            def rank(p_idx):
                pkg = self._packages[p_idx]
                # Higher priority wins, then shorter distance, then larger index.
                return (
                    pkg.get_priority(),
                    -self._grid_distance(pkg._pos, agent_pos),
                    p_idx,
                )

            best_idx = max(unassigned_pkgs, key=rank)
            self._packages[best_idx]._assigned = a_idx
            unassigned_pkgs.remove(best_idx)

    # ------------------------------------------------------
    def _get_current_goal(self, i):
        """Return agent ``i``'s target cell, or None when it has none.

        A carrying agent targets the drop zone; otherwise the target is the
        position of the first package assigned to it.
        """
        agent = self._agents[i]

        if agent._carrying is not None:
            return self._drop_zone

        for pkg in self._packages:
            if pkg._assigned == i:
                return pkg._pos

        return None

    # ------------------------------------------------------
    def _shape_reward(self, prev_state, reward, agent_index):
        """Add 0.01 per cell of progress towards the agent's current goal."""
        goal = self._get_current_goal(agent_index)
        if goal is None:
            return reward

        old_dist = self._grid_distance(prev_state, goal)
        new_dist = self._grid_distance(self._agents[agent_index]._pos, goal)

        return reward + 0.01 * (old_dist - new_dist)

    # ------------------------------------------------------
    def _spawn_goals(self, type=None):
        """Place one new package on a random free shelf tile.

        ``type`` selects the package class: "express", "heavy" or anything
        else for a normal package.  Nothing happens when the package limit is
        reached or no shelf is free.  (The parameter name shadows the builtin
        but is kept for compatibility with existing callers.)
        """
        if len(self._packages) >= self._max_num_packages:
            return

        # Shelves holding a package that has not been picked up yet are taken.
        existing = {tuple(p._pos) for p in self._packages if not getattr(p, "_picked", False)}
        free_shelves = [s for s in self._shelf_pos if tuple(s) not in existing]
        if len(free_shelves) == 0:
            return

        pos = free_shelves[int(self.np_random.integers(len(free_shelves)))]

        if type == "express":
            new_pkg = ExpressPackage(pos)
        elif type == "heavy":
            new_pkg = HeavyPackage(pos)
        else:
            new_pkg = Package(pos)

        self._packages.append(new_pkg)

    # ------------------------------------------------------
    def reset(self, seed=None, options=None):
        """Start a new episode and return ``(observation, info)``.

        ``seed`` is forwarded to gymnasium, which re-creates ``self.np_random``
        from it; agent spawn cells and package positions are then drawn from
        that generator.  ``info["packages"]`` holds a dict per package.
        """
        # Must run first: it (re)seeds the generator everything below draws from.
        super().reset(seed=seed)

        for agent, pos in zip(self._agents, self._sample_spawn_positions()):
            agent._pos = pos
            agent._carrying = None
            agent._prev_state = agent._pos.copy()
            agent._prev_prev_state = None
            agent._prev_goal_dists = []

        self._move_length = self._max_move_length
        self._spawn_initial_packages()

        return self._obs(), {"packages": [p.copy() for p in self._packages]}

    # ------------------------------------------------------
    def _obs(self):
        """Build the flat float32 observation vector for all agents."""
        obs_list = []

        for i in range(self._agent_num):
            agent = self._agents[i]
            ax, ay = agent._pos

            goal = self._get_current_goal(i)
            if goal is None:
                # No goal: report the agent's own cell as the goal.
                gx, gy = ax, ay
            else:
                gx, gy = goal

            carrying_flag = 1 if agent._carrying is not None else 0

            blocked_up = 1 if (ax - 1 < 0 or self._map[ax - 1][ay] in self._obstacles) else 0
            blocked_down = 1 if (ax + 1 >= self._n_rows or self._map[ax + 1][ay] in self._obstacles) else 0
            blocked_left = 1 if (ay - 1 < 0 or self._map[ax][ay - 1] in self._obstacles) else 0
            blocked_right = 1 if (ay + 1 >= self._n_cols or self._map[ax][ay + 1] in self._obstacles) else 0

            other_rel = []
            for j, other in enumerate(self._agents):
                if i == j:
                    continue
                ox, oy = other._pos
                other_rel.extend([ox - ax, oy - ay])

            obs_list.extend([
                ax, ay,
                gx, gy,
                carrying_flag,
                blocked_up, blocked_down, blocked_left, blocked_right,
                *other_rel
            ])

        return np.array(obs_list, dtype=np.float32)

    # ------------------------------------------------------
    def step(self, actions):
        """Apply one joint action.

        Args:
            actions: one discrete action per agent (see the class docstring).

        Returns:
            ``(observation, total_reward, terminated, truncated, info)`` where
            ``total_reward`` is the sum of the per-agent rewards,
            ``terminated`` is True once every package is delivered and nobody
            is carrying one, and ``truncated`` is True when the step budget
            is used up.
        """
        rewards = [0.0] * self._agent_num

        # always update package assignments
        self._assign_packages()

        # Update agent history once (prev_prev <- prev <- current)
        for agent in self._agents:
            agent._prev_prev_state = None if agent._prev_state is None else agent._prev_state.copy()
            agent._prev_state = agent._pos.copy()

        # PHASE 1: propose movement and resolve conflicts
        proposed_positions, blocked_flags = self._propose_moves(actions, rewards)

        # PHASE 2: update positions
        for agent, target in zip(self._agents, proposed_positions):
            agent._pos = np.array(target, dtype=np.int32)

        # PHASE 3-5: pickup, delivery, shaping
        self._handle_pickups(rewards)
        self._handle_deliveries(rewards)
        self._apply_shaping(rewards, blocked_flags)

        # termination logic
        self._move_length -= 1

        all_delivered = all(getattr(p, "_delivered", False) for p in self._packages)
        terminated = all_delivered and all(a._carrying is None for a in self._agents)
        truncated = self._move_length <= 0

        total_reward = float(sum(rewards))
        return self._obs(), total_reward, terminated, truncated, {}

    # ------------------------------------------------------
    def _propose_moves(self, actions, rewards):
        """Compute each agent's next cell; returns (positions, blocked_flags).

        Adds collision penalties and the step cost to ``rewards`` in place.
        """
        n = self._agent_num
        current = [tuple(int(v) for v in agent._pos) for agent in self._agents]
        deltas = {0: (-1, 0), 1: (1, 0), 2: (0, 1), 3: (0, -1)}

        proposed_positions = []
        blocked_flags = [False] * n

        for i, (ax, ay) in enumerate(current):
            # Unknown action values leave the agent where it is.
            dx, dy = deltas.get(int(actions[i]), (0, 0))
            nx, ny = ax + dx, ay + dy

            out_of_bounds = not (0 <= nx < self._n_rows and 0 <= ny < self._n_cols)
            if out_of_bounds or self._map[nx][ny] in self._obstacles:
                rewards[i] += self._collision_penalty
                nx, ny = ax, ay
                blocked_flags[i] = True

            # Cells currently occupied by another agent cannot be entered.
            if any((nx, ny) == current[j] for j in range(n) if j != i):
                rewards[i] += self._collision_penalty * 2.0
                nx, ny = ax, ay
                blocked_flags[i] = True

            rewards[i] += self._step_cost
            proposed_positions.append((nx, ny))

        # Agents that would trade places are both held back.  No extra penalty
        # is applied here: penalising agent-agent contact hurt learning.
        swap_block = [False] * n
        for i in range(n):
            for j in range(i + 1, n):
                if current[i] == proposed_positions[j] and current[j] == proposed_positions[i]:
                    swap_block[i] = True
                    swap_block[j] = True
        for i in range(n):
            if swap_block[i]:
                proposed_positions[i] = current[i]
                blocked_flags[i] = True

        # Two agents may still aim for the same free cell in the same step.
        # The lowest-index mover keeps it; later ones stay put (again without
        # a collision penalty) so agents never end up sharing a cell.
        claimed = {pos for i, pos in enumerate(proposed_positions) if pos == current[i]}
        for i in range(n):
            if proposed_positions[i] == current[i]:
                continue
            if proposed_positions[i] in claimed:
                proposed_positions[i] = current[i]
                blocked_flags[i] = True
            else:
                claimed.add(proposed_positions[i])

        return proposed_positions, blocked_flags

    # ------------------------------------------------------
    def _handle_pickups(self, rewards):
        """Let every empty-handed agent pick up the nearest package within 1 cell."""
        for i, agent in enumerate(self._agents):
            if agent._carrying is not None:
                continue

            closest_idx = None
            closest_dist = float("inf")
            for idx, pkg in enumerate(self._packages):
                if pkg._picked or pkg._delivered:
                    continue
                d = self._grid_distance(pkg._pos, agent._pos)
                if d <= 1 and d < closest_dist:
                    closest_dist = d
                    closest_idx = idx

            if closest_idx is not None:
                pkg = self._packages[closest_idx]
                pkg._picked = True
                pkg._assigned = None
                agent._carrying = pkg
                rewards[i] += pkg.on_pickup(self._pickup_reward)

    # ------------------------------------------------------
    def _handle_deliveries(self, rewards):
        """Deliver the carried package of every agent within 1 cell of the drop zone."""
        for i, agent in enumerate(self._agents):
            carried = agent._carrying
            if carried is None:
                continue

            if self._grid_distance(agent._pos, self._drop_zone) <= 1:
                carried._delivered = True
                agent._carrying = None
                # The reward comes from the package that was actually carried.
                rewards[i] += carried.on_deliver(self._deliver_reward)

    # ------------------------------------------------------
    def _apply_shaping(self, rewards, blocked_flags):
        """Apply progress shaping plus the back-tracking / stalling penalties."""
        for i, agent in enumerate(self._agents):
            rewards[i] = self._shape_reward(agent._prev_state.copy(), rewards[i], i)

            goal = self._get_current_goal(i)

            # Back at the cell occupied two steps ago while having a goal: the
            # distance to the goal is unchanged, so the agent made no progress.
            if (
                agent._prev_prev_state is not None
                and np.array_equal(agent._pos, agent._prev_prev_state)
                and goal is not None
            ):
                rewards[i] -= 0.05

            if goal is not None:
                current_dist = self._grid_distance(agent._pos, goal)
            else:
                current_dist = 0

            # Keep only the ten most recent distances.
            agent._prev_goal_dists.append(current_dist)
            if len(agent._prev_goal_dists) > 10:
                agent._prev_goal_dists.pop(0)

            # Same distance for five steps in a row counts as being stuck.
            if len(agent._prev_goal_dists) >= 5 and len(set(agent._prev_goal_dists[-5:])) == 1:
                rewards[i] -= 0.1

            if blocked_flags[i] or (goal is not None and np.array_equal(agent._pos, agent._prev_state)):
                rewards[i] -= 0.1

    # ------------------------------------------------------
    def render(self):
        """Draw the map, the waiting packages and the agents in the pygame window.

        The window is created on first use (or re-created after
        ``pygame.quit()``), so calling ``init_pygame()`` beforehand is optional.
        Closing the window quits pygame and raises ``SystemExit``.
        """
        if getattr(self, "_screen", None) is None or not pygame.display.get_init():
            self.init_pygame()

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                sys.exit()

        for i in range(self._n_rows):
            for j in range(self._n_cols):
                tile = self._map[i][j]

                if tile in {"#", "%"}:
                    color = self._colors["wall"]
                elif tile == "@":
                    color = self._colors["shelf"]
                elif tile == "O":
                    color = self._colors["offload"]
                else:
                    color = self._colors["empty"]

                pygame.draw.rect(
                    self._screen, color,
                    (j * self._tile_size, i * self._tile_size,
                     self._tile_size, self._tile_size)
                )

        for p in self._packages:
            # Picked-up packages are no longer drawn on their shelf.
            if getattr(p, "_picked", False):
                continue
            if isinstance(p, ExpressPackage):
                pkg_color = self._colors["express_package"]
            elif isinstance(p, HeavyPackage):
                pkg_color = self._colors["heavy_package"]
            else:
                pkg_color = self._colors["normal_package"]
            x, y = p._pos
            pygame.draw.rect(
                self._screen, pkg_color,
                (y * self._tile_size, x * self._tile_size,
                 self._tile_size, self._tile_size)
            )

        for i in range(self._agent_num):
            ax, ay = self._agents[i]._pos
            pygame.draw.rect(
                self._screen, self._colors["agent"],
                (ay * self._tile_size, ax * self._tile_size,
                 self._tile_size, self._tile_size)
            )

        pygame.display.flip()
        pygame.time.wait(40)

In [None]:
## Smoke-test setup: build the environment, open its pygame window, then wrap
## it in a Monitor before driving it with random actions.
base_env = WareHouseEnv()
base_env.init_pygame()
env = Monitor(base_env)

In [None]:
import time
import os

In [None]:

## Sanity check of the environment: random actions, no trained model
episodes = 1
clear_command = "cls" if os.name == "nt" else "clear"

for episode in range(1, episodes + 1):
    obs, info = env.reset()
    score = 0
    done = False

    while not done:
        # Clearing the console is optional; it makes the output look animated.
        os.system(clear_command)

        # Draw the current state before acting on it.
        env.render()

        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)

        score += reward
        done = terminated or truncated

        # Slow the loop down so the rendering can be followed.
        time.sleep(0.05)

    print(f"Episode {episode} Score {score}")

env.close()

In [None]:
pygame.quit()  # Run this after every rendering session; otherwise later render() calls misbehave.

In [None]:
# Training run, kept for the record. It takes 2,000,000 timesteps, so it is
# disabled by default and a plain "Run All" skips it; the next cell loads the
# saved model instead. Set RUN_TRAINING = True to retrain.
RUN_TRAINING = False

if RUN_TRAINING:
    log_path = os.path.join("Training", "Logs")
    model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
    model.learn(total_timesteps=2000000)

In [None]:
## Load the trained model.
## The location can be overridden with the WAREHOUSE_MODEL_PATH environment
## variable; the default is the path used on the original author's machine.
MODEL_PATH = os.environ.get(
    "WAREHOUSE_MODEL_PATH",
    r"C:\Users\Brian\Downloads\warehouse\Q_learn\Training\Saved Models\WareHouse_Model_multi_ver4_PPO.zip",
)
model = PPO.load(MODEL_PATH, env=env)

In [None]:
# Evaluation setup: rebind `env` to a fresh environment (not wrapped in Monitor)
# and open its pygame window for rendering.
env = WareHouseEnv()
env.init_pygame()

In [None]:
import time
import os

In [None]:
## Evaluate the loaded model over a few rendered episodes
episodes = 5
clear_command = "cls" if os.name == "nt" else "clear"

for episode in range(1, episodes + 1):
    obs, info = env.reset()
    score = 0
    done = False

    while not done:
        # Clearing the console is optional; it makes the output look animated.
        os.system(clear_command)

        # Draw the current state before acting on it.
        env.render()

        # Deterministic prediction: the policy's action without sampling.
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)

        score += reward
        done = terminated or truncated

        # Slow the loop down so the rendering can be followed.
        time.sleep(0.05)

    print(f"Episode {episode} Score {score}")

env.close()


In [None]:
pygame.quit()  # Shut pygame down once the evaluation episodes are finished.