In [1]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm


from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv, Overcooked
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.visualization.state_visualizer import StateVisualizer
from collections import namedtuple
import pygame
import random
from collections import deque


In [2]:
class OvercookedRewardShaping(Overcooked): # Using OvercookedGridworld if it's the base
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prev_agent_obs = [None, None] # To store observations from previous step for both agents

    def step(self, actions):
        observation, base_reward, done, info = super().step(actions)

        # Calculate shaped reward for each agent
        shaped_reward_total = 0
        current_agent_obs = observation['both_agent_obs']

        # Ensure prev_agent_obs is initialized for the first step
        if self.prev_agent_obs[0] is None:
            self.prev_agent_obs = current_agent_obs

        for i, obs in enumerate(current_agent_obs):
            # Pass current and previous observation for this agent
            shaped_reward_total += self._compute_agent_shaping(obs, self.prev_agent_obs[i])

        # Update previous observations for the next step
        self.prev_agent_obs = current_agent_obs

        final_shaped_reward = base_reward + shaped_reward_total

        #Optional: Print base reward only if it's non-zero for clarity
        if base_reward != 0:
             print(f"Soup delivered!")

        return observation, base_reward,final_shaped_reward, done, info

    def _compute_agent_shaping(self, current_obs, prev_obs):
        shaping = 0.0

        ONION_IDX = 0
        SOUP_IDX = 1
        DISH_IDX = 2
        TOMATO_IDX = 3


        POT_EMPTY_IDX = 0
        POT_FULL_IDX = 1
        POT_COOKING_IDX = 2
        POT_READY_IDX = 3


        prev_holding_vector = prev_obs[4:8]
        current_holding_vector = current_obs[4:8]

        # Was not holding anything, now holding an ingredient or empty dish
        if prev_holding_vector.sum() == 0 and current_holding_vector.sum() == 1:
            if current_holding_vector[ONION_IDX] == 1 or current_holding_vector[TOMATO_IDX] == 1:
                shaping += 0.05 # Reward for picking up an ingredient
            elif current_holding_vector[DISH_IDX] == 1:
                shaping += 0.02 # Reward for picking up an empty dish

        prev_pot_onions = prev_obs[27:28][0] # Assuming single value
        current_pot_onions = current_obs[27:28][0]
        prev_pot_tomatoes = prev_obs[28:29][0]
        current_pot_tomatoes = current_obs[28:29][0]

        # Check if ingredient count in closest pot increased
        if current_pot_onions > prev_pot_onions:
            shaping += 0.1 # Reward for adding onion
        if current_pot_tomatoes > prev_pot_tomatoes:
            shaping += 0.1 # Reward for adding tomato



        prev_pot_states = prev_obs[23:27]
        current_pot_states = current_obs[23:27]

        # Transition from full to cooking
        if prev_pot_states[POT_FULL_IDX] == 1 and current_pot_states[POT_COOKING_IDX] == 1:
            shaping += 0.2 # Reward for starting to cook

        # Transition from cooking to ready
        if prev_pot_states[POT_COOKING_IDX] == 1 and current_pot_states[POT_READY_IDX] == 1:
            shaping += 0.3 # Reward for soup becoming ready


        if prev_pot_states[POT_READY_IDX] == 1 and current_holding_vector[SOUP_IDX] == 1 and prev_holding_vector[SOUP_IDX] == 0:
            shaping += 0.25 # Reward for picking up a ready soup


        current_dx_serving = abs(current_obs[16:17][0]) # abs(dx)
        current_dy_serving = abs(current_obs[17:18][0]) # abs(dy)
        prev_dx_serving = abs(prev_obs[16:17][0])
        prev_dy_serving = abs(prev_obs[17:18][0])

        current_dist_serving = current_dx_serving + current_dy_serving # Manhattan distance
        prev_dist_serving = prev_dx_serving + prev_dy_serving

        if current_holding_vector[SOUP_IDX] == 1 and current_dist_serving < prev_dist_serving:
            shaping += 0.01 # Small continuous reward for moving towards serving



        return shaping

**Initialization**

In [29]:
Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)



class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer4 = nn.Linear(64, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer4(x)

class OvercookedDQN:
    def __init__(
            self,
            layout_name,
            model_DQN,
            gamma,  # Discount factor
            lr_model,
            epochs,  # Number of optimization epochs
            batch_size,
            optimizer_class,
            epsilon_decay,
            epsilon_start,
            epsilon_end,
            TAU):

        self.gamma = gamma
        self.lr_model = lr_model
        self.epochs = epochs
        self.batch_size = batch_size
        self.TAU = TAU
        self.epsilon_decay = epsilon_decay
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        base_mdp = OvercookedGridworld.from_layout_name(layout_name)  # or other layout
        base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=400)
        #self.env = env = Overcooked(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)
        self.env  = OvercookedRewardShaping(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)

        self.n_possible_action = self.env.action_space.n

        dummy_state = self.env.reset()
        dummy_obs_agent0 = dummy_state['both_agent_obs'][0]
        state_input_size = len(dummy_obs_agent0)  # Assuming the featurized state is a flat vector

        print(state_input_size)

        self.policy_net = model_DQN(state_input_size, self.n_possible_action).to(self.device)
        self.target_net = DQN(state_input_size, self.n_possible_action).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.model_optimizer = optimizer_class(self.policy_net.parameters(), lr=lr_model)

        self.memory = ReplayMemory(100000)
        self.steps_done = 0
        self.episode_durations = []

    def select_action(self,state):
        sample = random.random()
        eps_threshold = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.steps_done / self.epsilon_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) will return the largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state).max(1).indices.view(1, 1)
        else:
            return torch.tensor([[self.env.action_space.sample()]], device=self.device, dtype=torch.long)


    def optimize_model(self,agent):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        agent_states = [s[agent] for s in batch.state]
        agent_actions = [a[agent] for a in batch.action]
        agent_next_states = [ns[agent] for ns in batch.next_state]


        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              agent_next_states)), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in agent_next_states
                                                    if s is not None])
        state_batch = torch.cat(agent_states)
        action_batch = torch.cat(agent_actions)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1).values
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        with torch.no_grad():
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1).values
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        # Compute Huber loss
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.model_optimizer.zero_grad()
        loss.backward()
        # In-place gradient clipping
        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
        self.model_optimizer.step()
        return loss.item()



    def trainingLoop(self, max_episodes):
        for i_episode in range(max_episodes):
            rewards = 0
            # Initialize the environment and get its state
            state = self.env.reset()
            state0 = torch.tensor(state['both_agent_obs'][0], dtype=torch.float32, device=self.device).unsqueeze(0)
            state1 = torch.tensor(state['both_agent_obs'][1], dtype=torch.float32, device=self.device).unsqueeze(0)
            for t in range(2048):
                action0 = self.select_action(state0)
                action1 = self.select_action(state1)
                obs, reward, shaped_reward, done, info = self.env.step((action0.item(),action1.item()))
                done = done

                reward += shaped_reward

                if (state['both_agent_obs'][0] == obs['both_agent_obs'][0]).all() or (state['both_agent_obs'][1] == obs['both_agent_obs'][1]).all():
                    reward += -0.1 * t

                reward = torch.tensor([reward], device=self.device)

                if done:
                    next_state0 = None
                    next_state1 = None
                else:
                    next_state0 = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32, device=self.device).unsqueeze(0)
                    next_state1 = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32, device=self.device).unsqueeze(0)

                # Store the transition in memory
                rewards = rewards + reward.item()
                self.memory.push((state0, state1), (action0,action1), (next_state0,next_state1), reward)

                # Move to the next state
                state0 = next_state0
                state1 = next_state1

                # Perform one step of the optimization (on the policy network)
                loss = self.optimize_model(0)
                loss = self.optimize_model(1)

                # Soft update of the target network's weights
                # θ′ ← τ θ + (1 −τ )θ′
                target_net_state_dict = self.target_net.state_dict()
                policy_net_state_dict = self.policy_net.state_dict()
                for key in policy_net_state_dict:
                    target_net_state_dict[key] = policy_net_state_dict[key]*self.TAU + target_net_state_dict[key]*(1-self.TAU)
                self.target_net.load_state_dict(target_net_state_dict)

                if done:
                    self.episode_durations.append(t + 1)
                    if i_episode % 10 == 0:
                        print(f"Episode: {i_episode + 1}, DQN loss : {loss}, total reward {rewards}")


                    break

        print('Complete')

    def testVisualize(self,print_action=False):
        pygame.init()
        visualizer = StateVisualizer()

        # 2) Grab your grid and do one dummy render to get a surface
        grid = self.env.base_env.mdp.terrain_mtx
        _ = self.env.reset()
        surf = visualizer.render_state(self.env.base_env.state, grid=grid)

        # 3) Use that surface’s size for your window
        win_w, win_h = surf.get_size()
        screen = pygame.display.set_mode((win_w, win_h), pygame.RESIZABLE)
        clock = pygame.time.Clock()

        running = True
        obs = self.env.reset()  #observation of the starting state
        soup_delivered = 0

        total_rewards = []
        while running:
            for ev in pygame.event.get():
                if ev.type == pygame.QUIT:
                    running = False

            self.policy_net.eval()  # Set model to evaluation mode
            with torch.no_grad():
                if print_action:
                    print(f"osservazione iniziale: {obs['both_agent_obs'][0]}")
                state0_tensor = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(self.device)
                state1_tensor = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(self.device)

                logits0 = self.policy_net(state0_tensor)
                logits1 = self.policy_net(state1_tensor)

                action0 = torch.argmax(logits0, dim=1).item()
                action1 = torch.argmax(logits1, dim=1).item()

                if print_action:
                    print(action0, action1)
                # try to step; if episode is over, catch and reset
                try:
                    # Overcooked wrapper returns (obs_p0, obs_p1, reward, done, info)
                    obs, reward,shaped_reward, done, info = self.env.step((action0, action1))

                    if(reward):
                        soup_delivered += 1


                except AssertionError:
                    # base_env.is_done() was True → reset and continue
                    self.env.reset()
                    break

                # render the new state
                surf = visualizer.render_state(self.env.base_env.state, grid=grid)

                # draw it
                screen.blit(surf, (0, 0))
                pygame.display.flip()

                clock.tick(15)  # cap at 30 FPS

        pygame.quit()

        print(f"Soup delivered: {soup_delivered}")


# Test the trained agent
def test_agent(agente, num_episodes=1000):
    """Test agent performance without learning or exploration."""
    total_rewards = []
    average_rewards = []

    for episode in tqdm(range(num_episodes)):
        obs = agente.env.reset()
        episode_reward = 0
        done = False
        i = 0

        while not done:

            state0_tensor = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(agente.device)
            state1_tensor = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(agente.device)

            logits0 = agente.policy_net(state0_tensor)
            logits1 = agente.policy_net(state1_tensor)

            action0 = torch.argmax(logits0, dim=1).item()
            action1 = torch.argmax(logits1, dim=1).item()

            obs, reward,shaped_reward, done, info = agente.env.step((action0, action1))

            episode_reward += reward
            i = i+1

        average_rewards.append(episode_reward/i)
        total_rewards.append(episode_reward)
        print(f"average reward in episode {episode} : {episode_reward/i}")


    average_reward = np.mean(total_rewards)

    print(f"Test Results over {num_episodes} episodes:")
    print(f"Average Reward: {average_reward:.3f}")
    print(f"Standard Deviation: {np.std(total_rewards):.3f}")




In [30]:
DQNO = OvercookedDQN("cramped_room",model_DQN =DQN,gamma = 0.99,lr_model = 0.001,epochs = 10,batch_size = 128,optimizer_class = torch.optim.Adam,TAU=0.005,epsilon_decay=50000,epsilon_start=0.9,epsilon_end=0.05)

Using device: cuda
96


In [31]:
DQNO.trainingLoop(max_episodes=400)

Episode: 1, DQN loss : 6.794896034989506e-05, total reward 9.040000103414059
Episode: 11, DQN loss : 0.04972268268465996, total reward 9.770000156015158
Soup delivered!
Soup delivered!
Episode: 21, DQN loss : 0.2296043336391449, total reward 8.970000121742487
Episode: 31, DQN loss : 0.2349710911512375, total reward 33.45000047981739
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Episode: 41, DQN loss : 0.36925172805786133, total reward 68.32000045105815
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Episode: 51, DQN loss : 0.42138397693634033, total reward 84.29000050574541
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
S

In [34]:
test_agent(DQNO,num_episodes=10)

  0%|          | 0/10 [00:00<?, ?it/s]

Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 10%|█         | 1/10 [00:01<00:09,  1.10s/it]

Soup delivered!
Soup delivered!
average reward in episode 0 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 20%|██        | 2/10 [00:02<00:08,  1.03s/it]

Soup delivered!
Soup delivered!
average reward in episode 1 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 30%|███       | 3/10 [00:03<00:07,  1.01s/it]

Soup delivered!
average reward in episode 2 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
average reward in episode 3 : 0.55


 40%|████      | 4/10 [00:04<00:05,  1.02it/s]

Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 50%|█████     | 5/10 [00:04<00:04,  1.02it/s]

Soup delivered!
Soup delivered!
average reward in episode 4 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 60%|██████    | 6/10 [00:05<00:03,  1.03it/s]

Soup delivered!
average reward in episode 5 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 70%|███████   | 7/10 [00:07<00:03,  1.00s/it]

Soup delivered!
average reward in episode 6 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


 80%|████████  | 8/10 [00:07<00:01,  1.00it/s]

average reward in episode 7 : 0.55
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
average reward in episode 8 : 0.55


 90%|█████████ | 9/10 [00:08<00:00,  1.02it/s]

Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!


100%|██████████| 10/10 [00:09<00:00,  1.01it/s]

Soup delivered!
Soup delivered!
average reward in episode 9 : 0.55
Test Results over 10 episodes:
Average Reward: 220.000
Standard Deviation: 0.000





In [35]:
DQNO.testVisualize()

Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered: 11
