In [1]:
!git clone https://github.com/HumanCompatibleAI/overcooked_ai.git
!gdown --id 1APcOaFTbxT6_JrCMeoqxZMECWEEILoSb

Cloning into 'overcooked_ai'...
remote: Enumerating objects: 7950, done.[K
remote: Total 7950 (delta 0), reused 0 (delta 0), pack-reused 7950 (from 1)[K
Receiving objects: 100% (7950/7950), 524.63 MiB | 38.11 MiB/s, done.
Resolving deltas: 100% (4549/4549), done.
Updating files: 100% (402/402), done.
Downloading...
From: https://drive.google.com/uc?id=1APcOaFTbxT6_JrCMeoqxZMECWEEILoSb
To: /kaggle/working/DQNPolicy_net.pth
100%|██████████████████████████████████████| 88.1k/88.1k [00:00<00:00, 91.7MB/s]


In [2]:
!uv sync --project overcooked_ai/

Using CPython 3.10.12 interpreter at: [36m/usr/bin/python3.10[39m
Creating virtual environment at: [36movercooked_ai/.venv[39m
[2K[2mResolved [1m137 packages[0m [2min 4.01s[0m[0m                                       [0m
[2K   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m       
[2K[1A   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m[1A
[37m⠙[0m [2mPreparing packages...[0m (0/32)
[2K[2A   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m       [2A
[37m⠙[0m [2mPreparing packages...[0m (0/32)
[2K[2A   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m       [2A
[37m⠙[0m [2mPreparing packages...[0m (0/32)
[2mptyprocess[0m [32m[2m------------------------------[0m[0m     0 B/13.67 KiB
[2K[3A   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m       [3A
[37

In [3]:
import sys
sys.path.append("/kaggle/working/overcooked_ai/src")

In [4]:
import numpy as np
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm


from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv, Overcooked
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.visualization.state_visualizer import StateVisualizer
from collections import namedtuple
import pygame
import random
from collections import deque


In [5]:
class OvercookedRewardShaping(Overcooked):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prev_agent_obs = [None, None] 

    def step(self, actions):
        observation, base_reward, done, info = super().step(actions)

        shaped_reward_total = 0
        current_agent_obs = observation['both_agent_obs']

        if self.prev_agent_obs[0] is None:
            self.prev_agent_obs = current_agent_obs

        for i, obs in enumerate(current_agent_obs):
            shaped_reward_total += self._compute_agent_shaping(obs, self.prev_agent_obs[i])

        self.prev_agent_obs = current_agent_obs

        if base_reward != 0:
             print(f"Soup delivered!")

        return observation, base_reward,shaped_reward_total, done, info

    def _compute_agent_shaping(self, current_obs, prev_obs):
        shaping = 0.0

        onion_idx = 0
        soup_idx = 1
        dish_idx = 2
        tomato_idx = 3

        pot_full_idx = 1
        pot_cooking_idx = 2
        pot_ready_idx = 3

        prev_holding_vector = prev_obs[4:8]
        current_holding_vector = current_obs[4:8]

        if prev_holding_vector.sum() == 0 and current_holding_vector.sum() == 1:
            if current_holding_vector[onion_idx] == 1 or current_holding_vector[tomato_idx] == 1:
                shaping += 0.05 
            elif current_holding_vector[dish_idx] == 1:
                shaping += 0.02 

        prev_pot_onions = prev_obs[27:28][0] 
        current_pot_onions = current_obs[27:28][0]
        prev_pot_tomatoes = prev_obs[28:29][0]
        current_pot_tomatoes = current_obs[28:29][0]

        if current_pot_onions > prev_pot_onions:
            shaping += 0.1 
        if current_pot_tomatoes > prev_pot_tomatoes:
            shaping += 0.1 

        prev_pot_states = prev_obs[23:27]
        current_pot_states = current_obs[23:27]

        if prev_pot_states[pot_full_idx] == 1 and current_pot_states[pot_cooking_idx] == 1:
            shaping += 0.2 

        if prev_pot_states[pot_cooking_idx] == 1 and current_pot_states[pot_ready_idx] == 1:
            shaping += 0.3

        if prev_pot_states[pot_ready_idx] == 1 and current_holding_vector[soup_idx] == 1 and prev_holding_vector[soup_idx] == 0:
            shaping += 0.25 


        current_dx_serving = abs(current_obs[16:17][0])
        current_dy_serving = abs(current_obs[17:18][0]) 
        prev_dx_serving = abs(prev_obs[16:17][0])
        prev_dy_serving = abs(prev_obs[17:18][0])

        current_dist_serving = current_dx_serving + current_dy_serving 
        prev_dist_serving = prev_dx_serving + prev_dy_serving

        if current_holding_vector[soup_idx] == 1 and current_dist_serving < prev_dist_serving:
            shaping += 0.01

        return shaping

**Initialization**

In [6]:
Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)



class DQN(nn.Module):

    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        self.layer1 = nn.Linear(n_observations, 128)
        self.layer2 = nn.Linear(128, 64)
        self.layer3 = nn.Linear(64, n_actions)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

class OvercookedDQN:
    def __init__(
            self,
            layout_names,
            model_DQN,
            gamma,  
            lr_model,
            epochs,  
            batch_size,
            optimizer_class,
            epsilon_decay,
            epsilon_start,
            epsilon_end,
            TAU):

        self.gamma = gamma
        self.lr_model = lr_model
        self.epochs = epochs
        self.batch_size = batch_size
        self.TAU = TAU
        self.epsilon_decay = epsilon_decay
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")
        self.envs = []

        for layout in layout_names:
            base_mdp = OvercookedGridworld.from_layout_name(layout)  
            base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=500)
            self.envs.append(OvercookedRewardShaping(base_env=base_env, featurize_fn=base_env.featurize_state_mdp))

        self.env_number = 0
        self.env = self.envs[self.env_number]
        self.n_possible_action = self.env.action_space.n

        dummy_state = self.env.reset()
        dummy_obs_agent0 = dummy_state['both_agent_obs'][0]
        state_input_size = len(dummy_obs_agent0)  

        print(state_input_size)

        self.policy_net = model_DQN(state_input_size, self.n_possible_action).to(self.device)
        self.target_net = DQN(state_input_size, self.n_possible_action).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        self.model_optimizer = optimizer_class(self.policy_net.parameters(), lr=lr_model)

        self.memory = ReplayMemory(100000)
        self.steps_done = 0
        self.episode_durations = []

    def next_env(self):
        if self.env_number == len(self.envs) -1:
            return False
        else:
            self.env_number =+ 1
            self.env = self.envs[self.env_number]
            self.steps_done = 0
            return True
        
        
    
    def select_action(self,state):
        sample = random.random()
        eps_threshold = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1. * self.steps_done / self.epsilon_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_net(state).max(1).indices.view(1, 1)
        else:
            return torch.tensor([[self.env.action_space.sample()]], device=self.device, dtype=torch.long)


    def optimize_model(self,agent):
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        agent_states = [s[agent] for s in batch.state]
        agent_actions = [a[agent] for a in batch.action]
        agent_next_states = [ns[agent] for ns in batch.next_state]

        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              agent_next_states)), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in agent_next_states
                                                    if s is not None])
        state_batch = torch.cat(agent_states)
        action_batch = torch.cat(agent_actions)
        reward_batch = torch.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        with torch.no_grad():
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1).values
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch

        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

        self.model_optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
        self.model_optimizer.step()
        return loss.item()



    def trainingLoop(self, max_episodes):
        changed = True
        for i_episode in range(max_episodes):
            if not changed:
                break
            
            soup_delivered = 0
            rewards = 0
            state = self.env.reset()
            state0 = torch.tensor(state['both_agent_obs'][0], dtype=torch.float32, device=self.device).unsqueeze(0)
            state1 = torch.tensor(state['both_agent_obs'][1], dtype=torch.float32, device=self.device).unsqueeze(0)
            for t in range(2048):
                reward = 0
                action0 = self.select_action(state0)
                action1 = self.select_action(state1)
                obs, reward, shaped_reward, done, info = self.env.step((action0.item(),action1.item()))

                if reward != 0:
                    soup_delivered += 1

                reward += max(0,shaped_reward)
                
                if (state['both_agent_obs'][0] == obs['both_agent_obs'][0]).all() or (state['both_agent_obs'][1] == obs['both_agent_obs'][1]).all():
                    reward += -0.1 * t

                reward = torch.tensor([reward], device=self.device)

                if done:
                    next_state0 = None
                    next_state1 = None
                else:
                    next_state0 = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32, device=self.device).unsqueeze(0)
                    next_state1 = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32, device=self.device).unsqueeze(0)

                rewards = rewards + reward.item()
                self.memory.push((state0, state1), (action0,action1), (next_state0,next_state1), reward)

                state0 = next_state0
                state1 = next_state1

                loss = self.optimize_model(0)
                loss = self.optimize_model(1)

                target_net_state_dict = self.target_net.state_dict()
                policy_net_state_dict = self.policy_net.state_dict()
                for key in policy_net_state_dict:
                    target_net_state_dict[key] = policy_net_state_dict[key]*self.TAU + target_net_state_dict[key]*(1-self.TAU)
                self.target_net.load_state_dict(target_net_state_dict)

                if done:
                    if soup_delivered > 100:
                        changed = self.next_env()
                        if changed :
                            print("enviroment changed")
                        else:
                            print("last enviroment reached")
                    self.episode_durations.append(t + 1)
                    if i_episode % 10 == 0:
                        print(f"Episode: {i_episode + 1}, DQN loss : {loss}, total reward {rewards}")


                    break

        print('Complete')


    def SaveModel(self,Path = "DQNPolicy_net.pth"):        
        # Save the entire model
        torch.save(self.policy_net, Path)
        
    def LoadModel(self,Path = "DQNPolicy_net.pth"):
        
        # Load the entire model
        loaded_model = torch.load(Path,weights_only=False)
        loaded_model.eval()
        self.policy_net = loaded_model


    def testVisualize(self,print_action=False):
        pygame.init()
        visualizer = StateVisualizer()

        # 2) Grab your grid and do one dummy render to get a surface
        grid = self.env.base_env.mdp.terrain_mtx
        _ = self.env.reset()
        surf = visualizer.render_state(self.env.base_env.state, grid=grid)

        # 3) Use that surface’s size for your window
        win_w, win_h = surf.get_size()
        screen = pygame.display.set_mode((win_w, win_h), pygame.RESIZABLE)
        clock = pygame.time.Clock()

        running = True
        obs = self.env.reset()  #observation of the starting state
        soup_delivered = 0

        total_rewards = []
        while running:
            for ev in pygame.event.get():
                if ev.type == pygame.QUIT:
                    running = False

            self.policy_net.eval()  # Set model to evaluation mode
            with torch.no_grad():
                if print_action:
                    print(f"osservazione iniziale: {obs['both_agent_obs'][0]}")
                state0_tensor = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(self.device)
                state1_tensor = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(self.device)

                logits0 = self.policy_net(state0_tensor)
                logits1 = self.policy_net(state1_tensor)

                action0 = torch.argmax(logits0, dim=1).item()
                action1 = torch.argmax(logits1, dim=1).item()

                if print_action:
                    print(action0, action1)
                # try to step; if episode is over, catch and reset
                try:
                    # Overcooked wrapper returns (obs_p0, obs_p1, reward, done, info)
                    obs, reward,shaped_reward, done, info = self.env.step((action0, action1))

                    if(reward):
                        soup_delivered += 1


                except AssertionError:
                    # base_env.is_done() was True → reset and continue
                    self.env.reset()
                    break

                # render the new state
                surf = visualizer.render_state(self.env.base_env.state, grid=grid)

                # draw it
                screen.blit(surf, (0, 0))
                pygame.display.flip()

                clock.tick(15)  # cap at 30 FPS

        pygame.quit()

        print(f"Soup delivered: {soup_delivered}")

In [7]:
# Test the trained agent
def test_agent(agente,enviroments, num_episodes=2000):

    envs = []

    for layout in enviroments:
        base_mdp = OvercookedGridworld.from_layout_name(layout)  
        base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=500)
        envs.append((OvercookedRewardShaping(base_env=base_env, featurize_fn=base_env.featurize_state_mdp),layout))
        
    for env,layout in envs:
        print(f"testing for enviroment :{layout}")

        total_rewards = []
        average_rewards = []
    
        for episode in range(num_episodes):
            obs = env.reset()
            episode_reward = 0
            done = False
            i = 0
    
            while not done:
    
                state0_tensor = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(agente.device)
                state1_tensor = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(agente.device)
    
                logits0 = agente.policy_net(state0_tensor)
                logits1 = agente.policy_net(state1_tensor)
    
                action0 = torch.argmax(logits0, dim=1).item()
                action1 = torch.argmax(logits1, dim=1).item()
    
                obs, reward,shaped_reward, done, info = env.step((action0, action1))

                episode_reward += reward
                i = i+1
    
            average_rewards.append(episode_reward/i)
            total_rewards.append(episode_reward)
            print(f"average reward in episode {episode} : {episode_reward/i}")
    
    
        average_reward = np.mean(total_rewards)
    
        print(f"Test Results over {num_episodes} episodes:")
        print(f"Average Reward: {average_reward:.3f}")
        print(f"Standard Deviation: {np.std(total_rewards):.3f}")

In [10]:
DQNO = OvercookedDQN(["cramped_room"],model_DQN =DQN,gamma = 0.99,lr_model = 0.001,epochs = 15,batch_size = 128,optimizer_class = torch.optim.Adam,TAU=0.005,epsilon_decay=50000,epsilon_start=0.9,epsilon_end=0.1)

Using device: cuda
96


In [11]:
DQNO.trainingLoop(max_episodes=500)

Episode: 1, DQN loss : 6.243980169529095e-05, total reward 17.400000255554914
Episode: 11, DQN loss : 0.10367374122142792, total reward 9.270000126212835
Episode: 21, DQN loss : 0.08854013681411743, total reward 25.29000039026141
Episode: 31, DQN loss : 0.10214544087648392, total reward 8.720000144094229
Soup delivered!
Soup delivered!
Soup delivered!
Episode: 41, DQN loss : 0.08417113125324249, total reward 9.440000128000975
Episode: 51, DQN loss : 0.15642210841178894, total reward 18.200000267475843
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Episode: 61, DQN loss : 0.25932642817497253, total reward 68.1000011190772
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
So

In [12]:
DQNO.SaveModel()

In [13]:
test_agent(DQNO,["cramped_room"],num_episodes=10)

testing for enviroment :cramped_room
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
average reward in episode 0 : 0.56
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
average reward in episode 1 : 0.56
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
average reward in episode 2 : 0.56
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup delivered!
Soup deliv