In [1]:
!git clone https://github.com/HumanCompatibleAI/overcooked_ai.git
!gdown --id 1APcOaFTbxT6_JrCMeoqxZMECWEEILoSb
!uv sync --project overcooked_ai/
import sys
sys.path.append("/kaggle/working/overcooked_ai/src")

Cloning into 'overcooked_ai'...
remote: Enumerating objects: 7950, done.[K
remote: Total 7950 (delta 0), reused 0 (delta 0), pack-reused 7950 (from 1)[K
Receiving objects: 100% (7950/7950), 524.63 MiB | 18.80 MiB/s, done.
Resolving deltas: 100% (4549/4549), done.
Updating files: 100% (402/402), done.
Downloading...
From: https://drive.google.com/uc?id=1APcOaFTbxT6_JrCMeoqxZMECWEEILoSb
To: /kaggle/working/DQNPolicy_net.pth
100%|███████████████████████████████████████| 88.1k/88.1k [00:00<00:00, 764kB/s]
Using CPython 3.10.12 interpreter at: [36m/usr/bin/python3.10[39m
Creating virtual environment at: [36movercooked_ai/.venv[39m
[2K[2mResolved [1m137 packages[0m [2min 4.48s[0m[0m
[2K   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m
[2K[1A   [36m[1mBuilding[0m[39m overcooked-ai[2m @ file:///kaggle/working/overcooked_ai[0m
[37m⠙[0m [2mPreparing packages...[0m (0/32)
[2K[2A   [36m[1mBuilding[0m[39m overc

In [2]:
from collections import defaultdict
import gymnasium as gym
import numpy as np
import math
import itertools
import torch
import torch.nn as nn
import statistics
import torch.nn.functional as F
import pygame


from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv, Overcooked
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.visualization.state_visualizer import StateVisualizer
from overcooked_ai_py.mdp.actions import Action

from tqdm import tqdm

In [3]:
class OvercookedRewardShaping(Overcooked):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.prev_agent_obs = [None, None] 

    def step(self, actions):
        observation, base_reward, done, info = super().step(actions)

        shaped_reward_total = 0
        current_agent_obs = observation['both_agent_obs']

        if self.prev_agent_obs[0] is None:
            self.prev_agent_obs = current_agent_obs

        for i, obs in enumerate(current_agent_obs):
            shaped_reward_total += self._compute_agent_shaping(obs, self.prev_agent_obs[i])

        self.prev_agent_obs = current_agent_obs

        if base_reward != 0:
             print(f"Soup delivered!")

        return observation, base_reward,shaped_reward_total, done, info

    def _compute_agent_shaping(self, current_obs, prev_obs):
        shaping = 0.0

        onion_idx = 0
        soup_idx = 1
        dish_idx = 2
        tomato_idx = 3

        pot_full_idx = 1
        pot_cooking_idx = 2
        pot_ready_idx = 3

        prev_holding_vector = prev_obs[4:8]
        current_holding_vector = current_obs[4:8]

        if prev_holding_vector.sum() == 0 and current_holding_vector.sum() == 1:
            if current_holding_vector[onion_idx] == 1 or current_holding_vector[tomato_idx] == 1:
                shaping += 0.05 
            elif current_holding_vector[dish_idx] == 1:
                shaping += 0.02 

        prev_pot_onions = prev_obs[27:28][0] 
        current_pot_onions = current_obs[27:28][0]
        prev_pot_tomatoes = prev_obs[28:29][0]
        current_pot_tomatoes = current_obs[28:29][0]

        if current_pot_onions > prev_pot_onions:
            shaping += 0.1 
        if current_pot_tomatoes > prev_pot_tomatoes:
            shaping += 0.1 

        prev_pot_states = prev_obs[23:27]
        current_pot_states = current_obs[23:27]

        if prev_pot_states[pot_full_idx] == 1 and current_pot_states[pot_cooking_idx] == 1:
            shaping += 0.2 

        if prev_pot_states[pot_cooking_idx] == 1 and current_pot_states[pot_ready_idx] == 1:
            shaping += 0.3

        if prev_pot_states[pot_ready_idx] == 1 and current_holding_vector[soup_idx] == 1 and prev_holding_vector[soup_idx] == 0:
            shaping += 0.25 


        current_dx_serving = abs(current_obs[16:17][0])
        current_dy_serving = abs(current_obs[17:18][0]) 
        prev_dx_serving = abs(prev_obs[16:17][0])
        prev_dy_serving = abs(prev_obs[17:18][0])

        current_dist_serving = current_dx_serving + current_dy_serving 
        prev_dist_serving = prev_dx_serving + prev_dy_serving

        if current_holding_vector[soup_idx] == 1 and current_dist_serving < prev_dist_serving:
            shaping += 0.01

        return shaping

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Actor(nn.Module):
    def __init__(self, action_size,input_size):
        super(Actor, self).__init__()
        self.dense1 = nn.Linear(in_features=input_size, out_features=64)
        self.dense2 = nn.Linear(in_features=64, out_features=128)
        self.dense3 = nn.Linear(in_features=128, out_features=64)
        self.policy_logits = nn.Linear(in_features=64, out_features=action_size)


    def forward(self, state):
        x = F.relu(self.dense1(state))
        x = F.relu(self.dense2(x))
        x = F.relu(self.dense3(x))
        logits = self.policy_logits(x)
        return logits

class Critic(nn.Module):
    def __init__(self, action_size,input_size):
        super(Critic, self).__init__()
        self.dense1 = nn.Linear(in_features=input_size, out_features=32) 
        self.dense2 = nn.Linear(in_features=32, out_features=64)
        self.dense3 = nn.Linear(in_features=64, out_features=128)
        self.dense4 = nn.Linear(in_features=128, out_features=64)
        self.value = nn.Linear(in_features=64, out_features=1)

    def forward(self, state):
        x = F.relu(self.dense1(state))
        x = F.relu(self.dense2(x))
        x = F.relu(self.dense3(x))
        x = F.relu(self.dense4(x))
        value = self.value(x)
        return value

class OvercookedPPO:
    def __init__(
        self,
        layout_name, #name of the enviroment
        model_actor, # neural network of the actor
        model_critic, # neural network of the critic
        gamma,  # discount factor of the rewards
        lr_actor,  # learning rate of the actor network
        lr_critic,  # learning rate of the critic network
        clip_ratio, # clip ratio of the policy
        epochs, # Number of epoch
        batch_size, # batch of samples for the optimization
        optimizer_class, # optimizer
        lmbda, # lambda of the advantages
        entropy_coefficient # coefficent of entropy

    ):
        self.gamma = gamma
        self.lmbda = lmbda
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.clip_ratio = clip_ratio
        self.epochs = epochs
        self.batch_size = batch_size
        self.entropy_coefficient = entropy_coefficient

        self.device = device
        print(f"Using device: {self.device}")

        base_mdp = OvercookedGridworld.from_layout_name(layout_name)
        base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=400)
        self.env  = OvercookedRewardShaping(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)

        self.n_possible_action = self.env.action_space.n

        dummy_state = self.env.reset()
        dummy_obs_agent0 = dummy_state['both_agent_obs'][0]
        state_input_size = len(dummy_obs_agent0) 

        self.actor_model = model_actor(self.n_possible_action, state_input_size).to(self.device)
        self.critic_model = model_critic(self.n_possible_action, state_input_size).to(self.device)
        self.actor_optimizer = optimizer_class(self.actor_model.parameters(), lr=lr_actor)
        self.critic_optimizer = optimizer_class(self.critic_model.parameters(), lr=lr_critic)




    def ppo_loss(self,old_logits, old_values, returns, states, actions,dones):

        old_logits0_initial, old_logits1_initial = old_logits
        old_values0_initial, old_values1_initial = old_values

        def get_advantages_gae(values_tensor, masks_tensor, rewards_tensor):
            values_squeezed = values_tensor.squeeze(-1) 
            advantages_tensor = torch.zeros_like(rewards_tensor).to(self.device)
            last_gae_lam = 0
            for t in reversed(range(len(rewards_tensor))):
                if t == len(rewards_tensor) - 1:

                    next_value = 0.0
                else:
                    next_value = values_squeezed[t+1] 

                delta = rewards_tensor[t] + self.gamma * next_value * masks_tensor[t] - values_squeezed[t]
                last_gae_lam = delta + self.gamma * self.lmbda * masks_tensor[t] * last_gae_lam
                advantages_tensor[t] = last_gae_lam

            returns_gae = advantages_tensor + values_squeezed

            advantages_normalized = (advantages_tensor - advantages_tensor.mean()) / (advantages_tensor.std() + 1e-10)

            return returns_gae, advantages_normalized


        def compute_loss(logits, values, actions, returns,old_logits,advantages):
            actions_onehot = F.one_hot(actions.long(), num_classes=self.n_possible_action).float()

            policy = F.softmax(logits, dim=-1)
            action_probs = torch.sum(actions_onehot * policy, dim=-1) 

            old_policy = F.softmax(old_logits.detach(), dim=-1)
            old_action_probs = torch.sum(actions_onehot * old_policy, dim=-1)

            epsilon = 1e-10

            ratio = torch.exp(torch.log(action_probs + 1e-10) - torch.log(old_action_probs + 1e-10))
            clipped_ratio = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
            policy_loss = -torch.mean(torch.min(ratio * advantages, clipped_ratio * advantages))

            returns_tensor = torch.tensor(returns, dtype=torch.float32).to(self.device)
            value_loss = torch.mean(torch.square(values.squeeze(-1) - returns_tensor)) 

            entropy_bonus = torch.mean(policy * torch.log(policy + epsilon))

            total_loss = policy_loss + self.entropy_coefficient * entropy_bonus 
            return total_loss, value_loss

        def train_step(states, actions, returns, old_logits, old_values, advantages_tuple):
            self.actor_optimizer.zero_grad() 
            self.critic_optimizer.zero_grad()

            logits0 = self.actor_model(states[0])
            values0 = self.critic_model(states[0])
            logits1 = self.actor_model(states[1])
            values1 = self.critic_model(states[1])

            policy_loss0, value_loss0 = compute_loss(logits0, values0, actions[0], returns, old_logits[0], advantages_tuple[0])

            policy_loss1, value_loss1 = compute_loss(logits1, values1, actions[1], returns, old_logits[1], advantages_tuple[1])

            total_value_loss = value_loss0 + value_loss1
            total_policy_loss = policy_loss0 + policy_loss1

            total_value_loss.backward()
            total_policy_loss.backward() 

            self.actor_optimizer.step() 
            self.critic_optimizer.step() 

            return total_value_loss.item(), total_policy_loss.item() 

        current_loss0, current_loss1 = 0, 0
        rewards_tensor = torch.tensor(returns, dtype=torch.float32).to(self.device)
        masks_tensor = torch.tensor(dones, dtype=torch.float32).to(self.device) 
        returns0_gae, advantages0 = get_advantages_gae(old_values0_initial, masks_tensor, rewards_tensor)
        returns1_gae, advantages1 = get_advantages_gae(old_values1_initial, masks_tensor, rewards_tensor)
        for _ in range(self.epochs):
            current_loss0, current_loss1 = train_step(
                (states[0], states[1]),
                (actions[0], actions[1]),
                returns,
                (old_logits0_initial, old_logits1_initial),
                (old_values0_initial, old_values1_initial),
                (advantages0, advantages1)
            )

        return current_loss0, current_loss1


    def trainingLoop(self,max_episodes,max_steps_per_episode):
        for episode in range(max_episodes):
            self.actor_model.eval()
            self.critic_model.eval()
            states0,states1, actions0, actions1, rewards, values1,values0, returns,dones = [], [], [], [], [],[],[],[],[]
            state = self.env.reset()
            episode_rewards = 0
            for step in range(max_steps_per_episode):
                state0_tensor = torch.tensor(state['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(self.device)
                state1_tensor = torch.tensor(state['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(self.device)

                with torch.no_grad(): 
                    logits0 = self.actor_model(state0_tensor)
                    value0 = self.critic_model(state0_tensor)
                    logits1 = self.actor_model(state1_tensor)
                    value1 = self.critic_model(state1_tensor)
                self.actor_model.train()
                self.critic_model.train() 

                action_dist0 = torch.distributions.Categorical(logits=logits0)
                action0 = action_dist0.sample().item()

                action_dist1 = torch.distributions.Categorical(logits=logits1)
                action1 = action_dist1.sample().item()

                action = (action0, action1)
                next_state, reward,shaped_reward, done, event_info = self.env.step(action)


                reward += shaped_reward

                if (state['both_agent_obs'][0] == next_state['both_agent_obs'][0]).all() or (state['both_agent_obs'][1] == next_state['both_agent_obs'][1]).all():
                    reward += -0.01 * step

                states0.append(state0_tensor)
                states1.append(state1_tensor)
                actions0.append(action0)
                actions1.append(action1)
                rewards.append(reward)
                values0.append(value0)
                values1.append(value1)
                dones.append(not done)

                state = next_state
                episode_rewards += reward

                if done:

                    states0_batch = torch.cat(states0, dim=0).to(self.device)
                    states1_batch = torch.cat(states1, dim=0).to(self.device)

                    actions0_batch = torch.tensor(actions0, dtype=torch.float32).to(self.device) 
                    actions1_batch = torch.tensor(actions1, dtype=torch.float32).to(self.device)

                    values0_batch = torch.cat(values0, dim=0).to(self.device)
                    values1_batch = torch.cat(values1, dim=0).to(self.device)

                    old_logits0_batch = self.actor_model(states0_batch)
                    old_logits1_batch = self.actor_model(states1_batch)

                    total_value_loss, total_policy_loss = self.ppo_loss(
                        (old_logits0_batch, old_logits1_batch),
                        (values0_batch, values1_batch),
                        rewards,
                        (states0_batch, states1_batch),
                        (actions0_batch, actions1_batch),
                        dones
                    )
                    if episode % 10 == 0:
                        print(f"Episode: {episode + 1}, value_loss : {total_value_loss}, policy_loss : {total_policy_loss}, episode reward {episode_rewards}")

                    break

In [5]:
# Test the trained agent
def test_agent(agente, num_episodes=1000):
    """Test agent performance without learning or exploration."""
    total_rewards = []

    for episode in tqdm(range(num_episodes)):
        obs = agente.env.reset()
        episode_reward = 0
        done = False

        while not done:
            state0_tensor = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(device)
            state1_tensor = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(device)

            logits0 = agente.actor_model(state0_tensor)
            logits1 = agente.actor_model(state1_tensor)

            action0 = torch.argmax(logits0, dim=1).item()
            action1 = torch.argmax(logits1, dim=1).item()

            obs, reward, shaped_reward, done, info = agente.env.step((action0, action1))

            episode_reward += reward

        total_rewards.append(episode_reward)

    average_reward = np.mean(total_rewards)

    print(f"Test Results over {num_episodes} episodes:")
    print(f"Average Reward: {average_reward:.3f}")
    print(f"Standard Deviation: {np.std(total_rewards):.3f}")

In [6]:
OPPO = OvercookedPPO("cramped_room",model_actor =Actor,model_critic=Critic,gamma = 0.99,lr_actor = 0.001,lr_critic = 0.0005,clip_ratio = 0.2 ,epochs = 15,batch_size = 128,optimizer_class = torch.optim.Adam,lmbda=0.95,entropy_coefficient=0.2)

Using device: cuda
Computing MotionPlanner to be saved in /kaggle/working/overcooked_ai/src/overcooked_ai_py/data/planners/cramped_room_mp.pkl
It took 0.026651620864868164 seconds to create mp


In [7]:
OPPO.trainingLoop(max_episodes = 1000,max_steps_per_episode = 2000)

Episode: 1, value_loss : 1.9823198318481445, policy_loss : -0.16049012541770935, episode reward -163.13
Episode: 11, value_loss : 0.8477990627288818, policy_loss : -0.188412606716156, episode reward -58.370000000000005
Episode: 21, value_loss : 0.4277401566505432, policy_loss : -0.169328510761261, episode reward -36.01
Episode: 31, value_loss : 0.5191795229911804, policy_loss : -0.17575451731681824, episode reward -24.53
Episode: 41, value_loss : 0.42362141609191895, policy_loss : -0.1858307421207428, episode reward -19.680000000000028
Episode: 51, value_loss : 0.23293337225914001, policy_loss : -0.171863853931427, episode reward -0.8399999999999954
Episode: 61, value_loss : 0.28976595401763916, policy_loss : -0.16764172911643982, episode reward -18.889999999999997
Episode: 71, value_loss : 0.12380170822143555, policy_loss : -0.15746110677719116, episode reward -6.419999999999998
Episode: 81, value_loss : 0.22865428030490875, policy_loss : -0.16392287611961365, episode reward -3.830000

In [8]:
test_agent(OPPO,10)

100%|██████████| 10/10 [00:04<00:00,  2.09it/s]

Test Results over 10 episodes:
Average Reward: 0.000
Standard Deviation: 0.000



