In [4]:
from collections import defaultdict
import gymnasium as gym
import numpy as np
import math
import itertools
import torch
import torch.nn as nn
import statistics
import torch.nn.functional as F
from exceptiongroup import catch

from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv, Overcooked
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.visualization.state_visualizer import StateVisualizer
from overcooked_ai_py.mdp.actions import Action

from tqdm import tqdm

In [15]:
class OvercookedRewardShaping(Overcooked):
    """`Overcooked` wrapper whose `step` adds a dense shaping bonus to the reward."""

    # Bonus granted per agent observation whose "cooking" feature is set.
    # NOTE(review): index 25 is what the original author labelled `soup_cooking`
    # in the featurized observation — confirm against the featurizer in use.
    _COOKING_SLICE = slice(25, 26)
    _COOKING_BONUS = 0.3

    def __init__(self, *args, **kwargs):
        # Every construction argument is forwarded unchanged to the parent class.
        super().__init__(*args, **kwargs)

    def step(self, actions):
        """Advance the wrapped environment and return its result with a shaped reward.

        Returns the parent's `(observation, reward, done, info)` tuple, where
        `reward` is the parent's reward plus the shaping term computed from
        `observation['both_agent_obs']`.
        """
        observation, base_reward, done, info = super().step(actions)
        if base_reward != 0:
            # Per the original author's note, the base reward is 20 on a delivery.
            print("Soup delivered! Voto: {}".format(base_reward))
        bonus = self._compute_shaping(observation['both_agent_obs'])
        return observation, base_reward + bonus, done, info

    def _compute_shaping(self, observations):
        """Sum the cooking bonus over every agent observation that has the flag set.

        Other shaping terms (a penalty for holding any object, a bonus for
        holding a soup, a bonus per onion in the pot) were tried by the
        original author and are currently disabled.
        """
        return sum(
            self._COOKING_BONUS
            for obs in observations
            if obs[self._COOKING_SLICE].any()
        )

In [141]:


import pygame


class Approximator(nn.Module):
    """Fully connected trunk (64-128-256-128-64) feeding two linear heads.

    `forward` returns `(logits, value)`: one logit per action and a single
    scalar value per input row.
    """

    def __init__(self, action_size,input_size):
        super().__init__()
        # Layers are created in this exact order so seeded initialisation and
        # state_dict keys stay stable.
        self.dense1 = nn.Linear(input_size, 64)
        self.dense2 = nn.Linear(64, 128)
        self.dense3 = nn.Linear(128, 256)
        self.dense4 = nn.Linear(256, 128)
        self.dense5 = nn.Linear(128, 64)
        self.policy_logits = nn.Linear(64, action_size)
        self.value = nn.Linear(64, 1)

    def forward(self, state):
        """Map a batch of flat observations to `(action logits, state value)`."""
        hidden = state
        trunk = (self.dense1, self.dense2, self.dense3, self.dense4, self.dense5)
        for layer in trunk:
            hidden = F.relu(layer(hidden))
        return self.policy_logits(hidden), self.value(hidden)



class OvercookedPPO:
    def __init__(
        self,
        layout_name,
        model_class,
        gamma,  # Discount factor
        lr_actor,  # Actor learning rate
        lr_critic,  # Critic learning rate
        clip_ratio, # PPO clip ratio
        epochs, # Number of optimization epochs
        batch_size,
        optimizer_class,
        lmbda

    ):
        """Build the shaped Overcooked environment, the actor/critic networks and their optimizers.

        Args:
            layout_name: layout passed to `OvercookedGridworld.from_layout_name`.
            model_class: callable `model_class(n_actions, input_size)` returning a
                module whose forward pass yields `(logits, value)`.
            gamma: discount factor.
            lr_actor: learning rate of the actor optimizer.
            lr_critic: learning rate of the critic optimizer.
            clip_ratio: PPO clipping range.
            epochs: number of optimization passes per collected trajectory.
            batch_size: stored on the instance; no method of this class reads it yet.
            optimizer_class: optimizer factory called as
                `optimizer_class(parameters, lr=...)`.
            lmbda: GAE lambda.
        """
        self.gamma = gamma
        self.lmbda = lmbda
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.clip_ratio = clip_ratio
        self.epochs = epochs
        self.batch_size = batch_size

        self.individual_action_values = Action.ALL_ACTIONS

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        base_mdp = OvercookedGridworld.from_layout_name(layout_name)
        base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=400)
        self.env = OvercookedRewardShaping(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)

        self.n_possible_action = self.env.action_space.n

        # One reset is used only to measure the per-agent observation length,
        # which fixes the network input size (assumes a flat feature vector).
        dummy_state = self.env.reset()
        dummy_obs_agent0 = dummy_state['both_agent_obs'][0]
        state_input_size = len(dummy_obs_agent0)

        # Both agents share one actor and one critic network.
        self.actor_model = model_class(self.n_possible_action, state_input_size).to(self.device)
        self.critic_model = model_class(self.n_possible_action, state_input_size).to(self.device)
        self.actor_optimizer = optimizer_class(self.actor_model.parameters(), lr=lr_actor)
        # The critic gets its own learning rate (it was previously built with lr_actor).
        self.critic_optimizer = optimizer_class(self.critic_model.parameters(), lr=lr_critic)




    def ppo_loss(self,old_logits, old_values, returns, states, actions,dones):

        old_logits0_initial, old_logits1_initial = old_logits
        old_values0_initial, old_values1_initial = old_values

        def get_advantages_gae(values_tensor, masks_tensor, rewards_tensor):
            # values_tensor should be (batch_size, 1), squeeze it
            values_squeezed = values_tensor.squeeze(-1) # shape (batch_size,)

            returns = torch.zeros_like(rewards_tensor).to(self.device) # Initialize returns on GPU
            gae = 0

            advantages_tensor = torch.zeros_like(rewards_tensor).to(self.device)
            last_gae_lam = 0
            for t in reversed(range(len(rewards_tensor))):
                if t == len(rewards_tensor) - 1: # Last step in the collected trajectory

                    next_value = 0.0 # If episode is truly done, next value is 0
                else:
                    next_value = values_squeezed[t+1] # V(s_{t+1})

                delta = rewards_tensor[t] + self.gamma * next_value * masks_tensor[t] - values_squeezed[t]
                last_gae_lam = delta + self.gamma * self.lmbda * masks_tensor[t] * last_gae_lam
                advantages_tensor[t] = last_gae_lam

            returns_gae = advantages_tensor + values_squeezed

            # Normalize advantages
            advantages_normalized = (advantages_tensor - advantages_tensor.mean()) / (advantages_tensor.std() + 1e-10)

            return returns_gae, advantages_normalized


        def compute_loss(logits, values, actions, returns,old_logits,advantages):
            actions_onehot = F.one_hot(actions.long(), num_classes=self.n_possible_action).float()

            policy = F.softmax(logits, dim=-1)
            action_probs = torch.sum(actions_onehot * policy, dim=-1) # Use dim=-1 for last dimension

            old_policy = F.softmax(old_logits.detach(), dim=-1)
            old_action_probs = torch.sum(actions_onehot * old_policy, dim=-1)

            epsilon = 1e-10
            # Policy loss

            advantages_tensor = torch.tensor(advantages, dtype=torch.long).to(self.device)
            ratio = torch.exp(torch.log(action_probs + 1e-10) - torch.log(old_action_probs + 1e-10))
            clipped_ratio = torch.clamp(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
            policy_loss = torch.mean(torch.min(ratio * advantages_tensor, clipped_ratio * advantages_tensor))

            # Value loss
            returns_tensor = torch.tensor(returns, dtype=torch.long).to(self.device)
            value_loss = torch.mean(torch.square(values.squeeze(-1) - returns_tensor)) # Squeeze value to match returns shape

            # Entropy bonus (optional)
            # Ensure policy is not zero for log
            entropy_bonus = torch.mean(policy * torch.log(policy + epsilon)) # PyTorch's entropy loss is typically negative

            total_loss = policy_loss + 0.1 * entropy_bonus # Note: In PyTorch, entropy_bonus typically added for maximization, so + sign. If you want regularization that penalizes low entropy, it's typically subtracted like in TF. Let's keep your original -0.01 for regularization.
            return total_loss, value_loss

        def train_step(states, actions, returns, old_logits, old_values, advantages_tuple):
            self.actor_optimizer.zero_grad() # Zero gradients for this optimization step
            self.critic_optimizer.zero_grad()

            logits0, _ = self.actor_model(states[0])
            _, values0 = self.critic_model(states[0])
            logits1, _ = self.actor_model(states[1])
            _, values1 = self.critic_model(states[1])

            policy_loss0, value_loss0 = compute_loss(logits0, values0, actions[0], returns, old_logits[0], advantages_tuple[0])

            policy_loss1, value_loss1 = compute_loss(logits1, values1, actions[1], returns, old_logits[1], advantages_tuple[1])

            total_value_loss = value_loss0 + value_loss1
            total_policy_loss = policy_loss0 + policy_loss1

            total_value_loss.backward()
            total_policy_loss.backward() # Compute gradients

            self.actor_optimizer.step() # Zero gradients for this optimization step
            self.critic_optimizer.step() # Update model parameters

            return total_value_loss.item(), total_policy_loss.item() # Return scalar loss values

        current_loss0, current_loss1 = 0, 0
        rewards_tensor = torch.tensor(returns, dtype=torch.float32).to(self.device)
        masks_tensor = torch.tensor(dones, dtype=torch.float32).to(self.device) # `dones_list` is `not done` (1 for non-terminal, 0 for terminal)
        returns0_gae, advantages0 = get_advantages_gae(old_values0_initial, masks_tensor, rewards_tensor)
        returns1_gae, advantages1 = get_advantages_gae(old_values1_initial, masks_tensor, rewards_tensor)
        for _ in range(self.epochs):
            # Pass all initial data, the inner train_step will recompute current model outputs
            current_loss0, current_loss1 = train_step(
                (states[0], states[1]),
                (actions[0], actions[1]),
                returns,
                (old_logits0_initial, old_logits1_initial),
                (old_values0_initial, old_values1_initial),
                (advantages0, advantages1)
            )

        return current_loss0, current_loss1


    def trainingLoop(self,max_episodes,max_steps_per_episode):
        for episode in range(max_episodes):
            states0,states1, actions0, actions1, rewards, values1,values0, returns,dones = [], [], [], [], [],[],[],[],[]
            state = self.env.reset()
            for step in range(max_steps_per_episode):
                state0_tensor = torch.tensor(state['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(self.device)
                state1_tensor = torch.tensor(state['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(self.device)

                self.actor_model.eval()
                self.critic_model.eval()
                with torch.no_grad(): # No need to track gradients for action selection
                    logits0, _ = self.actor_model(state0_tensor)
                    _, value0 = self.critic_model(state0_tensor)
                    logits1, _ = self.actor_model(state1_tensor)
                    _, value1 = self.critic_model(state1_tensor)
                self.actor_model.train()
                self.critic_model.train() # Sw

                action_dist0 = torch.distributions.Categorical(logits=logits0)
                action0 = action_dist0.sample().item() # Get scalar action

                action_dist1 = torch.distributions.Categorical(logits=logits1)
                action1 = action_dist1.sample().item()

                action = (action0, action1)
                next_state, reward, done, event_info = self.env.step(action)

                states0.append(state0_tensor)
                states1.append(state1_tensor)
                actions0.append(action0)
                actions1.append(action1)
                rewards.append(reward)
                values0.append(value0)
                values1.append(value1)
                dones.append(not done)

                state = next_state

                if done:


                    # Convert lists of tensors/numpy arrays to batched tensors
                    states0_batch = torch.cat(states0, dim=0).to(self.device)
                    states1_batch = torch.cat(states1, dim=0).to(self.device)

                    actions0_batch = torch.tensor(actions0, dtype=torch.long).to(self.device) # Actions should be long for indexing/one-hot
                    actions1_batch = torch.tensor(actions1, dtype=torch.long).to(self.device)

                    # Values are already tensors from model output, concatenate them
                    values0_batch = torch.cat(values0, dim=0).to(self.device)
                    values1_batch = torch.cat(values1, dim=0).to(self.device)

                    old_logits0_batch, _ = self.actor_model(states0_batch)
                    old_logits1_batch, _ = self.actor_model(states1_batch)

                    total_value_loss, total_policy_loss = self.ppo_loss(
                        (old_logits0_batch, old_logits1_batch),
                        (values0_batch, values1_batch),
                        rewards,
                        (states0_batch, states1_batch),
                        (actions0_batch, actions1_batch),
                        dones
                    )
                    if episode % 10 == 0:
                        print(f"Episode: {episode + 1}, value_loss : {total_value_loss}, policy_loss : {total_policy_loss}, average reward {statistics.fmean(rewards)}")

                    break
    def test(self, n_episodes, visualize=False, print_action=False):
        pygame.init()
        visualizer = StateVisualizer()

        # 2) Grab your grid and do one dummy render to get a surface
        grid = self.env.base_env.mdp.terrain_mtx
        _ = self.env.reset()
        surf = visualizer.render_state(self.env.base_env.state, grid=grid)

        # 3) Use that surface’s size for your window
        win_w, win_h = surf.get_size()
        screen = pygame.display.set_mode((win_w, win_h), pygame.RESIZABLE)
        clock  = pygame.time.Clock()

        running = True
        obs = self.env.reset() #observation of the starting state
        soup_delivered = 0

        total_rewards = []
        while running:
            for ev in pygame.event.get():
                if ev.type == pygame.QUIT:
                    running = False

            self.actor_model.eval()
            self.critic_model.eval() # Set model to evaluation mode
            with torch.no_grad(): # No gradient calculation during testing
                state0_tensor = torch.tensor(obs['both_agent_obs'][0], dtype=torch.float32).unsqueeze(0).to(self.device)
                state1_tensor = torch.tensor(obs['both_agent_obs'][1], dtype=torch.float32).unsqueeze(0).to(self.device)

                logits0, _ = self.actor_model(state0_tensor)
                logits1, _ = self.actor_model(state1_tensor)

                action0 = torch.argmax(logits0, dim=1).item()
                action1 = torch.argmax(logits1, dim=1).item()

                if print_action:
                    print(action0, action1)
                # try to step; if episode is over, catch and reset
                try:
                    # Overcooked wrapper returns (obs_p0, obs_p1, reward, done, info)
                    observation, reward, done, info = self.env.step((action0, action1))
                    if reward > 19:
                        soup_delivered += 1
                except AssertionError:
                    # base_env.is_done() was True → reset and continue
                    self.env.reset()
                    break

                # render the new state
                surf = visualizer.render_state(self.env.base_env.state, grid=grid)

                # draw it
                screen.blit(surf, (0, 0))
                pygame.display.flip()

                clock.tick(15)   # cap at 30 FPS

        pygame.quit()

        print(f"Soup delivered: {soup_delivered}")








In [142]:
# Build the PPO trainer for the "cramped_room" layout with Adam optimizers.
OPPO = OvercookedPPO(
    "cramped_room",
    model_class=Approximator,
    gamma=0.99,
    lr_actor=0.0001,
    lr_critic=0.0001,
    clip_ratio=0.2,
    epochs=15,
    batch_size=64,
    optimizer_class=torch.optim.Adam,
    lmbda=0.95,
)

Using device: cuda


In [145]:
# Train for 100 episodes; the 2000-step cap exceeds the environment's 400-step horizon.
OPPO.trainingLoop(
    max_episodes=100,
    max_steps_per_episode=2000,
)

  advantages_tensor = torch.tensor(advantages, dtype=torch.long).to(self.device)


Episode: 1, value_loss : 1.0180804110859754e-06, policy_loss : 0.29167038202285767, average reward 0.0285
Episode: 11, value_loss : 7.719327186350711e-07, policy_loss : 0.36793777346611023, average reward 0.0285
Episode: 21, value_loss : 5.586688303083065e-07, policy_loss : 0.3553040027618408, average reward 0.0285


KeyboardInterrupt: 

In [144]:
# Watch the greedy policy in a pygame window and echo each joint action.
OPPO.test(
    n_episodes=100,
    visualize=True,
    print_action=True,
)

2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
2 2
Soup delivered: 0
