In [2]:
from collections import defaultdict
import gymnasium as gym
import numpy as np
import tensorflow as tf
import math
import itertools

from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv, Overcooked
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.visualization.state_visualizer import StateVisualizer
from overcooked_ai_py.mdp.actions import Action

from tqdm import tqdm

In [70]:
# Show which compute devices TensorFlow can see before any model is built.
from tensorflow.python.client import device_lib

local_devices = device_lib.list_local_devices()
print(local_devices)

class Approximator(tf.keras.Model):
    """Actor-critic network with a shared fully connected trunk.

    The trunk is five ReLU dense layers of widths 64, 128, 256, 128 and 64.
    Two linear heads sit on top of the trunk output: ``policy_logits`` with
    ``action_size`` units and ``value`` with a single unit.
    """

    def __init__(self, action_size):
        """Create the trunk and the two heads.

        Args:
            action_size: number of units of the policy-logits head.
        """
        super().__init__()

        def hidden(units):
            # Every trunk layer is a ReLU-activated dense layer.
            return tf.keras.layers.Dense(units, activation='relu')

        self.dense1 = hidden(64)
        self.dense2 = hidden(128)
        self.dense3 = hidden(256)
        self.dense4 = hidden(128)
        self.dense5 = hidden(64)
        self.policy_logits = tf.keras.layers.Dense(action_size)
        self.value = tf.keras.layers.Dense(1)

    def call(self, state):
        """Run the forward pass.

        Args:
            state: input passed to the first dense layer.

        Returns:
            A ``(logits, value)`` pair produced by the two heads from the same
            trunk output.
        """
        features = state
        trunk = (self.dense1, self.dense2, self.dense3, self.dense4, self.dense5)
        for layer in trunk:
            features = layer(features)
        return self.policy_logits(features), self.value(features)



class OvercookedPPO:
    """PPO trainer in which one shared network controls both Overcooked agents.

    Each environment step the same model is evaluated once per agent
    observation (``state['both_agent_obs'][0]`` and ``[1]``). After an episode
    the model is updated with the clipped PPO objective, first on agent 0's
    trajectory and then on agent 1's. Both trajectories are scored with the
    same reward stream returned by ``env.step``.
    """

    # Weights of the value-loss and entropy terms in the total loss.
    VALUE_COEF = 0.5
    ENTROPY_COEF = 0.01

    def __init__(
        self,
        layout_name,
        model,
        gamma,  # Discount factor
        lr_actor,  # Actor learning rate
        lr_critic,  # Critic learning rate
        clip_ratio,  # PPO clip ratio
        epochs,  # Number of optimization epochs
        batch_size,
        optimizer

    ):
        """Build the environment and the network.

        Args:
            layout_name: name handed to ``OvercookedGridworld.from_layout_name``.
            model: callable invoked as ``model(n_actions)`` returning a network
                whose call yields ``(logits, value)``.
            gamma: discount factor used for the returns.
            lr_actor: stored only; the optimizer passed in is what is used.
            lr_critic: stored only; the optimizer passed in is what is used.
            clip_ratio: clipping range of the probability ratio.
            epochs: number of optimisation passes per collected episode.
            batch_size: stored only; updates use the whole episode at once.
            optimizer: optimizer exposing ``apply_gradients``.
        """
        self.gamma = gamma
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.clip_ratio = clip_ratio
        self.epochs = epochs
        self.batch_size = batch_size
        self.optimizer = optimizer

        self.individual_action_values = Action.ALL_ACTIONS
        # Actions are handled as integer indices into ALL_ACTIONS.
        self.possible_action = list(range(len(self.individual_action_values)))

        base_mdp = OvercookedGridworld.from_layout_name(layout_name)  # or other layout
        base_env = OvercookedEnv.from_mdp(base_mdp, info_level=0, horizon=400)
        env = Overcooked(base_env=base_env, featurize_fn=base_env.featurize_state_mdp)
        self.env = env

        self.model = model(len(self.possible_action))

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted reward-to-go for every step, as a list of floats."""
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    @staticmethod
    def _normalized_advantages(returns, values):
        """Return ``returns - values`` standardised to zero mean and unit spread.

        Both inputs are flattened first, so a ``(T, 1)`` value column and a
        ``(T,)`` returns vector are subtracted element-wise instead of being
        broadcast into a ``(T, T)`` matrix.
        """
        flat_returns = np.asarray(returns, dtype=np.float32).reshape(-1)
        flat_values = np.asarray(values, dtype=np.float32).reshape(-1)
        advantages = flat_returns - flat_values
        normalized = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        return normalized.astype(np.float32)

    @staticmethod
    def _as_batch(observation):
        """Convert one observation to a float32 tensor with a leading batch axis."""
        features = np.asarray(observation, dtype=np.float32)
        return tf.expand_dims(tf.convert_to_tensor(features), 0)

    def _compute_loss(self, logits, values, actions_onehot, returns, old_log_probs, advantages):
        """Clipped-surrogate policy loss + value loss - entropy bonus."""
        log_policy = tf.nn.log_softmax(logits)
        log_probs = tf.reduce_sum(actions_onehot * log_policy, axis=1)

        # Policy loss: pessimistic minimum of the raw and the clipped objective.
        ratio = tf.exp(log_probs - old_log_probs)
        clipped_ratio = tf.clip_by_value(ratio, 1 - self.clip_ratio, 1 + self.clip_ratio)
        policy_loss = -tf.reduce_mean(tf.minimum(ratio * advantages, clipped_ratio * advantages))

        # Value loss: flatten the (T, 1) head output so it lines up with returns.
        value_loss = tf.reduce_mean(tf.square(tf.reshape(values, [-1]) - returns))

        # Entropy of the action distribution, averaged over the batch. It is
        # subtracted from the loss so that minimising rewards exploration.
        entropy = -tf.reduce_mean(tf.reduce_sum(tf.exp(log_policy) * log_policy, axis=1))

        return policy_loss + self.VALUE_COEF * value_loss - self.ENTROPY_COEF * entropy

    def _update(self, states, actions_onehot, returns, old_log_probs, advantages):
        """Apply one gradient step on one agent's trajectory and return the loss."""
        with tf.GradientTape() as tape:
            logits, values = self.model(states)
            loss = self._compute_loss(logits, values, actions_onehot, returns, old_log_probs, advantages)
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        return loss

    def ppo_loss(self, old_logits, old_values, advantages, states, actions, returns):
        """Run ``self.epochs`` PPO updates for both agents.

        Args:
            old_logits: pair of logits recorded when the actions were sampled.
            old_values: pair of value predictions recorded at the same time.
            advantages: ignored; advantages are recomputed from ``returns`` and
                ``old_values``. Kept so existing call sites keep working.
            states: pair of stacked observations, one row per step.
            actions: pair of integer action indices, one per step.
            returns: discounted returns shared by both agents, one per step.

        Returns:
            ``(loss0, loss1)``: the losses of the last epoch for each agent.

        Raises:
            ValueError: if ``self.epochs`` is smaller than 1.
        """
        if self.epochs < 1:
            raise ValueError("epochs must be at least 1 to run a PPO update")

        returns = tf.reshape(tf.cast(returns, tf.float32), [-1])
        n_actions = len(self.possible_action)

        # Everything that depends only on the behaviour policy is computed once.
        per_agent = []
        for agent in (0, 1):
            onehot = tf.one_hot(actions[agent], n_actions, dtype=tf.float32)
            old_log_policy = tf.nn.log_softmax(old_logits[agent])
            old_log_probs = tf.reduce_sum(onehot * old_log_policy, axis=1)
            agent_advantages = tf.constant(
                self._normalized_advantages(returns, old_values[agent])
            )
            per_agent.append((states[agent], onehot, old_log_probs, agent_advantages))

        losses = [None, None]
        for _ in range(self.epochs):
            # Agent 0 is updated before agent 1 within every epoch.
            for agent, (agent_states, onehot, old_log_probs, agent_advantages) in enumerate(per_agent):
                losses[agent] = self._update(
                    agent_states, onehot, returns, old_log_probs, agent_advantages
                )

        return losses[0], losses[1]

    def trainingLoop(self, max_episodes, max_steps_per_episode):
        """Collect one episode at a time and update the model after each.

        Args:
            max_episodes: number of episodes to run.
            max_steps_per_episode: step cap per episode. An episode that hits
                the cap before the environment reports ``done`` is still used
                for an update instead of being discarded.
        """
        for episode in range(max_episodes):
            states0, states1, actions0, actions1 = [], [], [], []
            logits0_log, logits1_log, values0, values1, rewards = [], [], [], [], []

            state = self.env.reset()
            for _ in range(max_steps_per_episode):
                state0 = self._as_batch(state['both_agent_obs'][0])
                logits0, value0 = self.model(state0)

                state1 = self._as_batch(state['both_agent_obs'][1])
                logits1, value1 = self.model(state1)

                # Sample action from the policy distribution
                action0 = tf.random.categorical(logits0, 1)[0, 0].numpy()
                action1 = tf.random.categorical(logits1, 1)[0, 0].numpy()
                next_state, reward, done, _ = self.env.step((action0, action1))

                states0.append(state0)
                states1.append(state1)
                actions0.append(action0)
                actions1.append(action1)
                logits0_log.append(logits0)
                logits1_log.append(logits1)
                values0.append(value0)
                values1.append(value1)
                rewards.append(reward)

                state = next_state
                if done:
                    break

            if not rewards:
                # max_steps_per_episode < 1: nothing was collected.
                continue

            returns_batch = tf.constant(
                np.asarray(self._discounted_returns(rewards, self.gamma), dtype=np.float32)
            )

            # Stack the per-step tensors into one batch per agent. The logits
            # stored during the rollout are the behaviour-policy logits.
            batch_states = (tf.concat(states0, axis=0), tf.concat(states1, axis=0))
            batch_actions = (
                np.array(actions0, dtype=np.int32),
                np.array(actions1, dtype=np.int32),
            )
            batch_logits = (tf.concat(logits0_log, axis=0), tf.concat(logits1_log, axis=0))
            batch_values = (tf.concat(values0, axis=0), tf.concat(values1, axis=0))

            loss0, loss1 = self.ppo_loss(
                batch_logits, batch_values, None, batch_states, batch_actions, returns_batch
            )
            if episode % 10 == 0:
                print(f"Episode: {episode + 1}, Loss agente 0: {loss0.numpy()}, Loss agente 1 : {loss1.numpy()}")

    def test(self, n_episodes, visualize=False, print_action=False):
        """Run greedy evaluation episodes and print summary statistics.

        Args:
            n_episodes: number of episodes to play.
            visualize: if True, render the state observed before each step.
            print_action: if True, print both agents' logits every step.
        """
        # One visualizer is reused for the whole evaluation.
        visualizer = StateVisualizer() if visualize else None

        total_rewards = []
        for _ in tqdm(range(n_episodes)):
            episode_reward = 0
            obs = self.env.reset()
            done = False

            while not done:
                state0 = self._as_batch(obs['both_agent_obs'][0])
                logits0, _ = self.model(state0)
                action0 = tf.argmax(logits0, axis=1).numpy()[0]

                state1 = self._as_batch(obs['both_agent_obs'][1])
                logits1, _ = self.model(state1)
                action1 = tf.argmax(logits1, axis=1).numpy()[0]

                if print_action:
                    print(logits0, logits1)
                next_state, reward, done, _ = self.env.step((action0, action1))
                if visualizer is not None:
                    visualizer.display_rendered_state(obs['overcooked_state'],window_display=True,grid=self.env.mdp.terrain_mtx)

                episode_reward += reward
                obs = next_state

            total_rewards.append(episode_reward)

        win_rate = np.mean(np.array(total_rewards) > 0)
        average_reward = np.mean(total_rewards)

        print(f"Test Results over {n_episodes} episodes:")
        print(f"Win Rate: {win_rate:.1%}")
        print(f"Average Reward: {average_reward:.3f}")
        print(f"Standard Deviation: {np.std(total_rewards):.3f}")








[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2669445758471651017
xla_global_id: -1
]


In [73]:
# Trainer for the "cramped_room" layout; the Adam optimizer passed here is the
# one that applies the gradients.
OPPO = OvercookedPPO(
    "cramped_room",
    model=Approximator,
    gamma=0.99,
    lr_actor=0.001,
    lr_critic=0.001,
    clip_ratio=0.1,
    epochs=15,
    batch_size=64,
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
)

In [None]:
# Train for 1000 episodes, each capped at 1000 environment steps.
OPPO.trainingLoop(
    max_episodes=1000,
    max_steps_per_episode=1000,
)

In [67]:
# Evaluate over 100 episodes, rendering each state and printing the logits.
OPPO.test(
    n_episodes=100,
    visualize=True,
    print_action=True,
)

  0%|          | 0/100 [00:00<?, ?it/s]

tf.Tensor([[-5.8603015  9.37786   -2.6900256 -5.7112374 -0.9280409 -5.2952785]], shape=(1, 6), dtype=float32) tf.Tensor([[-6.005445   9.60337   -2.754912  -5.8545213 -0.9485752 -5.420971 ]], shape=(1, 6), dtype=float32)
tf.Tensor([[-6.4971676 10.395621  -2.9890025 -6.326336  -1.0471658 -5.862519 ]], shape=(1, 6), dtype=float32) tf.Tensor([[-6.617558  10.580135  -3.0400262 -6.439238  -1.0625697 -5.959219 ]], shape=(1, 6), dtype=float32)
tf.Tensor([[-6.4971676 10.395621  -2.9890025 -6.326336  -1.0471658 -5.862519 ]], shape=(1, 6), dtype=float32) tf.Tensor([[-6.617558  10.580135  -3.0400262 -6.439238  -1.0625697 -5.959219 ]], shape=(1, 6), dtype=float32)
tf.Tensor([[-6.4971676 10.395621  -2.9890025 -6.326336  -1.0471658 -5.862519 ]], shape=(1, 6), dtype=float32) tf.Tensor([[-6.617558  10.580135  -3.0400262 -6.439238  -1.0625697 -5.959219 ]], shape=(1, 6), dtype=float32)
tf.Tensor([[-6.4971676 10.395621  -2.9890025 -6.326336  -1.0471658 -5.862519 ]], shape=(1, 6), dtype=float32) tf.Tensor(

  0%|          | 0/100 [00:59<?, ?it/s]


KeyboardInterrupt: 