# Bin Packing Problem (BPP)

Comparing RL agent vs Ensemble of agents for solving BPP


In [4]:
import numpy as np
from rl_factory import rl_agent_factory, default_hyperparam_factory  # Factory to build RL agents name:stirng->model:rllib
from BinPacking.EnvBinPacking import EnvBinPacking, inv_prepro_state, inv_prepro_reward
from AgentRL import AgentRLLIB
from EnsembleAgentRL import EnsembleAgentRL


# Environment
env_config = {"action_type": "discrete"} # PPO implementation fits {"continuous" or "discrete"}. DQN "discrete". DDPG "continuous".
env_class=EnvBinPacking # ptr on the environment class (not an OOP object)

## Reinforcement Learning

In [5]:
# build the RL agent
rl_name = "PPO" 
hyperparameters = default_hyperparam_factory(rl_name)

# update value similar to the publication
hyperparameters["lr"]=1e-4 # <---Sensisitive between slow-smooth and fast-chaotic . Original paper uses 3e-4.
hyperparameters["deep"]=2
hyperparameters["wide"]=16
hyperparameters["train_batch_size"]=64
hyperparameters["sgd_minibatch_size"]=64
hyperparameters["lambda"]=0.99
hyperparameters["grad_clip"]=0.3
hyperparameters["num_rollout_workers"]=4

# Build the Trainer (contains RL object and Environment simulator object)
rllib_trainer = rl_agent_factory(rl_name, hyperparameters, env_class, env_config=env_config)

2023-02-04 15:35:49,061	INFO worker.py:1538 -- Started a local Ray instance.
2023-02-04 15:35:59,431	INFO trainable.py:172 -- Trainable.setup took 13.224 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


## Agents training

We train `nb_agents` and store their best version during the training step based on regular intermediate evaluation step.

In [9]:
nb_agents = 4
episodes = 1000
eval_every = 100
eval_times = 30


In [8]:

AGENTS_INFO = dict()  # {id -> {"path"->path:str, "score"->score:int, "history"->hist:List[int]}  }

for a in range(nb_agents):
    agent = AgentRLLIB(rllib_trainer, env_class, env_config,
                       inv_prepro_state=inv_prepro_state, inv_prepro_reward=inv_prepro_reward)

    agent_info = {"folder": "/tmp/ensemble/agent" + str(a) + "/",
                  "checkpoint_path": "undefined",
                  "score": -np.inf,
                  "history": []}

    print(f"Training : {a}")
    for i in range(episodes):
        agent.train()

        if i % eval_every == 0:
            score = np.round(np.mean([agent.evaluate()["cumulated_rewards"] for i in range(eval_times)]), 2)
            agent_info["history"].append(score)

            # better score found
            if score >= agent_info["score"]:
                agent_info["score"] = score
                agent_info["checkpoint_path"] = agent.save(agent_info["folder"])

    AGENTS_INFO[a] = agent_info


Training : 0
Training : 1
Training : 2
Training : 3


## Ensemble

Ensemble construction for boosting cumulated rewards

In [11]:
ensemble = []
for agent_info in AGENTS_INFO.values():
    # build an empty RL agent
    agent = AgentRLLIB(rllib_trainer, env_class, env_config,
                       inv_prepro_state=inv_prepro_state, inv_prepro_reward=inv_prepro_reward)

    # restore weights
    agent.restore(agent_info["checkpoint_path"])

    # save it
    ensemble.append(agent)


2023-02-04 15:36:25,634	INFO trainable.py:790 -- Restored on 192.168.1.100 from checkpoint: /tmp/ensemble/agent0/checkpoint_002401
2023-02-04 15:36:25,635	INFO trainable.py:799 -- Current state after restoring: {'_iteration': 2401, '_timesteps_total': None, '_time_total': 459.4775776863098, '_episodes_total': 3790}
2023-02-04 15:36:25,681	INFO trainable.py:790 -- Restored on 192.168.1.100 from checkpoint: /tmp/ensemble/agent1/checkpoint_015101
2023-02-04 15:36:25,682	INFO trainable.py:799 -- Current state after restoring: {'_iteration': 15101, '_timesteps_total': None, '_time_total': 2812.099847793579, '_episodes_total': 13058}
2023-02-04 15:36:25,727	INFO trainable.py:790 -- Restored on 192.168.1.100 from checkpoint: /tmp/ensemble/agent2/checkpoint_025501
2023-02-04 15:36:25,729	INFO trainable.py:799 -- Current state after restoring: {'_iteration': 25501, '_timesteps_total': None, '_time_total': 4752.843599319458, '_episodes_total': 20088}
2023-02-04 15:36:25,771	INFO trainable.py:790

## Test agents and ensemble

In [13]:
for a, agent in enumerate(ensemble):
    score = np.round(np.mean([agent.evaluate()["cumulated_rewards"] for i in range(eval_times)]), 2)
    print(f"Test agent {a} score: {score}")

ensemble_agent = EnsembleAgentRL(ensemble)
score = np.round(np.mean([ensemble_agent.evaluate()["cumulated_rewards"] for i in range(eval_times)]), 2)
print("Test ensemble score:", score)


Test agent 0 score: -515.33
Test agent 1 score: -492.0
Test agent 2 score: -510.6
Test agent 3 score: -533.9
Test ensemble score: -487.03


Conclusion: Ensemble of RL agents is an easy way to boost cumulated rewards but multiply the computing time at both training and inference time