### 1. Imports

In [1]:
# Import libraries
import gym
import numpy as np
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2, A2C

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."


In [2]:
# Import classes
from rl_environment import Environment

### Environment creation

In [9]:
# Parameters
number_of_regional_warehouses = 1
max_inventory_amount_regional_warehouses = 40
max_inventory_amount_central_warehouse = 100
customer_demand = 1
simulation_length = 30
lead_time = 4

In [10]:
env = Environment(number_of_regional_wh = number_of_regional_warehouses, 
                  rw_inventory_limit = max_inventory_amount_regional_warehouses, 
                  cw_inventory_limit = max_inventory_amount_central_warehouse,
                  demand = customer_demand,
                  sim_length = simulation_length,
                  lead_time = lead_time)

Simulation created with the following parameters:
------------------------------------------------------------
central_warehouse ; Inventory: 33
regional_warehouse_2 ; ID: 1 ; Inventory: 13 ; Demand: 1 ; Lost sales: 0
------------------------------------------------------------


### RL model training

###### Model 1: PPO
Proximal Policy Optimization
https://openai.com/blog/openai-baselines-ppo/

In [11]:
ppo_model = PPO2(MlpPolicy, env, verbose=1)
ppo_model.learn(total_timesteps=30000)

Wrapping the env in a DummyVecEnv.
--------------------------------------
| approxkl           | 4.8937877e-06 |
| clipfrac           | 0.0           |
| explained_variance | -1.12         |
| fps                | 509           |
| n_updates          | 1             |
| policy_entropy     | 0.693136      |
| policy_loss        | -0.0007821121 |
| serial_timesteps   | 128           |
| time_elapsed       | 0             |
| total_timesteps    | 128           |
| value_loss         | 0.072028935   |
--------------------------------------
--------------------------------------
| approxkl           | 1.3614601e-06 |
| clipfrac           | 0.0           |
| explained_variance | -0.819        |
| fps                | 1256          |
| n_updates          | 2             |
| policy_entropy     | 0.6931186     |
| policy_loss        | -0.0003559636 |
| serial_timesteps   | 256           |
| time_elapsed       | 0.251         |
| total_timesteps    | 256           |
| value_loss         | 0.0738

--------------------------------------
| approxkl           | 2.8985547e-05 |
| clipfrac           | 0.0           |
| explained_variance | 0.636         |
| fps                | 1297          |
| n_updates          | 18            |
| policy_entropy     | 0.6912929     |
| policy_loss        | -0.000920894  |
| serial_timesteps   | 2304          |
| time_elapsed       | 1.78          |
| total_timesteps    | 2304          |
| value_loss         | 0.016827656   |
--------------------------------------
---------------------------------------
| approxkl           | 1.7801327e-05  |
| clipfrac           | 0.0            |
| explained_variance | 0.55           |
| fps                | 1554           |
| n_updates          | 19             |
| policy_entropy     | 0.6911529      |
| policy_loss        | -0.00056647474 |
| serial_timesteps   | 2432           |
| time_elapsed       | 1.88           |
| total_timesteps    | 2432           |
| value_loss         | 0.018093413    |
-------------

--------------------------------------
| approxkl           | 0.00046074277 |
| clipfrac           | 0.0           |
| explained_variance | 0.187         |
| fps                | 1527          |
| n_updates          | 35            |
| policy_entropy     | 0.5693523     |
| policy_loss        | -0.0045876927 |
| serial_timesteps   | 4480          |
| time_elapsed       | 3.28          |
| total_timesteps    | 4480          |
| value_loss         | 0.1228664     |
--------------------------------------
---------------------------------------
| approxkl           | 5.826445e-05   |
| clipfrac           | 0.0            |
| explained_variance | 0.0781         |
| fps                | 1313           |
| n_updates          | 36             |
| policy_entropy     | 0.5513649      |
| policy_loss        | 0.000118587865 |
| serial_timesteps   | 4608           |
| time_elapsed       | 3.36           |
| total_timesteps    | 4608           |
| value_loss         | 0.1308429      |
-------------

--------------------------------------
| approxkl           | 7.1221846e-05 |
| clipfrac           | 0.0           |
| explained_variance | 0.32          |
| fps                | 1523          |
| n_updates          | 52            |
| policy_entropy     | 0.4327804     |
| policy_loss        | -0.001989103  |
| serial_timesteps   | 6656          |
| time_elapsed       | 5.04          |
| total_timesteps    | 6656          |
| value_loss         | 0.48860615    |
--------------------------------------
--------------------------------------
| approxkl           | 6.098144e-05  |
| clipfrac           | 0.0           |
| explained_variance | 0.125         |
| fps                | 1568          |
| n_updates          | 53            |
| policy_entropy     | 0.42636144    |
| policy_loss        | -0.0013482161 |
| serial_timesteps   | 6784          |
| time_elapsed       | 5.13          |
| total_timesteps    | 6784          |
| value_loss         | 0.3907088     |
-------------------------

--------------------------------------
| approxkl           | 0.00048601071 |
| clipfrac           | 0.0           |
| explained_variance | 0.0484        |
| fps                | 1535          |
| n_updates          | 69            |
| policy_entropy     | 0.45302907    |
| policy_loss        | -0.0049982443 |
| serial_timesteps   | 8832          |
| time_elapsed       | 6.48          |
| total_timesteps    | 8832          |
| value_loss         | 0.30709538    |
--------------------------------------
--------------------------------------
| approxkl           | 6.909664e-05  |
| clipfrac           | 0.0           |
| explained_variance | 0.149         |
| fps                | 1534          |
| n_updates          | 70            |
| policy_entropy     | 0.46953064    |
| policy_loss        | -0.0005834373 |
| serial_timesteps   | 8960          |
| time_elapsed       | 6.56          |
| total_timesteps    | 8960          |
| value_loss         | 0.35532084    |
-------------------------

--------------------------------------
| approxkl           | 0.00011010439 |
| clipfrac           | 0.0           |
| explained_variance | -0.0131       |
| fps                | 1537          |
| n_updates          | 86            |
| policy_entropy     | 0.37641856    |
| policy_loss        | -0.0012535788 |
| serial_timesteps   | 11008         |
| time_elapsed       | 8.09          |
| total_timesteps    | 11008         |
| value_loss         | 0.49461865    |
--------------------------------------
---------------------------------------
| approxkl           | 5.199725e-05   |
| clipfrac           | 0.0            |
| explained_variance | 0.389          |
| fps                | 1533           |
| n_updates          | 87             |
| policy_entropy     | 0.3975405      |
| policy_loss        | -0.00097046257 |
| serial_timesteps   | 11136          |
| time_elapsed       | 8.18           |
| total_timesteps    | 11136          |
| value_loss         | 2.40179        |
-------------

--------------------------------------
| approxkl           | 0.00026721245 |
| clipfrac           | 0.0           |
| explained_variance | -0.149        |
| fps                | 1299          |
| n_updates          | 103           |
| policy_entropy     | 0.3404501     |
| policy_loss        | -0.0028805449 |
| serial_timesteps   | 13184         |
| time_elapsed       | 9.56          |
| total_timesteps    | 13184         |
| value_loss         | 0.8766091     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00029998852 |
| clipfrac           | 0.0           |
| explained_variance | -0.103        |
| fps                | 1536          |
| n_updates          | 104           |
| policy_entropy     | 0.31182605    |
| policy_loss        | -0.003875536  |
| serial_timesteps   | 13312         |
| time_elapsed       | 9.66          |
| total_timesteps    | 13312         |
| value_loss         | 0.5506153     |
-------------------------

--------------------------------------
| approxkl           | 0.00027320656 |
| clipfrac           | 0.0           |
| explained_variance | 0.0354        |
| fps                | 1546          |
| n_updates          | 120           |
| policy_entropy     | 0.28333938    |
| policy_loss        | -0.001939134  |
| serial_timesteps   | 15360         |
| time_elapsed       | 11            |
| total_timesteps    | 15360         |
| value_loss         | 0.8100185     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00022085635 |
| clipfrac           | 0.0           |
| explained_variance | -0.0343       |
| fps                | 1278          |
| n_updates          | 121           |
| policy_entropy     | 0.23858824    |
| policy_loss        | -0.0038718348 |
| serial_timesteps   | 15488         |
| time_elapsed       | 11.1          |
| total_timesteps    | 15488         |
| value_loss         | 0.47564122    |
-------------------------

---------------------------------------
| approxkl           | 6.835477e-05   |
| clipfrac           | 0.0            |
| explained_variance | 0.187          |
| fps                | 1360           |
| n_updates          | 137            |
| policy_entropy     | 0.28684562     |
| policy_loss        | -0.00031041785 |
| serial_timesteps   | 17536          |
| time_elapsed       | 12.5           |
| total_timesteps    | 17536          |
| value_loss         | 0.7656888      |
---------------------------------------
--------------------------------------
| approxkl           | 0.0001490748  |
| clipfrac           | 0.0           |
| explained_variance | 0.0827        |
| fps                | 1754          |
| n_updates          | 138           |
| policy_entropy     | 0.2745923     |
| policy_loss        | -0.0022216877 |
| serial_timesteps   | 17664         |
| time_elapsed       | 12.6          |
| total_timesteps    | 17664         |
| value_loss         | 0.94021565    |
------------

--------------------------------------
| approxkl           | 0.00038920227 |
| clipfrac           | 0.0           |
| explained_variance | 0.202         |
| fps                | 1286          |
| n_updates          | 154           |
| policy_entropy     | 0.2031443     |
| policy_loss        | -0.0018237855 |
| serial_timesteps   | 19712         |
| time_elapsed       | 14            |
| total_timesteps    | 19712         |
| value_loss         | 0.87623364    |
--------------------------------------
-------------------------------------
| approxkl           | 0.0011088944 |
| clipfrac           | 0.015625     |
| explained_variance | 0.0614       |
| fps                | 1556         |
| n_updates          | 155          |
| policy_entropy     | 0.20839386   |
| policy_loss        | -0.009979036 |
| serial_timesteps   | 19840        |
| time_elapsed       | 14.1         |
| total_timesteps    | 19840        |
| value_loss         | 0.8082024    |
-------------------------------------

---------------------------------------
| approxkl           | 0.000103115024 |
| clipfrac           | 0.0            |
| explained_variance | 0.0509         |
| fps                | 1538           |
| n_updates          | 171            |
| policy_entropy     | 0.18290274     |
| policy_loss        | -0.00071523944 |
| serial_timesteps   | 21888          |
| time_elapsed       | 15.5           |
| total_timesteps    | 21888          |
| value_loss         | 0.7798482      |
---------------------------------------
--------------------------------------
| approxkl           | 0.00013436082 |
| clipfrac           | 0.0           |
| explained_variance | 0.00346       |
| fps                | 1535          |
| n_updates          | 172           |
| policy_entropy     | 0.145385      |
| policy_loss        | -0.0017772621 |
| serial_timesteps   | 22016         |
| time_elapsed       | 15.5          |
| total_timesteps    | 22016         |
| value_loss         | 0.7371157     |
------------

---------------------------------------
| approxkl           | 2.1410931e-05  |
| clipfrac           | 0.0            |
| explained_variance | 0.0707         |
| fps                | 1276           |
| n_updates          | 188            |
| policy_entropy     | 0.13751912     |
| policy_loss        | -0.00050417974 |
| serial_timesteps   | 24064          |
| time_elapsed       | 16.9           |
| total_timesteps    | 24064          |
| value_loss         | 1.0998         |
---------------------------------------
---------------------------------------
| approxkl           | 3.9142953e-05  |
| clipfrac           | 0.0            |
| explained_variance | 0.0892         |
| fps                | 1543           |
| n_updates          | 189            |
| policy_entropy     | 0.14094262     |
| policy_loss        | -0.00037526723 |
| serial_timesteps   | 24192          |
| time_elapsed       | 17             |
| total_timesteps    | 24192          |
| value_loss         | 0.88594365     |


---------------------------------------
| approxkl           | 0.0001209105   |
| clipfrac           | 0.0            |
| explained_variance | -0.0515        |
| fps                | 1562           |
| n_updates          | 204            |
| policy_entropy     | 0.110831045    |
| policy_loss        | -3.4661498e-06 |
| serial_timesteps   | 26112          |
| time_elapsed       | 18.3           |
| total_timesteps    | 26112          |
| value_loss         | 1.2536744      |
---------------------------------------
---------------------------------------
| approxkl           | 1.437671e-05   |
| clipfrac           | 0.0            |
| explained_variance | -0.0685        |
| fps                | 1536           |
| n_updates          | 205            |
| policy_entropy     | 0.12755369     |
| policy_loss        | -6.4834254e-05 |
| serial_timesteps   | 26240          |
| time_elapsed       | 18.4           |
| total_timesteps    | 26240          |
| value_loss         | 1.2106018      |


---------------------------------------
| approxkl           | 1.721903e-05   |
| clipfrac           | 0.0            |
| explained_variance | -0.0102        |
| fps                | 1099           |
| n_updates          | 221            |
| policy_entropy     | 0.10653884     |
| policy_loss        | -0.00021311635 |
| serial_timesteps   | 28288          |
| time_elapsed       | 19.8           |
| total_timesteps    | 28288          |
| value_loss         | 1.1961932      |
---------------------------------------
--------------------------------------
| approxkl           | 0.00012531885 |
| clipfrac           | 0.0           |
| explained_variance | 0.11          |
| fps                | 1291          |
| n_updates          | 222           |
| policy_entropy     | 0.11013356    |
| policy_loss        | -0.0030136628 |
| serial_timesteps   | 28416         |
| time_elapsed       | 19.9          |
| total_timesteps    | 28416         |
| value_loss         | 0.9462442     |
------------

<stable_baselines.ppo2.ppo2.PPO2 at 0x17e69ebd9b0>

###### Model 2: A2C

In [6]:
"""
a2c_model = PPO2(MlpPolicy, env, verbose=1)
a2c_model.learn(total_timesteps=30000)
"""

'\na2c_model = PPO2(MlpPolicy, env, verbose=1)\na2c_model.learn(total_timesteps=30000)\n'

### Run simulation

In [12]:
# Reset environment for simulation
state = env.reset()
done = False

# Run simulation with model
while not done:
    action, _states = ppo_model.predict(state)
    state, reward, done, info = env.step(action)
    
    for i in info:
        print(i, info[i], end=" | ")
    print()

Steps left: 29 | Inventory: 12 | Action: 0 | Reward: 0.17 | 
Steps left: 28 | Inventory: 11 | Action: 0 | Reward: 0.08 | 
Steps left: 27 | Inventory: 10 | Action: 0 | Reward: 0.09 | 
Steps left: 26 | Inventory: 9 | Action: 0 | Reward: 0.1 | 
Steps left: 25 | Inventory: 8 | Action: 0 | Reward: 0.11 | 
Steps left: 24 | Inventory: 7 | Action: 0 | Reward: 0.12 | 
Steps left: 23 | Inventory: 6 | Action: 0 | Reward: 0.14 | 
Steps left: 22 | Inventory: 5 | Action: 1 | Reward: 0.17 | 
Steps left: 21 | Inventory: 4 | Action: 1 | Reward: 0.2 | 
Steps left: 20 | Inventory: 3 | Action: 1 | Reward: 0.25 | 
Steps left: 19 | Inventory: 2 | Action: 0 | Reward: 0.33 | 
Steps left: 18 | Inventory: 6 | Action: 0 | Reward: 0.5 | 
Steps left: 17 | Inventory: 10 | Action: 0 | Reward: 0.17 | 
Steps left: 16 | Inventory: 14 | Action: 0 | Reward: 0.1 | 
Steps left: 15 | Inventory: 13 | Action: 0 | Reward: 0.07 | 
Steps left: 14 | Inventory: 12 | Action: 0 | Reward: 0.08 | 
Steps left: 13 | Inventory: 11 | Acti

In [8]:
"""
# Reset environment for simulation
state = env.reset()
done = False

# Run simulation with model
while not done:
    action, _states = a2c_model.predict(state)
    state, reward, done, info = env.step(action)
    print(info)
    # env.simulation.print_state()
""" 

'\n# Reset environment for simulation\nstate = env.reset()\ndone = False\n\n# Run simulation with model\nwhile not done:\n    action, _states = a2c_model.predict(state)\n    state, reward, done, info = env.step(action)\n    print(info)\n    # env.simulation.print_state()\n'