### Imports

In [1]:
# Import libraries
import warnings
warnings.filterwarnings("ignore")
import gym
import numpy as np
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2, A2C
from matplotlib import pyplot as plt

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
# Import classes
from rl_environment import Environment

### Environment creation

In [3]:
# Parameters
# All tunable settings for the environment live in this cell.
number_of_regional_warehouses = 2
max_inventory_amount_regional_warehouses = 40
max_inventory_amount_central_warehouse = 100
customer_demand_per_step = [1, 2]  # List needs one entry per regional warehouse
simulation_length = 30
lead_time = 2
shipment_amount = 5

# Fail fast if the demand list and the warehouse count drift apart when tuning.
assert len(customer_demand_per_step) == number_of_regional_warehouses, \
    "customer_demand_per_step needs exactly one entry per regional warehouse"

In [4]:
# Collect the constructor arguments in one place, then build the environment.
environment_settings = dict(
    number_of_regional_wh=number_of_regional_warehouses,
    rw_inventory_limit=max_inventory_amount_regional_warehouses,
    cw_inventory_limit=max_inventory_amount_central_warehouse,
    demand=customer_demand_per_step,
    sim_length=simulation_length,
    lead_time=lead_time,
    shipment_amount=shipment_amount,
)
env = Environment(**environment_settings)

# Print the environment summary for the freshly created environment.
env.print_environment_information()

Simulation created with the following parameters:
________________________________________________________________________________
Simulation | Round 1
-------------------
-> Active shipments:
No active shipments

-> Warehouses:
central_warehouse ; Inventory: 33
regional_warehouse_1 ; ID: 1 ; Inventory: 13 ; Demand: 1 ; Lost sales: 0
regional_warehouse_2 ; ID: 2 ; Inventory: 13 ; Demand: 2 ; Lost sales: 0
________________________________________________________________________________
Environment Information
-----------------------
Observation space: MultiDiscrete([41 41])
Action space: MultiDiscrete([2 2])
Starting state: [13 13]
________________________________________________________________________________


### RL model training

###### Model 1: PPO
Proximal Policy Optimization (PPO); see
[OpenAI's PPO announcement](https://openai.com/blog/openai-baselines-ppo/).

In [None]:
# Train a PPO2 agent with an MLP policy on the environment.
# verbose=1 makes the library print its training statistics.
training_timesteps = 30000
ppo_model = PPO2(policy=MlpPolicy, env=env, verbose=1)
ppo_model.learn(total_timesteps=training_timesteps)

Wrapping the env in a DummyVecEnv.




Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Please use `layer.__call__` method instead.





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



--------------------------------------
| approxkl           | 2.2266217e-05 |
| clipfrac           | 0.0           |
| explained_variance | -0.123        |
| fps                | 367           |
| n_updates          | 1             |
| policy_entropy     | 1.3862704     |
| policy_loss        | -0.0019341036 |
| serial_timesteps   | 128           |
| time_elapsed       | 0             |
| total_timesteps    | 128           |
| value_loss         | 4.889826      |
--------------------------------------
--------------------------------------
| approxkl           | 2.8493436e-05 |
| clipfrac           | 0.0           |
| explained_variance | -0.0464       |
| fps                | 1281          |
| n_updates          | 2 

---------------------------------------
| approxkl           | 6.3880725e-05  |
| clipfrac           | 0.0            |
| explained_variance | -0.14          |
| fps                | 1314           |
| n_updates          | 10             |
| policy_entropy     | 1.384072       |
| policy_loss        | -0.00010627264 |
| serial_timesteps   | 1280           |
| time_elapsed       | 1.17           |
| total_timesteps    | 1280           |
| value_loss         | 3.0822582      |
---------------------------------------
--------------------------------------
| approxkl           | 6.300825e-05  |
| clipfrac           | 0.0           |
| explained_variance | -0.0732       |
| fps                | 1412          |
| n_updates          | 11            |
| policy_entropy     | 1.3836951     |
| policy_loss        | -0.0030692029 |
| serial_timesteps   | 1408          |
| time_elapsed       | 1.27          |
| total_timesteps    | 1408          |
| value_loss         | 1.5062683     |
------------

--------------------------------------
| approxkl           | 0.00020176839 |
| clipfrac           | 0.0           |
| explained_variance | 0.22          |
| fps                | 1289          |
| n_updates          | 27            |
| policy_entropy     | 1.3682339     |
| policy_loss        | -0.0025512653 |
| serial_timesteps   | 3456          |
| time_elapsed       | 2.76          |
| total_timesteps    | 3456          |
| value_loss         | 1.2413185     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00012406058 |
| clipfrac           | 0.0           |
| explained_variance | 0.486         |
| fps                | 1489          |
| n_updates          | 28            |
| policy_entropy     | 1.3646393     |
| policy_loss        | -0.0024825481 |
| serial_timesteps   | 3584          |
| time_elapsed       | 2.86          |
| total_timesteps    | 3584          |
| value_loss         | 0.8292726     |
-------------------------

--------------------------------------
| approxkl           | 0.00011551056 |
| clipfrac           | 0.0           |
| explained_variance | 0.547         |
| fps                | 1266          |
| n_updates          | 44            |
| policy_entropy     | 1.3275169     |
| policy_loss        | -0.0010967262 |
| serial_timesteps   | 5632          |
| time_elapsed       | 4.36          |
| total_timesteps    | 5632          |
| value_loss         | 0.65130484    |
--------------------------------------
--------------------------------------
| approxkl           | 0.00029494683 |
| clipfrac           | 0.0           |
| explained_variance | 0.467         |
| fps                | 1285          |
| n_updates          | 45            |
| policy_entropy     | 1.3324854     |
| policy_loss        | -0.0039170133 |
| serial_timesteps   | 5760          |
| time_elapsed       | 4.46          |
| total_timesteps    | 5760          |
| value_loss         | 0.8731039     |
-------------------------

--------------------------------------
| approxkl           | 0.00033442062 |
| clipfrac           | 0.0           |
| explained_variance | 0.313         |
| fps                | 1270          |
| n_updates          | 61            |
| policy_entropy     | 1.3087366     |
| policy_loss        | -0.0033536437 |
| serial_timesteps   | 7808          |
| time_elapsed       | 6             |
| total_timesteps    | 7808          |
| value_loss         | 0.8166224     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00024974527 |
| clipfrac           | 0.0           |
| explained_variance | -0.0151       |
| fps                | 1303          |
| n_updates          | 62            |
| policy_entropy     | 1.3130726     |
| policy_loss        | -0.0025029262 |
| serial_timesteps   | 7936          |
| time_elapsed       | 6.1           |
| total_timesteps    | 7936          |
| value_loss         | 1.4055517     |
-------------------------

-------------------------------------
| approxkl           | 0.0010161051 |
| clipfrac           | 0.0          |
| explained_variance | 0.257        |
| fps                | 1049         |
| n_updates          | 78           |
| policy_entropy     | 1.2530099    |
| policy_loss        | -0.004909116 |
| serial_timesteps   | 9984         |
| time_elapsed       | 7.73         |
| total_timesteps    | 9984         |
| value_loss         | 1.1349053    |
-------------------------------------
--------------------------------------
| approxkl           | 0.00033196702 |
| clipfrac           | 0.0           |
| explained_variance | 0.281         |
| fps                | 1106          |
| n_updates          | 79            |
| policy_entropy     | 1.2403455     |
| policy_loss        | -0.0011632224 |
| serial_timesteps   | 10112         |
| time_elapsed       | 7.85          |
| total_timesteps    | 10112         |
| value_loss         | 1.5637648     |
--------------------------------------

-------------------------------------
| approxkl           | 0.0006372405 |
| clipfrac           | 0.0          |
| explained_variance | 0.272        |
| fps                | 1050         |
| n_updates          | 95           |
| policy_entropy     | 1.0999683    |
| policy_loss        | -0.005563882 |
| serial_timesteps   | 12160        |
| time_elapsed       | 10           |
| total_timesteps    | 12160        |
| value_loss         | 1.2624168    |
-------------------------------------
---------------------------------------
| approxkl           | 0.00017371119  |
| clipfrac           | 0.0            |
| explained_variance | 0.345          |
| fps                | 1282           |
| n_updates          | 96             |
| policy_entropy     | 1.13131        |
| policy_loss        | -0.00069355615 |
| serial_timesteps   | 12288          |
| time_elapsed       | 10.1           |
| total_timesteps    | 12288          |
| value_loss         | 1.0350087      |
--------------------------

--------------------------------------
| approxkl           | 4.3705026e-05 |
| clipfrac           | 0.0           |
| explained_variance | 0.202         |
| fps                | 1233          |
| n_updates          | 112           |
| policy_entropy     | 1.1889125     |
| policy_loss        | -0.0011498544 |
| serial_timesteps   | 14336         |
| time_elapsed       | 12.1          |
| total_timesteps    | 14336         |
| value_loss         | 1.2993476     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0003866016  |
| clipfrac           | 0.0           |
| explained_variance | 0.291         |
| fps                | 1104          |
| n_updates          | 113           |
| policy_entropy     | 1.2004758     |
| policy_loss        | -0.0025347956 |
| serial_timesteps   | 14464         |
| time_elapsed       | 12.2          |
| total_timesteps    | 14464         |
| value_loss         | 1.2637326     |
-------------------------

--------------------------------------
| approxkl           | 0.00024489406 |
| clipfrac           | 0.0           |
| explained_variance | 0.325         |
| fps                | 786           |
| n_updates          | 129           |
| policy_entropy     | 1.1914282     |
| policy_loss        | -0.004459976  |
| serial_timesteps   | 16512         |
| time_elapsed       | 13.8          |
| total_timesteps    | 16512         |
| value_loss         | 1.2704209     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00025785872 |
| clipfrac           | 0.0           |
| explained_variance | 0.329         |
| fps                | 1497          |
| n_updates          | 130           |
| policy_entropy     | 1.2045563     |
| policy_loss        | -3.091758e-05 |
| serial_timesteps   | 16640         |
| time_elapsed       | 13.9          |
| total_timesteps    | 16640         |
| value_loss         | 1.1528748     |
-------------------------

--------------------------------------
| approxkl           | 0.00065529475 |
| clipfrac           | 0.005859375   |
| explained_variance | 0.181         |
| fps                | 1557          |
| n_updates          | 146           |
| policy_entropy     | 1.2186254     |
| policy_loss        | -0.008459974  |
| serial_timesteps   | 18688         |
| time_elapsed       | 15.4          |
| total_timesteps    | 18688         |
| value_loss         | 1.8177905     |
--------------------------------------
--------------------------------------
| approxkl           | 0.00042315098 |
| clipfrac           | 0.0           |
| explained_variance | 0.256         |
| fps                | 1268          |
| n_updates          | 147           |
| policy_entropy     | 1.2482512     |
| policy_loss        | -0.005111173  |
| serial_timesteps   | 18816         |
| time_elapsed       | 15.5          |
| total_timesteps    | 18816         |
| value_loss         | 1.4789084     |
-------------------------

--------------------------------------
| approxkl           | 0.00017144566 |
| clipfrac           | 0.0           |
| explained_variance | 0.375         |
| fps                | 1279          |
| n_updates          | 163           |
| policy_entropy     | 1.191437      |
| policy_loss        | -0.0022076068 |
| serial_timesteps   | 20864         |
| time_elapsed       | 17            |
| total_timesteps    | 20864         |
| value_loss         | 0.8468956     |
--------------------------------------
--------------------------------------
| approxkl           | 0.0001703097  |
| clipfrac           | 0.0           |
| explained_variance | 0.369         |
| fps                | 1535          |
| n_updates          | 164           |
| policy_entropy     | 1.1954883     |
| policy_loss        | -0.0035842299 |
| serial_timesteps   | 20992         |
| time_elapsed       | 17.1          |
| total_timesteps    | 20992         |
| value_loss         | 0.8859416     |
-------------------------

###### Learning Curve

In [None]:
# Plot the reward history stored on the environment as the learning curve.
fig, ax = plt.subplots()
ax.plot(env.total_reward)
ax.set_title("Learning curve")
# NOTE(review): env.total_reward presumably holds one value per episode;
# confirm in rl_environment and rename the x-axis label accordingly.
ax.set_xlabel("Entry in env.total_reward")
ax.set_ylabel("Total reward")
plt.show()

### Run simulation

In [None]:
# Run one episode with the trained model, printing the step info and
# recording the inventory level of every regional warehouse at each step.

# Reset environment for simulation
state = env.reset()
done = False

# Graph data: one inventory history list per regional warehouse id
inventory_levels = {
    rw_id: [] for rw_id in env.simulation.get_regional_warehouses()
}

# Run simulation with model
while not done:
    # NOTE(review): predict() is called with its defaults, so actions may be
    # sampled stochastically; consider deterministic=True for a repeatable
    # evaluation run (confirm against the stable-baselines docs).
    action, _states = ppo_model.predict(state)
    state, reward, done, info = env.step(action)

    # info["Inventory:"] is read positionally; histories are keyed 1..n,
    # i.e. the first inventory value goes to warehouse id 1.
    for warehouse_id, inventory in enumerate(info["Inventory:"], start=1):
        inventory_levels[warehouse_id].append(inventory)

    # Print every info entry of this step on a single line
    for key, value in info.items():
        print(key, value, end=" | ")
    print()

print("\n", env.evaluation_parameters())

###### Inventory level graph

In [None]:
# Draw one labelled figure per regional warehouse from its recorded
# inventory history.
for rw_id, levels in inventory_levels.items():
    print("Regional warehouse", rw_id)
    fig, ax = plt.subplots()
    ax.plot(levels)
    ax.set_title("Regional warehouse {}".format(rw_id))
    ax.set_xlabel("Simulation step")
    ax.set_ylabel("Inventory")
    plt.show()