In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import os
import sys

In [2]:
from IPython.display import clear_output
from time import sleep

In [3]:
# Add the parent of the notebook's working directory to sys.path so that the
# project's `src` package can be imported.
module_dir = os.path.dirname(os.getcwd())
# Guard the insert so that re-running this cell does not pile up duplicates.
if module_dir not in sys.path:
    sys.path.append(module_dir)

In [4]:
from src.envs.wh_env import WarehouseEnv

In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
# Raise pandas' display limits: show up to 500 columns and 5000 rows.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 5000

In [7]:
# Directories used by this notebook.
src_dir = os.getcwd()  # directory the notebook is run from
# The data location is machine-specific: allow an override through the
# RL_WAREHOUSE_DATA_DIR environment variable and fall back to the original
# hard-coded path when the variable is not set.
data_dir = os.environ.get(
    'RL_WAREHOUSE_DATA_DIR',
    '/Users/albelyakov/Data/rl_warehouse',
)
# module_dir is defined in an earlier cell of this notebook.
models_dir = os.path.join(module_dir, 'models')

In [8]:
from src.envs import wh_map as wm
from src.envs import wh_objects as wo

In [9]:
import subprocess
import readline

In [10]:
from keras.optimizers import Adam
from collections import Counter, deque
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LeakyReLU
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import RMSprop
from keras import backend as K
from keras.callbacks import ReduceLROnPlateau

import tensorflow as tf

from rl.agents.dqn import DQNAgent
from rl.core import Processor
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import ModelIntervalCheckpoint, FileLogger

Using TensorFlow backend.


In [11]:
# Ask the Keras TensorFlow backend which GPUs it can see (the recorded output
# of this cell is an empty list).
# NOTE(review): _get_available_gpus is a private helper (leading underscore),
# so it may not exist in other Keras versions — confirm before upgrading.
K.tensorflow_backend._get_available_gpus()

W0722 11:43:29.604804 4395738560 deprecation_wrapper.py:119] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0722 11:43:29.605510 4395738560 deprecation_wrapper.py:119] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0722 11:43:29.606575 4395738560 deprecation_wrapper.py:119] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:186: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.

W0722 11:43:29.788007 4395738560 deprecation_wrapper.py:119] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:190: The name tf.global_variables is deprecate

[]

In [18]:
def build_model(input_shape, n_actions):
    '''
    Build and compile the convolutional network used as the Q-value model.

    Layers, in order: Conv2D(32, 5x5, relu) -> Conv2D(64, 3x3, relu) ->
    MaxPooling2D(2x2) -> Dropout(0.2) -> Flatten -> Dense(128) ->
    LeakyReLU(alpha=0.15) -> Dropout(0.2) -> Dense(n_actions, linear).

    Args:
        input_shape: shape of a single input sample, forwarded to the first
            Conv2D layer as its ``input_shape``.
        n_actions: number of units of the linear output layer.

    Returns:
        The ``Sequential`` model, compiled with Adam (lr=1e-3) and 'mae' loss.
    '''
    model = Sequential()

    # Convolutional feature extractor.
    model.add(Conv2D(32, kernel_size=(5, 5),
                     activation='relu',
                     input_shape=input_shape))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    # Fully connected head ending in one linear output per action.
    model.add(Flatten())
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.15))
    model.add(Dropout(0.2))
    model.add(Dense(n_actions, activation='linear'))

    # `batch_size` is not a `compile` argument (Keras forwards unknown compile
    # kwargs to the backend function or rejects them), so it is not passed.
    model.compile(optimizer=Adam(lr=1e-3), loss='mae')
    return model

In [19]:
class CustomProcessor(Processor):
    '''
    Processor subclass that reshapes state batches before they are fed to
    the CNN.
    '''

    def process_state_batch(self, batch):
        '''
        Return ``batch`` with its second axis (index 1) removed.

        ``np.squeeze(..., axis=1)`` only drops an axis of length one and
        raises ``ValueError`` otherwise, so every batch handed to this method
        must have size 1 along axis 1.
        '''
        squeezed_batch = np.squeeze(batch, axis=1)
        return squeezed_batch

In [20]:
def build_callbacks(env_name):
    '''
    Build the list of callbacks used while training the agent.

    Args:
        env_name: label embedded in the checkpoint and log file names.

    Returns:
        A list with a ModelIntervalCheckpoint (interval=50000) writing to
        'callbacks/dqn_<env_name>_weights_{step}.h5f' and a FileLogger
        (interval=100) writing to 'dqn_<env_name>_log.json'.
    '''
    checkpoint_dir = 'callbacks'
    # Create the checkpoint folder up front so that writing checkpoints does
    # not depend on the folder already existing.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # '{step}' is deliberately left unformatted: it stays a placeholder in the
    # template handed to ModelIntervalCheckpoint.
    checkpoint_weights_filename = checkpoint_dir + '/dqn_' + env_name + '_weights_{step}.h5f'
    log_filename = 'dqn_{}_log.json'.format(env_name)

    return [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000),
        FileLogger(log_filename, interval=100),
    ]

In [21]:
# Warehouse environment used for training; every option is passed by keyword.
env = WarehouseEnv(
    # agent_* options
    agent_start_pos=(18, 9),
    agent_max_load=200,
    agent_max_volume=1000,
    # shelf_* options
    shelf_max_load=200,
    shelf_max_volume=100,
    # remaining options
    map_sketch=wm.wh_vis_map,
    catalog=None,
    max_order_line=None,
    only_one_product=True,
    frequency=0.05,
    num_turns=2000,
    simplified_state=False,
    win_size=(300, 200),
    silent=True,
)

In [22]:
# Build the network with one output per action of the environment's action space.
# NOTE(review): input_shape is hard-coded to (23, 20, 1) — confirm that it
# matches the observations produced by `env`.
model = build_model(input_shape=(23,20,1), n_actions=env.action_space.n)

W0722 11:58:27.372644 4395738560 deprecation_wrapper.py:119] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0722 11:58:27.431739 4395738560 deprecation_wrapper.py:119] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0722 11:58:27.439254 4395738560 deprecation.py:506] From /anaconda3/envs/reinforcement_learning/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0722 11:58:27.532946 4395738560 deprecation_wrapper.py:119] From /anacond

In [17]:
# Load previously saved weights into `model` from a hard-coded checkpoint path.
# NOTE(review): this cell's execution count (17) is out of sequence with the
# neighbouring cells (22 and 23), so it was not run as part of the recorded
# session in this position — confirm whether resuming from this checkpoint is
# intended when the notebook is run top to bottom.
model.load_weights('callbacks/dqn_rl-run_weights_350000.h5f')

In [23]:
# Processor instance handed to the DQN agent; it squeezes axis 1 out of each state batch.
processor = CustomProcessor()

In [24]:
# Replay memory created with limit=500000 and window_length=4.
# NOTE(review): CustomProcessor.process_state_batch squeezes axis 1, which
# numpy only allows when that axis has length 1, and the model is built with
# input_shape=(23,20,1). If keras-rl stacks `window_length` observations along
# axis 1 of each state batch (TODO confirm), window_length=4 is inconsistent
# with both and should be checked before the next run.
memory = SequentialMemory(limit=500000, window_length=4)
# Alternative exploration policy, currently disabled:
# policy = BoltzmannQPolicy()
# Epsilon-greedy policy wrapped in LinearAnnealedPolicy, which drives the
# `eps` attribute with value_max=.8, value_min=.1 and nb_steps=500000;
# value_test=.05 is the value supplied for test mode.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(), 
    attr='eps', 
    value_max=.8, 
    value_min=.1, 
    value_test=.05, 
    nb_steps=500000
)
# DQN agent wiring together the model, replay memory, policy and processor.
# NOTE(review): batch_size=512 equals nb_steps_warmup=512 — presumably the
# warm-up is meant to collect at least one batch before training; confirm.
dqn = DQNAgent(
    model=model, 
    nb_actions=env.action_space.n, 
    memory=memory, 
    nb_steps_warmup=512,
    target_model_update=1e-2, 
    policy=policy, 
    processor=processor, 
    batch_size=512
)
# Compile the agent with Adam (lr=1e-3) and 'mae' as a reported metric.
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [25]:
# Checkpoint and logging callbacks; 'rl-run' is embedded in their file names.
callbacks = build_callbacks('rl-run')

In [26]:
%%time
# Train the agent for up to 500000 steps and keep the returned history in `hist`.
# The recorded output reports that the run finished after about 219 seconds.
# NOTE(review): visualize=True presumably renders the environment during
# training — confirm that this is wanted for a run of this length.
hist = dqn.fit(env, nb_steps=500000, visualize=True, verbose=2, callbacks=callbacks)

Training for 500000 steps ...
done, took 218.823 seconds
CPU times: user 9min 49s, sys: 1min 15s, total: 11min 4s
Wall time: 3min 38s


In [27]:
# Close the environment once the training run is over.
env.close()