In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import os
import sys

In [2]:
from IPython.display import clear_output
from time import sleep

In [3]:
# Add the parent of the current working directory to sys.path so the
# project's `src` package can be imported from this notebook.
module_dir = os.path.dirname(os.getcwd())
sys.path.append(module_dir)

In [4]:
from src.envs.wh_env import WarehouseEnv

In [5]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# Raise pandas' display limits so wide/long frames are shown in full.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 5000

In [7]:
# Directory configuration for this notebook.
# src_dir is the kernel's current working directory.
src_dir = os.getcwd()
# The data location is machine-specific: allow overriding it through the
# RL_WAREHOUSE_DATA_DIR environment variable, and fall back to the original
# hard-coded path so existing setups keep working unchanged.
data_dir = os.environ.get(
    'RL_WAREHOUSE_DATA_DIR',
    '/Users/albelyakov/Data/rl_warehouse',
)
# Models live in a `models` folder next to the notebook directory.
models_dir = os.path.join(module_dir, 'models')

In [8]:
from src.envs import wh_map as wm
from src.envs import wh_objects as wo

In [9]:
import subprocess
import readline

In [57]:
from keras.optimizers import Adam
from collections import Counter, deque
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LeakyReLU
from keras.layers import Conv2D, MaxPooling2D
from keras.optimizers import RMSprop
from keras import backend as K
from keras.callbacks import ReduceLROnPlateau

import tensorflow as tf

from rl.agents.dqn import DQNAgent
from rl.core import Processor
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import ModelIntervalCheckpoint, FileLogger

In [11]:
# Show the GPU devices visible to the Keras TensorFlow backend.
# NOTE(review): `_get_available_gpus` is a private helper (leading
# underscore) and may not exist in other Keras versions — confirm before
# relying on it.
K.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [33]:
def build_model(input_shape, n_actions):
    '''
    Build and compile the convolutional Q-network.

    Architecture: two 3x3 convolutions (32, then 64 filters, ReLU), 2x2 max
    pooling, dropout, a 128-unit dense layer followed by LeakyReLU, dropout,
    and a linear output layer with one unit per action.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single observation, passed as `input_shape` to the first
        Conv2D layer.
    n_actions : int
        Number of units in the linear output layer.

    Returns
    -------
    A compiled `Sequential` model (Adam optimizer with lr=1e-3, MAE loss).
    '''
    model = Sequential([
        Conv2D(32, kernel_size=(3, 3), activation='relu',
               input_shape=input_shape),
        Conv2D(64, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
        Flatten(),
        Dense(128),
        LeakyReLU(alpha=0.15),
        Dropout(0.2),
        Dense(n_actions, activation='linear'),
    ])
    # `batch_size` is not a compile option: extra keyword arguments given to
    # compile() are forwarded to the backend function and are invalid there.
    # The training batch size is configured on the DQNAgent instead.
    model.compile(optimizer=Adam(lr=1e-3), loss='mae')
    return model

In [15]:
class CustomProcessor(Processor):
    '''
    Glue layer that sits between the agent and the environment.
    '''

    def process_state_batch(self, batch):
        '''
        Drop the second axis (axis=1) from a batch of states.

        That axis carries no information here and its presence prevents the
        batch from being fed into the CNN.
        '''
        squeezed_batch = np.squeeze(batch, axis=1)
        return squeezed_batch

In [60]:
def build_callbacks(env_name):
    '''
    Build the list of training callbacks for a run.

    Parameters
    ----------
    env_name : str
        Run/environment tag embedded in the checkpoint and log file names.

    Returns
    -------
    list
        A `ModelIntervalCheckpoint` writing weights every 50000 steps under
        the `callbacks/` directory, and a `FileLogger` writing
        `dqn_<env_name>_log.json` every 100 steps.
    '''
    checkpoint_dir = 'callbacks'
    # Create the checkpoint directory up front so that the first checkpoint
    # write (after 50000 training steps) cannot fail on a missing directory.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # `{step}` is intentionally left unformatted here; presumably the
    # checkpoint callback substitutes the current step number.
    checkpoint_weights_filename = (
        checkpoint_dir + '/' + 'dqn_' + env_name + '_weights_{step}.h5f'
    )
    log_filename = 'dqn_{}_log.json'.format(env_name)

    return [
        ModelIntervalCheckpoint(checkpoint_weights_filename, interval=50000),
        FileLogger(log_filename, interval=100),
    ]

In [40]:
# Instantiate the warehouse environment used for training.
env = WarehouseEnv(
    # map / catalogue
    map_sketch=wm.wh_vis_map,
    catalog=None,
    only_one_product=True,
    # episode settings
    num_turns=2000,
    frequency=0.05,
    max_order_line=None,
    # agent_* settings
    agent_start_pos=(18, 9),
    agent_max_load=200,
    agent_max_volume=1000,
    # shelf_* settings
    shelf_max_load=200,
    shelf_max_volume=100,
    # observation / rendering / logging
    simplified_state=False,
    win_size=(300, 300),
    silent=True,
)

In [35]:
# Build the Q-network with one output unit per action in env.action_space.
# NOTE(review): the (23, 20, 1) input shape is hard-coded — presumably the
# environment's observation grid plus a channel axis; confirm against the env.
model = build_model(input_shape=(23,20,1), n_actions=env.action_space.n)

In [36]:
# Warm-start the network from previously saved weights.
# NOTE(review): the path is relative to the current working directory and
# uses the 'rl-test' run tag, whereas build_callbacks() writes checkpoints
# under 'callbacks/' — confirm this is the intended file.
model.load_weights('dqn_rl-test_weights_350000.h5f')

In [41]:
# Processor that squeezes axis 1 out of state batches before they reach the model.
processor = CustomProcessor()

In [47]:
# Experience replay buffer for the agent.
memory = SequentialMemory(window_length=1, limit=50000)

# Exploration: epsilon-greedy with `eps` annealed linearly from 0.8 down to
# 0.1 over 500000 steps; 0.05 is the value used for testing.
# (BoltzmannQPolicy() is the alternative that was tried here.)
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=.8,
    value_min=.1,
    value_test=.05,
    nb_steps=500000,
)

# Assemble the DQN agent from the model, memory, policy and processor.
# NOTE(review): nb_steps_warmup (256) is smaller than batch_size (512) —
# confirm that sampling a batch larger than the warm-up buffer is intended.
dqn = DQNAgent(
    model=model,
    processor=processor,
    policy=policy,
    memory=memory,
    nb_actions=env.action_space.n,
    batch_size=512,
    nb_steps_warmup=256,
    target_model_update=1e-2,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [61]:
# Checkpoint and logging callbacks for this run, tagged 'rl-run'.
callbacks = build_callbacks('rl-run')

In [None]:
%%time
# Train the agent for 500000 steps; the captured output shows one log line per episode.
# NOTE(review): visualize=True presumably renders the environment every
# step, which may slow training — confirm it is needed for a full run.
hist = dqn.fit(env, nb_steps=500000, visualize=True, verbose=2, callbacks=callbacks)

Training for 500000 steps ...
   2000/500000: episode: 1, duration: 68.827s, episode steps: 2000, steps per second: 29, episode reward: -8246.000, mean reward: -4.123 [-10.000, 0.000], mean action: 3.154 [0.000, 6.000], mean observation: 56.837 [0.000, 255.000], loss: 69.740472, mean_absolute_error: 60.195942, mean_q: 76.270905, mean_eps: 0.798421
   4000/500000: episode: 2, duration: 76.783s, episode steps: 2000, steps per second: 26, episode reward: -8876.000, mean reward: -4.438 [-10.000, 0.000], mean action: 2.884 [0.000, 6.000], mean observation: 54.902 [0.000, 255.000], loss: 83.409856, mean_absolute_error: 66.368887, mean_q: 84.891629, mean_eps: 0.795801
   6000/500000: episode: 3, duration: 76.668s, episode steps: 2000, steps per second: 26, episode reward: -8653.000, mean reward: -4.327 [-10.000, 0.000], mean action: 2.884 [0.000, 6.000], mean observation: 54.902 [0.000, 255.000], loss: 75.026836, mean_absolute_error: 81.907022, mean_q: 102.718053, mean_eps: 0.793001
   8000/5

In [53]:
# Close the environment once it is no longer needed
# (presumably releases the render window — confirm in WarehouseEnv).
env.close()