In [1]:
import numpy as np 
import pandas as pd 

import os
import sys

In [2]:
#addition module dir to path
module_dir = os.path.split( os.getcwd() )[:-1][0]
sys.path.append(module_dir)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

In [5]:
src_dir = os.getcwd()
data_dir = '/Users/albelyakov/Data/rl_warehouse'

#### Working with map

In [6]:
from src.envs import wh_map as wm
from src.envs import wh_objects as wo

In [7]:
import subprocess
import readline

In [8]:
def render_map(map_obj, agent_obj):
#     os.system('clear')
#     subprocess.call('reset')
    readline.clear_history()
    for i, row in enumerate(map_obj):
        to_print = list()
        for j, obj in enumerate(row):
            if (i, j) == agent_obj.coordinates:
                to_print.append(agent_obj.sprite)
            else:
                to_print.append(obj.sprite)
        print(''.join(to_print))

In [33]:
map_obj = wm.init_wh_map(wm.wh_vis_map)

In [18]:
render_map(map_obj)

++++++++++++++++++++
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+..................+
+..................+
+..................+
+$$$$$$$$$$$$$$$$$$+


In [34]:
agent = wo.Agent(
    coordinates=(18,9)
)

In [35]:
render_map(map_obj, agent)

++++++++++++++++++++
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+..................+
+........X.........+
+..................+
+$$$$$$$$$$$$$$$$$$+


In [56]:
def sim_loop():
    map_obj = wm.init_wh_map(wm.wh_vis_map)
    agent_obj = wo.Agent(
        coordinates=(18,9)
    )
    availible_actions = set(['w', 'a', 's', 'd', 'q', 't', 'g', 'i', 'r'])
    score = 0
    render_map(map_obj, agent_obj)
    while True:
        while True:
            action = input()
            if action in availible_actions:
                break
        if action == 'w':
            r = agent_obj.move(to='u',map_obj=map_obj)
            if r == 0:
                score -= 10
        elif action == 'a':
            r = agent_obj.move(to='l',map_obj=map_obj)
            if r == 0:
                score -= 10
        elif action == 's':
            r = agent_obj.move(to='d',map_obj=map_obj)
            if r == 0:
                score -= 10
        elif action == 'd':
            r = agent_obj.move(to='r',map_obj=map_obj)
            if r == 0:
                score -= 10
        elif action == 'q':
            print('Breaking simulation.')
            break
        elif action == 't':
            r = agent_obj.take_product(product_name='MacBookPro', map_obj=map_obj)
            if r == 0:
                score -= 10
        elif action == 'g':
            r = agent_obj.put_product(product_name='MacBookPro', map_obj=map_obj)
            if r == 0:
                score -= 10
            elif r == -1:
                score -= 1000
            elif r == 10:
                score += 500
        elif action == 'i':
            r = agent_obj.inspect_shelf(map_obj=map_obj)
            if r == 0:
                score -= 10
            else:
                print(r)
        elif action == 'w':
            print('Waiting...')
        score -= 10
        render_map(map_obj, agent_obj)
        print(f'Score: {score}')

In [None]:
sim_loop()

In [11]:
from IPython.display import clear_output
from time import sleep

In [13]:
from src.envs.wh_env import WarehouseEnv

In [105]:
env = WarehouseEnv()

In [91]:
epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    if reward < -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 1000
Penalties incurred: 391


In [None]:
def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.05)
        
print_frames(frames)

##### Training q-learning agent

In [95]:
env.observation_space.shape

(4,)

In [120]:
def encode_state(state, encoder):
    state = tuple(state)
    if state not in encoder:
        encoder[state] = len(encoder)
    return encoder[state]

In [142]:
%%time
"""Training the agent"""

import random

q_table = np.zeros([18 * 18 * 4, env.action_space.n])

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.2
n_epoch = 100000

# For plotting metrics
all_epochs = []
all_penalties = []

env = WarehouseEnv()
encoder = dict()
for i in range(1, n_epoch + 1):
    state = env.reset()
    state = encode_state(state, encoder)
    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_state = encode_state(next_state, encoder)
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value
        
        if reward < -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 26min 19s, sys: 1min 34s, total: 27min 54s
Wall time: 26min 44s


In [146]:
q_table.shape

(1296, 8)

In [163]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    state = encode_state(state, encoder)
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
#         print(state, action)
        state, reward, done, info = env.step(action)
        state = encode_state(state, encoder)
        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalties per episode: 999.0


In [164]:
epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

state = env.reset()
state = encode_state(state, encoder)
while not done:
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
    state = encode_state(state, encoder)
    if reward < -10:
        penalties += 1
    
    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1
    
    
print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 1000
Penalties incurred: 0


In [165]:
def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(.05)
        
print_frames(frames)

++++++++++++++++++++
+.#..#..#..#..#..#.+
+.#..#..#X.#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+..................+
+..................+
+..................+
+$$$$$$$$$$$$$$$$$$+
Timestep: 1000
State: 285
Action: 2
Reward: -10


In [26]:
from src.models.q_table import QTable

In [27]:
env = WarehouseEnv()

In [47]:
model = QTable(
    environment=env, 
    verbose=True,
    alpha=0.25,
    gamma=0.9,
    epsilon=0.15
)

In [48]:
model.train(n_epoch=40000)


100%|██████████| 40000/40000 [18:31<00:00, 36.00it/s]  

Training finished.






In [49]:
# model.verbose=True
model.evaluate_performance(episodes=100)

Results after 100 episodes:
Average timesteps per episode: 1000.0
Average penalties per episode: 999.0


{'avg_timesteps': 1000.0, 'avg_penalties': 999.0}

In [50]:
model.operate()

Timesteps taken: 1000
Penalties incurred: 0


In [None]:
model.show_operation()

++++++++++++++++++++
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+.#..#..#..#..#..#.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.#######..#######.+
+..................+
+..................+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+.##..##..##..##..#+
+..................+
+..................+
+.........X........+
+$$$$$$$$$$$$$$$$$$+
Timestep: 401
State: 666
Action: 7
Reward: -10
