In [5]:
import os
import time
import copy
import multiprocessing as mp
import sys
sys.path.append("..")

import random
import numpy as np
from collections import namedtuple

from envs.test_env_v2 import TestEnv_v2
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [6]:
# Width of the first hidden layer of Net; the second hidden layer uses half of it.
HIDDEN_SIZE = 128
# Total number of episodes requested per training iteration (divided among the workers).
BATCH_SIZE = 64
# Reward percentile handed to filter_batch; only episodes strictly above it are kept.
PERCENTILE = 70
# Learning rate of the Adam optimizer used by the main training loop.
LEARNING_RATE = 0.0005
# Number of gradient steps taken on each filtered batch in the main training loop.
REUSE_TIMES = 6
# Per-step reward multiplier: each reward is scaled by GAMMA ** step_index.
# NOTE(review): the value is > 1, so later steps weigh more than earlier ones — confirm this is intended.
GAMMA = 1.001

# Size of the multiprocessing pool and number of rollout tasks launched per iteration.
USE_CORES = 8

# Number of batches get_init_batch yields before it stops.
INIT_ROUNDS = 10

In [7]:
class Net(nn.Module):
    """Policy network producing one unnormalised score per action.

    A two-layer ReLU MLP is applied along the last axis of the input, the
    result is flattened per sample, and a final linear layer maps it to
    ``n_actions`` scores. No softmax is applied here.

    Args:
        obs_size: indexable with at least two entries; ``obs_size[1]`` is the
            size of the last input axis and ``obs_size[0]`` the number of rows
            per sample (input is expected as ``(batch, obs_size[0], obs_size[1])``).
        hidden_size: width of the first hidden layer; the second uses half of it.
        n_actions: number of output scores.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        row_count, feature_count = obs_size[0], obs_size[1]
        half_hidden = int(hidden_size / 2)

        # Attribute names and layer order are kept stable so saved state dicts still load.
        self.net = nn.Sequential(
            nn.Linear(feature_count, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, half_hidden),
            nn.ReLU(),
        )
        self.out = nn.Linear(row_count * half_hidden, n_actions)

    def forward(self, x):
        """Return the action scores for a batch of observations."""
        hidden = self.net(x)
        # Collapse everything except the batch axis: (batch, obs_size[0] * hidden_size/2).
        flattened = hidden.view(hidden.size(0), -1)
        return self.out(flattened)

# One finished episode: its accumulated (GAMMA-weighted) reward, the list of
# EpisodeStep records, and the last `info` value returned by the environment.
Episode = namedtuple('Episode', ['reward', 'steps', 'info'])
# One transition: the observation the agent saw and the action it took.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])

In [8]:
global total_batch  # no-op at module scope; kept so the cell's top-level statements are unchanged
def produce_batches(env_input, net, batch_size):
    """Roll out the current policy until ``batch_size`` episodes are collected.

    Works on a deep copy of ``env_input`` so the caller's environment is not
    mutated. Actions are sampled from the softmax of the network scores.
    An episode ends when the environment reports ``is_done`` or when
    ``ext_info[3]`` exceeds 10000.

    Args:
        env_input: environment exposing ``reset()`` and ``step(action)``;
            ``step`` returns ``(next_obs, reward, is_done, ext_info)``.
        net: policy network; it is called on a CUDA tensor of shape
            ``(1, *obs.shape)``.
        batch_size: number of episodes to collect; must be at least 1.

    Returns:
        list of ``Episode`` tuples of length ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the original equality
            check could never be satisfied and looped forever).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got %r" % (batch_size,))

    env = copy.deepcopy(env_input)
    # Re-seed from OS entropy so forked workers do not share one random stream.
    np.random.seed()
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    sm = nn.Softmax(dim=1)

    while True:
        # Rollouts only need the forward pass; skipping autograd avoids
        # building a graph on every environment step.
        with torch.no_grad():
            obs_v = torch.FloatTensor([obs]).cuda()
            act_probs = sm(net(obs_v)).cpu().numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, ext_info = env.step(action)
        # Weight the reward by GAMMA raised to the index of the current step.
        episode_reward += reward * (GAMMA ** len(episode_steps))
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done or ext_info[3] > 10000:
            batch.append(Episode(reward=episode_reward, steps=episode_steps, info=ext_info))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()

            if len(batch) >= batch_size:
                return batch

        obs = next_obs

def collect_results(mini_batch):
    """Pool callback: append one worker's episodes to the module-level ``total_batch``."""
    shared_batch = total_batch
    shared_batch.extend(mini_batch)

def apply_async_with_callback(pool, core_num, env, net, batch_size):
    """Collect ``batch_size`` episodes using up to ``core_num`` pool tasks.

    Each task runs ``produce_batches`` and hands its episodes to
    ``collect_results``. The function blocks until every task has finished;
    ``get()`` re-raises any exception raised inside a worker.

    The episode count is split as evenly as possible: the first
    ``batch_size % core_num`` tasks receive one extra episode, so the total is
    exactly ``batch_size`` even when it is not divisible by ``core_num``.
    Tasks whose share would be zero are not launched.

    Args:
        pool: object exposing ``apply_async(func, args=..., callback=...)``.
        core_num: number of tasks to split the work into.
        env: environment forwarded to ``produce_batches``.
        net: policy network forwarded to ``produce_batches``.
        batch_size: total number of episodes to collect.
    """
    if core_num < 1:
        return

    base_share, remainder = divmod(int(batch_size), core_num)
    shares = [base_share + (1 if idx < remainder else 0) for idx in range(core_num)]

    pending = [
        pool.apply_async(produce_batches, args=(env, net, share),
                         callback=collect_results)
        for share in shares
        if share > 0
    ]
    for task in pending:
        task.get()

In [9]:
def filter_batch(batch, percentile):
    """Keep the episodes whose reward is strictly above a percentile bound.

    Args:
        batch: sequence of records exposing ``reward``, ``steps`` and ``info``;
            each step exposes ``observation`` and ``action``.
        percentile: percentile (0-100) of the batch rewards used as the bound.

    Returns:
        Tuple ``(elite_batch, train_obs, train_act, reward_bound, reward_mean,
        info_mean)`` where ``train_obs``/``train_act`` are the flattened
        observations/actions of the elite episodes and ``info_mean`` is the
        element-wise mean of ``info`` over the *whole* batch.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))
    info_mean = np.mean(np.array([episode.info for episode in batch]), axis=0)

    # Strict comparison: episodes sitting exactly on the bound are dropped.
    elite_batch = [
        episode
        for episode, episode_reward in zip(batch, rewards)
        if episode_reward > reward_bound
    ]
    train_obs = [step.observation for episode in elite_batch for step in episode.steps]
    train_act = [step.action for episode in elite_batch for step in episode.steps]

    return elite_batch, train_obs, train_act, reward_bound, reward_mean, info_mean

In [10]:
def get_init_batch(env,batch_size):
    """Yield batches of episodes played with uniformly random actions.

    Used to warm-start the policy before on-policy training. The generator
    stops after ``INIT_ROUNDS`` batches have been yielded. A progress marker
    is printed after every finished episode.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step``
            returns ``(next_obs, reward, is_done, ext_info)``. Actions are
            drawn from ``range(env.action_num)``; if the attribute is missing
            the previous hard-coded range of 100 actions is used.
        batch_size: number of episodes per yielded batch; must be at least 1.

    Yields:
        list of ``Episode`` tuples of length ``batch_size``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the original equality
            check could never be satisfied and looped forever).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got %r" % (batch_size,))

    # Stay consistent with the action count the network is built with.
    n_actions = getattr(env, "action_num", 100)

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    test_num = 0
    times = 0
    while True:
        action = random.randrange(n_actions)
        next_obs, reward, is_done,  ext_info = env.step(action)
        # Weight the reward by GAMMA raised to the index of the current step.
        episode_reward += reward * (GAMMA ** len(episode_steps))
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps, info=ext_info))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            # Progress output: a "|" separator before every group of eight episodes.
            if times%8 == 0:
                print("|", end='')
            print("%d"% (times%8), end='',flush=True)
            times += 1
            if len(batch) >= batch_size:
                print(" ")
                yield batch
                batch = []
                test_num +=1
                times = 0
                if test_num == INIT_ROUNDS:
                    break
        obs = next_obs

In [11]:
if __name__ == "__main__":
    # Environment whose deep copies are stepped by the rollout workers.
    env = TestEnv_v2()
    # Optional GPU pinning, kept for reference:
    # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
    # os.environ["CUDA_VISIBLE_DEVICES"]="1"

    # Worker pool used for parallel episode collection.
    pool = mp.Pool(processes=USE_CORES)

    # Network dimensions are taken from the environment.
    obs_size = env.observation_size
    n_actions = env.action_num
    net = Net(obs_size, HIDDEN_SIZE, n_actions)

    # Spread the forward/backward pass across all visible GPUs when there are several.
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
        net = nn.DataParallel(net)
    net = net.cuda()

    # Cross-entropy between the network scores and the elite actions.
    objective = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
    writer = SummaryWriter(comment="-test")

Let's use 4 GPUs!


In [None]:
    # Main cross-entropy-method training loop. Runs until the kernel is
    # interrupted; every iteration collects BATCH_SIZE episodes in parallel,
    # keeps the elite ones (merged with the elites carried over from earlier
    # iterations) and fits the network to the elite actions.
    writer = SummaryWriter(comment="-test")

    # net.load_state_dict(torch.load('net_params_init.pkl'))
    # NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
    net.load_state_dict(torch.load('net_params.pkl'))
    optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)

    print("start training!!!")
    start = time.time()

    res_batch = []
    iter_no = 0
    # try/finally: the loop never exits normally, so without it the writer
    # would never be closed when training is interrupted or fails.
    try:
        while True:
            # collect_results appends the workers' episodes to this module-level list.
            total_batch = []
            apply_async_with_callback(pool, USE_CORES, env, net, BATCH_SIZE)

            res_batch, obs, acts, reward_b, reward_m, info_m = filter_batch(res_batch + total_batch, PERCENTILE)
            if not res_batch:
                # No episode beat the bound (e.g. all rewards equal): collect again.
                continue

            iter_no += 1

            localtime = time.asctime( time.localtime(time.time()) )
            # The continuation line below is part of the string literal; its
            # leading spaces are kept exactly so the printed text is unchanged.
            print("%d: reward_mean=%.1f, reward_bound=%.1f, round_mean=%.1f, \
        distance_mean=%.1f, steps=%d "% \
                  (iter_no,  reward_m, reward_b, info_m[1], info_m[2], info_m[3]))

            obs_v = torch.FloatTensor(obs).cuda()
            acts_v = torch.LongTensor(acts).cuda()
            # Carry at most the 32 most recent elite episodes into the next iteration.
            res_batch = res_batch[-32:]

            # Take several gradient steps on the same elite data.
            for i in range(REUSE_TIMES):
                optimizer.zero_grad()
                action_scores_v = net(obs_v)
                loss_v = objective(action_scores_v, acts_v)
                loss_v.backward()
                optimizer.step()

                print("-", end='')
            print(" ")
            if iter_no%10 == 0:
                torch.save(net.state_dict(), 'net_params.pkl')

            end = time.time()
            print("iter time: %d s, localtime:"% (int(end - start)), localtime)
            start = end

            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            writer.add_scalar("round_mean", info_m[1], iter_no)
            writer.add_scalar("distance_mean", info_m[2], iter_no)
            writer.add_scalar("step_mean", info_m[3], iter_no)
            # if reward_m > 500:
            #     print("Solved!")
            #     break

            # Drop references to GPU tensors before releasing cached memory.
            del loss_v, acts_v, obs_v, action_scores_v, info_m, reward_m
            torch.cuda.empty_cache()
    finally:
        writer.close()

start training!!!
1: reward_mean=-12284.9, reward_bound=-8278.5, round_mean=32.9,     distance_mean=79.8, steps=1131 
------ 
iter time: 53 s, localtime: Wed Nov 21 14:01:34 2018
2: reward_mean=-10592.0, reward_bound=-6954.4, round_mean=33.5,     distance_mean=80.6, steps=1052 
------ 
iter time: 30 s, localtime: Wed Nov 21 14:02:18 2018
3: reward_mean=-10471.7, reward_bound=-6807.6, round_mean=32.8,     distance_mean=80.3, steps=1050 
------ 
iter time: 31 s, localtime: Wed Nov 21 14:02:49 2018
4: reward_mean=-10735.3, reward_bound=-6732.6, round_mean=32.5,     distance_mean=80.0, steps=1052 
------ 
iter time: 31 s, localtime: Wed Nov 21 14:03:21 2018
5: reward_mean=-10629.0, reward_bound=-6436.6, round_mean=33.5,     distance_mean=80.6, steps=1047 
------ 
iter time: 31 s, localtime: Wed Nov 21 14:03:52 2018
6: reward_mean=-10736.5, reward_bound=-6237.1, round_mean=33.1,     distance_mean=79.9, steps=1044 
------ 
iter time: 30 s, localtime: Wed Nov 21 14:04:23 2018
7: reward_mean=-

In [None]:
    # Warm-up: fit the network to the better episodes of a random policy and
    # save the result as the initial checkpoint.
    optimizer = optim.Adam(params=net.parameters(), lr=0.001)

    for batch in get_init_batch(env, 32):
        batch, obs, acts, reward_b, reward_m, info_m = filter_batch(batch, 30)
        if not obs:
            # filter_batch keeps only episodes strictly above the bound, so a
            # batch with identical rewards yields nothing to train on; the
            # forward pass would fail on an empty tensor.
            continue

        obs_v = torch.FloatTensor(obs).cuda()
        acts_v = torch.LongTensor(acts).cuda()

        # Take several gradient steps on the same elite data.
        for i in range(5):
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("-", end='')
        print(" ")

        print("reward_mean=%.1f, reward_bound=%.1f, round_mean=%.1f, \
        distance_mean=%.1f, steps=%d "% \
          (reward_m, reward_b, info_m[1], info_m[2], info_m[3]))
        # Drop references to GPU tensors before releasing cached memory.
        del loss_v, acts_v, obs_v, action_scores_v, info_m, reward_m
        torch.cuda.empty_cache()

    torch.cuda.empty_cache()
    torch.save(net.state_dict(), 'net_params_init.pkl')