In [4]:
import os
import time
import copy
import multiprocessing as mp
import sys
sys.path.append("..")

import random
import numpy as np
from collections import namedtuple

from envs.test_env_v2 import TestEnv_v2
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Width of the first hidden layer of Net; the second hidden layer uses half of it.
HIDDEN_SIZE = 128
# Total number of episodes requested per training iteration (split across workers).
BATCH_SIZE = 64
# Percentile of episode rewards used as the elite cut-off in filter_batch.
PERCENTILE = 70
# Adam step size used by the main training loop.
LEARNING_RATE = 0.0005
# Number of gradient steps taken on each filtered batch.
REUSE_TIMES = 6
# Per-step reward weight: the reward of step t is multiplied by GAMMA ** t.
# NOTE(review): the value is > 1, so later steps weigh more than earlier ones —
# confirm this is intended rather than a typo for a discount below 1.
GAMMA = 1.001

# Number of worker processes in the rollout pool.
USE_CORES = 8

# Number of batches get_init_batch yields before it stops.
INIT_ROUNDS = 10

In [3]:
class Net(nn.Module):
    """Policy network: a per-row MLP followed by one linear read-out.

    The two-layer ReLU stack is applied along the last axis of the input;
    everything after the batch axis is then flattened and projected to
    ``n_actions`` raw scores.

    Args:
        obs_size: indexable pair. ``obs_size[1]`` is the input feature width,
            ``obs_size[0]`` the number of rows flattened before the read-out.
        hidden_size: width of the first hidden layer (the second uses half).
        n_actions: number of output scores.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        row_count, feature_count = obs_size[0], obs_size[1]
        half_hidden = int(hidden_size / 2)
        # The attribute names `net` and `out` are the state_dict key prefixes,
        # so they must stay as they are for saved checkpoints to load.
        self.net = nn.Sequential(
            nn.Linear(feature_count, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, half_hidden),
            nn.ReLU(),
        )
        self.out = nn.Linear(row_count * half_hidden, n_actions)

    def forward(self, x):
        """Return raw (pre-softmax) action scores of shape (batch, n_actions)."""
        hidden = self.net(x)
        # Collapse all non-batch axes: (batch, obs_size[0] * hidden_size/2).
        flattened = hidden.view(hidden.size(0), -1)
        return self.out(flattened)

# One finished episode: its accumulated (GAMMA-weighted) reward, the list of
# recorded steps, and the last `info` value returned by the environment.
Episode = namedtuple('Episode', ['reward', 'steps', 'info'])
# One recorded decision: the observation seen and the action taken from it.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])

In [4]:
global total_batch  # no-op at module scope; the list itself is created by the training cell
def produce_batches(env_input, net, batch_size, max_steps=10000):
    """Play episodes with the policy ``net`` until ``batch_size`` have finished.

    The function works on a deep copy of ``env_input``, so the caller's
    environment object is never stepped. Actions are sampled from the softmax
    of the network's scores.

    Args:
        env_input: environment exposing ``reset() -> obs`` and
            ``step(action) -> (obs, reward, done, info)``.
        net: torch module mapping a batch of observations to action scores.
        batch_size: number of finished episodes to return. A value ``<= 0``
            returns an empty list instead of looping forever.
        max_steps: an episode is cut off once ``info[3]`` exceeds this value.
            ``info[3]`` is printed as "steps" by the training log in this
            file — presumably the environment's step counter; confirm
            against the environment implementation.

    Returns:
        A list of ``batch_size`` ``Episode`` tuples.
    """
    if batch_size <= 0:
        return []

    env = copy.deepcopy(env_input)
    # Reseed from fresh entropy: processes forked from the same parent would
    # otherwise all start from the same NumPy RNG state.
    np.random.seed()

    # Run inference wherever the model lives instead of assuming CUDA.
    try:
        device = next(net.parameters()).device
    except StopIteration:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    sm = nn.Softmax(dim=1)
    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()

    while True:
        # Pure inference: no autograd graph is needed for sampling actions.
        with torch.no_grad():
            obs_v = torch.FloatTensor([obs]).to(device)
            act_probs = sm(net(obs_v)).cpu().numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, ext_info = env.step(action)
        # Step t contributes reward * GAMMA ** t to the episode total.
        episode_reward += reward * (GAMMA ** len(episode_steps))
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done or ext_info[3] > max_steps:
            batch.append(Episode(reward=episode_reward, steps=episode_steps, info=ext_info))
            if len(batch) == batch_size:
                return batch
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()

        obs = next_obs

def collect_results(mini_batch):
    """Pool callback: add one worker's episodes to the shared ``total_batch``.

    ``total_batch`` is a module-level list; it has to exist before the first
    callback fires.
    """
    for episode in mini_batch:
        total_batch.append(episode)

def apply_async_with_callback(pool, core_num, env, net, batch_size):
    """Collect ``batch_size`` episodes using up to ``core_num`` pool workers.

    The request is split as evenly as possible: every worker gets
    ``batch_size // core_num`` episodes and the first
    ``batch_size % core_num`` workers get one more, so the shares always add
    up to ``batch_size``. Workers whose share would be zero are not started.
    Each worker's result is handed to ``collect_results``.

    Args:
        pool: object with a ``multiprocessing.Pool``-style ``apply_async``.
        core_num: maximum number of tasks to submit.
        env: environment passed through to ``produce_batches``.
        net: policy network passed through to ``produce_batches``.
        batch_size: total number of episodes to collect.

    Returns:
        None. Blocks until every submitted task has finished; an exception
        raised inside a worker is re-raised here by ``AsyncResult.get()``.
    """
    batch_size = int(batch_size)
    if core_num <= 0 or batch_size <= 0:
        return

    base_share, remainder = divmod(batch_size, core_num)
    shares = [base_share + 1 if idx < remainder else base_share
              for idx in range(core_num)]

    pending = [
        pool.apply_async(produce_batches, args=(env, net, share),
                         callback=collect_results)
        for share in shares
        if share > 0
    ]
    for async_result in pending:
        async_result.get()

In [5]:
def filter_batch(batch, percentile):
    """Select the elite episodes of ``batch`` and flatten them for training.

    An episode is elite when its reward is strictly greater than the
    ``percentile``-th percentile of all rewards in ``batch``.

    Args:
        batch: sequence of episodes exposing ``reward``, ``steps`` and ``info``;
            each step exposes ``observation`` and ``action``.
        percentile: percentile (0-100) used as the reward cut-off.

    Returns:
        Tuple ``(elite_batch, train_obs, train_act, reward_bound, reward_mean,
        info_mean)``: the elite episodes, their observations and actions as
        flat lists, the cut-off, the mean reward of the whole batch, and the
        mean of the ``info`` values taken along axis 0.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))
    info_mean = np.mean(np.array([episode.info for episode in batch]), axis=0)

    # Strict comparison: episodes sitting exactly on the bound are dropped.
    elite_batch = [
        episode
        for episode, episode_reward in zip(batch, rewards)
        if episode_reward > reward_bound
    ]
    train_obs = [step.observation for episode in elite_batch for step in episode.steps]
    train_act = [step.action for episode in elite_batch for step in episode.steps]

    return elite_batch, train_obs, train_act, reward_bound, reward_mean, info_mean

In [6]:
def get_init_batch(env, batch_size):
    """Yield batches of episodes played with uniformly random actions.

    Steps ``env`` directly (no copy is made). Actions are drawn from
    ``range(env.action_num)`` so they always match the action space the
    network is built for; environments without an ``action_num`` attribute
    fall back to the historical range of 100 actions. A small progress
    indicator is printed after every finished episode.

    Args:
        env: environment exposing ``reset() -> obs`` and
            ``step(action) -> (obs, reward, done, info)``.
        batch_size: number of finished episodes per yielded batch. A value
            ``<= 0`` yields nothing instead of looping forever.

    Yields:
        Lists of ``batch_size`` ``Episode`` tuples; the generator stops after
        ``INIT_ROUNDS`` batches.
    """
    if batch_size <= 0:
        return

    n_actions = getattr(env, "action_num", 100)

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    rounds_done = 0
    times = 0  # episodes finished in the current batch (progress display only)
    while True:
        action = random.randrange(n_actions)
        next_obs, reward, is_done, ext_info = env.step(action)
        # Step t contributes reward * GAMMA ** t to the episode total.
        episode_reward += reward * (GAMMA ** len(episode_steps))
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps, info=ext_info))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if times % 8 == 0:
                print("|", end='')
            print("%d" % (times % 8), end='', flush=True)
            times += 1
            if len(batch) == batch_size:
                print(" ")
                yield batch
                batch = []
                rounds_done += 1
                times = 0
                if rounds_done == INIT_ROUNDS:
                    break
        obs = next_obs

In [7]:
# One-time setup: environment, worker pool, policy network, loss, optimizer
# and TensorBoard writer. The later cells rely on the names bound here.
if __name__ == "__main__":
    env = TestEnv_v2()
    # env = gym.wrappers.Monitor(env, directory="mon", force=True)
    # os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
    # os.environ["CUDA_VISIBLE_DEVICES"]="1"
    
    
    # print(device_lib.list_local_devices())
    
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # The pool is created before the first .cuda() call in this cell.
    # NOTE(review): forked workers generally cannot reuse a CUDA context that
    # was initialised in the parent, so this ordering may matter — confirm
    # before moving the line.
    pool = mp.Pool(processes = USE_CORES)
    
    
    # The environment defines both the network's input shape and output width.
    obs_size = env.observation_size
    n_actions = env.action_num

    net = Net(obs_size, HIDDEN_SIZE, n_actions)
    # With more than one visible GPU the model is wrapped in DataParallel;
    # state_dict keys saved from the wrapped model then carry its prefix.
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        net = nn.DataParallel(net)

    net = net.cuda()
    
    # Cross-entropy between the network's scores and the elite actions.
    objective = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)
    # The training cell creates its own writer as well, so this one is only
    # used if that cell is skipped.
    writer = SummaryWriter(comment="-test")

Let's use 4 GPUs!


Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-4:
Process ForkPoolWorker-5:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-2:
Process ForkPoolWorker-6:
Traceback (most recent call last):
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 9

In [8]:
    # Main cross-entropy training loop. It runs until interrupted; the
    # `finally` at the bottom closes the TensorBoard writer even on Ctrl-C.
    writer = SummaryWriter(comment="-test")
    
    # Resume from the checkpoint this loop writes every 10 iterations.
    # NOTE: torch.load unpickles the file — only load checkpoints you created.
    # net.load_state_dict(torch.load('net_params_init.pkl'))
    net.load_state_dict(torch.load('net_params.pkl'))
    optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)
    
    print("start training!!!")
    start = time.time()
    
    res_batch = []  # elite episodes carried over from earlier iterations
    iter_no = 0
    try:
        while True:
            # collect_results() extends this module-level list from the pool callbacks.
            total_batch = []
            apply_async_with_callback(pool, USE_CORES, env, net, BATCH_SIZE)

            # The statistics returned here cover the carried-over elites plus
            # the freshly collected episodes, not the fresh episodes alone.
            res_batch, obs, acts, reward_b, reward_m, info_m = filter_batch(res_batch + total_batch, PERCENTILE)
            if not res_batch:
                continue

            iter_no += 1

            localtime = time.asctime( time.localtime(time.time()) )
            # The backslash continuation sits inside the string literal, so the
            # leading spaces of the continuation line are part of the printed
            # text; the line is kept as it was to leave the log format unchanged.
            print("%d: reward_mean=%.1f, reward_bound=%.1f, round_mean=%.1f, \
        distance_mean=%.1f, steps=%d "% \
                  (iter_no,  reward_m, reward_b, info_m[1], info_m[2], info_m[3]))

            obs_v = torch.FloatTensor(obs).cuda()
            acts_v = torch.LongTensor(acts).cuda()
            # Only the 32 most recent elites are carried into the next iteration.
            res_batch = res_batch[-32:]

            # Several gradient steps on the same elite batch.
            for i in range(REUSE_TIMES):
                optimizer.zero_grad()
                action_scores_v = net(obs_v)
                loss_v = objective(action_scores_v, acts_v)
                loss_v.backward()
                optimizer.step()

                print("-", end='')
            print(" ")
            if iter_no%10 == 0:
                torch.save(net.state_dict(), 'net_params.pkl')

            end = time.time()
            print("iter time: %d s, localtime:"% (int(end - start)), localtime)
            start = end

            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            writer.add_scalar("round_mean", info_m[1], iter_no)
            writer.add_scalar("distance_mean", info_m[2], iter_no)
            writer.add_scalar("step_mean", info_m[3], iter_no)
            # if reward_m > 500:
            #     print("Solved!")
            #     break

            # Drop the per-iteration tensors before asking CUDA to release cached blocks.
            del loss_v, acts_v, obs_v, action_scores_v, info_m, reward_m
            torch.cuda.empty_cache()
    finally:
        # Reached on KeyboardInterrupt or any error as well as on a normal exit.
        writer.close()

start training!!!
1: reward_mean=-12269.3, reward_bound=-9045.0, round_mean=32.3,     distance_mean=84.8, steps=1147 
------ 
iter time: 51 s, localtime: Wed Nov 21 09:32:52 2018
2: reward_mean=-10364.5, reward_bound=-7601.2, round_mean=32.2,     distance_mean=84.7, steps=1058 
------ 
iter time: 30 s, localtime: Wed Nov 21 09:33:35 2018
3: reward_mean=-10537.5, reward_bound=-7210.6, round_mean=32.3,     distance_mean=84.4, steps=1051 
------ 
iter time: 32 s, localtime: Wed Nov 21 09:34:07 2018
4: reward_mean=-11879.0, reward_bound=-6715.8, round_mean=32.6,     distance_mean=85.1, steps=1086 
------ 
iter time: 33 s, localtime: Wed Nov 21 09:34:41 2018
5: reward_mean=-10659.3, reward_bound=-6526.1, round_mean=31.2,     distance_mean=83.7, steps=1049 
------ 
iter time: 31 s, localtime: Wed Nov 21 09:35:13 2018
6: reward_mean=-10802.2, reward_bound=-6181.8, round_mean=31.7,     distance_mean=84.1, steps=1055 
------ 
iter time: 31 s, localtime: Wed Nov 21 09:35:45 2018
7: reward_mean=-

52: reward_mean=-9047.6, reward_bound=-4546.4, round_mean=31.7,     distance_mean=84.1, steps=968 
------ 
iter time: 29 s, localtime: Wed Nov 21 09:59:34 2018
53: reward_mean=-10014.8, reward_bound=-4497.4, round_mean=32.1,     distance_mean=84.5, steps=1012 
------ 
iter time: 31 s, localtime: Wed Nov 21 10:00:06 2018
54: reward_mean=-9442.1, reward_bound=-4482.0, round_mean=32.0,     distance_mean=84.4, steps=986 
------ 
iter time: 29 s, localtime: Wed Nov 21 10:00:36 2018
55: reward_mean=-10086.7, reward_bound=-4455.4, round_mean=32.4,     distance_mean=84.7, steps=1002 
------ 
iter time: 31 s, localtime: Wed Nov 21 10:01:07 2018
56: reward_mean=-9352.0, reward_bound=-4428.2, round_mean=31.2,     distance_mean=83.3, steps=966 
------ 
iter time: 29 s, localtime: Wed Nov 21 10:01:36 2018
57: reward_mean=-8981.4, reward_bound=-4415.5, round_mean=31.6,     distance_mean=83.7, steps=959 
------ 
iter time: 28 s, localtime: Wed Nov 21 10:02:05 2018
58: reward_mean=-11396.8, reward_bou

103: reward_mean=-10323.9, reward_bound=-4275.5, round_mean=31.6,     distance_mean=83.9, steps=1011 
------ 
iter time: 30 s, localtime: Wed Nov 21 10:25:41 2018
104: reward_mean=-15430.1, reward_bound=-4769.7, round_mean=31.8,     distance_mean=84.0, steps=1061 
------ 
iter time: 34 s, localtime: Wed Nov 21 10:26:15 2018
105: reward_mean=-10267.6, reward_bound=-4601.0, round_mean=32.1,     distance_mean=84.4, steps=1012 
------ 
iter time: 32 s, localtime: Wed Nov 21 10:26:47 2018
106: reward_mean=-9403.3, reward_bound=-4598.4, round_mean=32.0,     distance_mean=84.5, steps=972 
------ 
iter time: 30 s, localtime: Wed Nov 21 10:27:18 2018
107: reward_mean=-9146.0, reward_bound=-4382.1, round_mean=31.8,     distance_mean=83.7, steps=957 
------ 
iter time: 28 s, localtime: Wed Nov 21 10:27:47 2018
108: reward_mean=-8763.1, reward_bound=-4668.9, round_mean=32.4,     distance_mean=84.8, steps=946 
------ 
iter time: 28 s, localtime: Wed Nov 21 10:28:15 2018
109: reward_mean=-10587.4, r

154: reward_mean=-8830.1, reward_bound=-4270.6, round_mean=31.6,     distance_mean=84.0, steps=942 
------ 
iter time: 30 s, localtime: Wed Nov 21 10:51:31 2018
155: reward_mean=-10092.6, reward_bound=-4174.4, round_mean=31.4,     distance_mean=83.6, steps=987 
------ 
iter time: 30 s, localtime: Wed Nov 21 10:52:02 2018
156: reward_mean=-9254.6, reward_bound=-4336.8, round_mean=31.7,     distance_mean=83.9, steps=964 
------ 
iter time: 29 s, localtime: Wed Nov 21 10:52:31 2018
157: reward_mean=-10395.1, reward_bound=-4651.0, round_mean=31.5,     distance_mean=83.5, steps=1009 
------ 
iter time: 31 s, localtime: Wed Nov 21 10:53:02 2018
158: reward_mean=-10021.4, reward_bound=-4743.3, round_mean=32.3,     distance_mean=84.7, steps=993 
------ 
iter time: 31 s, localtime: Wed Nov 21 10:53:33 2018
159: reward_mean=-9856.7, reward_bound=-4525.4, round_mean=32.2,     distance_mean=84.6, steps=995 
------ 
iter time: 32 s, localtime: Wed Nov 21 10:54:05 2018
160: reward_mean=-10494.9, rew

205: reward_mean=-9316.6, reward_bound=-4228.4, round_mean=32.0,     distance_mean=84.1, steps=955 
------ 
iter time: 29 s, localtime: Wed Nov 21 11:17:17 2018
206: reward_mean=-9674.0, reward_bound=-4473.5, round_mean=31.7,     distance_mean=84.3, steps=971 
------ 
iter time: 29 s, localtime: Wed Nov 21 11:17:47 2018
207: reward_mean=-9949.1, reward_bound=-4303.8, round_mean=31.8,     distance_mean=84.0, steps=984 
------ 
iter time: 31 s, localtime: Wed Nov 21 11:18:18 2018
208: reward_mean=-10058.3, reward_bound=-4379.8, round_mean=31.9,     distance_mean=84.0, steps=987 
------ 
iter time: 30 s, localtime: Wed Nov 21 11:18:48 2018
209: reward_mean=-9162.5, reward_bound=-4462.0, round_mean=31.8,     distance_mean=84.0, steps=952 
------ 
iter time: 29 s, localtime: Wed Nov 21 11:19:17 2018
210: reward_mean=-9150.0, reward_bound=-4224.6, round_mean=31.4,     distance_mean=83.5, steps=950 
------ 
iter time: 29 s, localtime: Wed Nov 21 11:19:46 2018
211: reward_mean=-9858.6, reward_

256: reward_mean=-9892.6, reward_bound=-3964.0, round_mean=31.8,     distance_mean=83.6, steps=981 
------ 
iter time: 30 s, localtime: Wed Nov 21 11:42:49 2018
257: reward_mean=-8990.8, reward_bound=-4071.3, round_mean=31.4,     distance_mean=83.5, steps=944 
------ 
iter time: 29 s, localtime: Wed Nov 21 11:43:18 2018
258: reward_mean=-8767.7, reward_bound=-4065.0, round_mean=31.6,     distance_mean=83.3, steps=935 
------ 
iter time: 28 s, localtime: Wed Nov 21 11:43:46 2018
259: reward_mean=-10278.9, reward_bound=-4242.0, round_mean=31.7,     distance_mean=83.5, steps=1002 
------ 
iter time: 30 s, localtime: Wed Nov 21 11:44:17 2018
260: reward_mean=-10256.0, reward_bound=-3952.4, round_mean=31.8,     distance_mean=83.8, steps=991 
------ 
iter time: 31 s, localtime: Wed Nov 21 11:44:49 2018
261: reward_mean=-9854.4, reward_bound=-4298.7, round_mean=31.8,     distance_mean=84.0, steps=985 
------ 
iter time: 29 s, localtime: Wed Nov 21 11:45:19 2018
262: reward_mean=-10413.2, rewa

307: reward_mean=-9296.2, reward_bound=-3926.1, round_mean=32.0,     distance_mean=84.2, steps=956 
------ 
iter time: 29 s, localtime: Wed Nov 21 12:08:30 2018
308: reward_mean=-9811.9, reward_bound=-3933.0, round_mean=31.5,     distance_mean=83.7, steps=984 
------ 
iter time: 30 s, localtime: Wed Nov 21 12:09:00 2018
309: reward_mean=-9023.5, reward_bound=-4110.4, round_mean=32.4,     distance_mean=84.6, steps=931 
------ 
iter time: 28 s, localtime: Wed Nov 21 12:09:29 2018
310: reward_mean=-9659.0, reward_bound=-4225.6, round_mean=31.4,     distance_mean=83.3, steps=965 
------ 
iter time: 30 s, localtime: Wed Nov 21 12:09:59 2018
311: reward_mean=-9588.9, reward_bound=-4260.5, round_mean=31.9,     distance_mean=84.1, steps=969 
------ 
iter time: 30 s, localtime: Wed Nov 21 12:10:30 2018
312: reward_mean=-9136.2, reward_bound=-4138.1, round_mean=31.7,     distance_mean=83.9, steps=953 
------ 
iter time: 29 s, localtime: Wed Nov 21 12:10:59 2018
313: reward_mean=-9203.2, reward_b

358: reward_mean=-9847.7, reward_bound=-3977.0, round_mean=31.5,     distance_mean=83.9, steps=984 
------ 
iter time: 31 s, localtime: Wed Nov 21 12:33:52 2018
359: reward_mean=-9641.2, reward_bound=-4336.6, round_mean=32.2,     distance_mean=84.7, steps=968 
------ 
iter time: 29 s, localtime: Wed Nov 21 12:34:21 2018
360: reward_mean=-9085.2, reward_bound=-3826.4, round_mean=32.2,     distance_mean=84.6, steps=948 
------ 
iter time: 28 s, localtime: Wed Nov 21 12:34:50 2018
361: reward_mean=-9701.2, reward_bound=-4069.9, round_mean=32.3,     distance_mean=84.5, steps=969 
------ 
iter time: 29 s, localtime: Wed Nov 21 12:35:20 2018
362: reward_mean=-10597.1, reward_bound=-3815.0, round_mean=32.3,     distance_mean=84.8, steps=1009 
------ 
iter time: 32 s, localtime: Wed Nov 21 12:35:52 2018
363: reward_mean=-10047.7, reward_bound=-4302.8, round_mean=32.3,     distance_mean=84.9, steps=991 
------ 
iter time: 31 s, localtime: Wed Nov 21 12:36:23 2018
364: reward_mean=-10152.5, rewa

409: reward_mean=-10907.5, reward_bound=-4037.0, round_mean=31.6,     distance_mean=83.8, steps=1010 
------ 
iter time: 31 s, localtime: Wed Nov 21 12:59:26 2018
410: reward_mean=-9466.0, reward_bound=-4000.5, round_mean=32.0,     distance_mean=84.6, steps=968 
------ 
iter time: 29 s, localtime: Wed Nov 21 12:59:56 2018
411: reward_mean=-9154.2, reward_bound=-3809.6, round_mean=31.8,     distance_mean=83.8, steps=938 
------ 
iter time: 28 s, localtime: Wed Nov 21 13:00:24 2018
412: reward_mean=-10374.1, reward_bound=-4218.4, round_mean=31.9,     distance_mean=84.0, steps=1004 
------ 
iter time: 31 s, localtime: Wed Nov 21 13:00:55 2018
413: reward_mean=-8664.5, reward_bound=-4031.5, round_mean=32.3,     distance_mean=84.5, steps=925 
------ 
iter time: 28 s, localtime: Wed Nov 21 13:01:23 2018
414: reward_mean=-9809.1, reward_bound=-4130.0, round_mean=32.1,     distance_mean=84.5, steps=986 
------ 
iter time: 30 s, localtime: Wed Nov 21 13:01:54 2018
415: reward_mean=-9267.0, rewa

460: reward_mean=-9822.2, reward_bound=-3680.0, round_mean=32.0,     distance_mean=84.5, steps=982 
------ 
iter time: 30 s, localtime: Wed Nov 21 13:25:03 2018
461: reward_mean=-9130.6, reward_bound=-4059.2, round_mean=32.0,     distance_mean=84.6, steps=946 
------ 
iter time: 28 s, localtime: Wed Nov 21 13:25:32 2018
462: reward_mean=-11401.3, reward_bound=-3698.6, round_mean=31.8,     distance_mean=83.8, steps=992 
------ 
iter time: 32 s, localtime: Wed Nov 21 13:26:04 2018
463: reward_mean=-9436.2, reward_bound=-3845.4, round_mean=32.5,     distance_mean=85.1, steps=966 
------ 
iter time: 30 s, localtime: Wed Nov 21 13:26:35 2018
464: reward_mean=-9432.5, reward_bound=-4084.9, round_mean=31.6,     distance_mean=83.9, steps=966 
------ 
iter time: 29 s, localtime: Wed Nov 21 13:27:05 2018
465: reward_mean=-9178.7, reward_bound=-4067.9, round_mean=31.6,     distance_mean=83.9, steps=948 
------ 
iter time: 28 s, localtime: Wed Nov 21 13:27:34 2018
466: reward_mean=-9722.1, reward_

KeyboardInterrupt: 

In [None]:
    # Pre-training on random-policy episodes. The result is saved to
    # 'net_params_init.pkl'. This rebinds the module-level `optimizer` with
    # its own learning rate.
    optimizer = optim.Adam(params=net.parameters(), lr=0.001)
    
    for batch in get_init_batch(env, 32):
        # Keep the episodes above the 30th reward percentile of this batch.
        batch, obs, acts, reward_b, reward_m, info_m = filter_batch(batch, 30)
        if not obs:
            # No episode beat the bound (e.g. all rewards equal): there is
            # nothing to train on, and an empty tensor would fail in the
            # forward pass, so skip this round like the main training loop does.
            continue

        obs_v = torch.FloatTensor(obs).cuda()
        acts_v = torch.LongTensor(acts).cuda()

        # Several gradient steps on the same elite batch.
        for i in range(5):
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            print("-", end='')
        print(" ")
            
        print("reward_mean=%.1f, reward_bound=%.1f, round_mean=%.1f, \
        distance_mean=%.1f, steps=%d "% \
          (reward_m, reward_b, info_m[1], info_m[2], info_m[3]))
        # Drop the per-round tensors before asking CUDA to release cached blocks.
        del loss_v, acts_v, obs_v, action_scores_v, info_m, reward_m
        torch.cuda.empty_cache()
        
    torch.cuda.empty_cache()
    torch.save(net.state_dict(), 'net_params_init.pkl')  