In [1]:
import os
import time
import copy
import multiprocessing as mp

import random
import numpy as np
from collections import namedtuple

from test_env import TestEnv_v2
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Width of the first hidden layer of Net (the second layer uses half of it).
HIDDEN_SIZE = 128
# Number of finished episodes each worker call collects before returning.
BATCH_SIZE = 16
# Reward percentile passed to filter_batch() in the main training loop;
# episodes whose reward is below that percentile are dropped.
PERCENTILE = 50
# Learning rate of the Adam optimizer used for the main training loop.
LEARNING_RATE = 0.001

# Size of the multiprocessing pool and number of rollout tasks submitted
# per training iteration.
USE_CORES = 12

In [3]:
class Net(nn.Module):
    """Policy network: a small MLP applied along the last axis, then one linear head.

    The MLP maps the last input axis (``obs_size[1]`` features) to
    ``hidden_size / 2`` features.  Everything except the batch axis is then
    flattened and fed to a linear layer that expects
    ``obs_size[0] * hidden_size / 2`` inputs and produces ``n_actions`` raw
    scores (no softmax is applied here).
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        n_rows, n_features = obs_size[0], obs_size[1]
        half_hidden = int(hidden_size / 2)
        # The attribute names `net` and `out` end up in the state_dict keys,
        # so they must stay stable for saved checkpoints to load.
        self.net = nn.Sequential(
            nn.Linear(n_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, half_hidden),
            nn.ReLU(),
        )
        self.out = nn.Linear(n_rows * half_hidden, n_actions)

    def forward(self, x):
        """Return unnormalised action scores of shape (batch, n_actions)."""
        features = self.net(x)
        # Collapse every non-batch axis: (batch, obs_size[0] * hidden_size/2).
        flat = features.view(features.size(0), -1)
        return self.out(flat)

# One finished episode: total reward, the list of EpisodeStep records, and the
# last `info` value returned by the environment.
Episode = namedtuple('Episode', ['reward', 'steps', 'info'])
# One transition: the observation the agent saw and the action it took.
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])

In [4]:
def produce_batches(env_input, net, batch_size):
    """Roll out the current policy and return ``batch_size`` finished episodes.

    Works on a private deep copy of ``env_input``.  At every step the action
    is sampled from ``softmax(net(observation))``.  An episode ends when the
    environment reports ``is_done`` or when ``ext_info[3]`` (printed below as
    ``steps``) exceeds 5000.

    Args:
        env_input: environment exposing ``reset()`` and ``step(action)``;
            ``step`` must return ``(obs, reward, is_done, ext_info)`` with
            ``ext_info`` indexable at positions 1, 2 and 3.
        net: module mapping a batch of observations to action scores.
        batch_size: number of finished episodes to collect.

    Returns:
        list of ``Episode`` tuples, of length ``batch_size``.
    """
    env = copy.deepcopy(env_input)
    # Re-seed NumPy from OS entropy on every call; presumably so that forked
    # pool workers do not all replay the parent's NumPy RNG stream.
    np.random.seed()

    # Run inference on whatever device the network lives on instead of
    # assuming CUDA, so that a CPU model works as well.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    softmax = nn.Softmax(dim=1)
    max_steps = 5000

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    while True:
        obs_v = torch.FloatTensor([obs]).to(device)
        # Rollouts never back-propagate, so skip building the autograd graph.
        with torch.no_grad():
            act_probs = softmax(net(obs_v)).cpu().numpy()[0]
        action = np.random.choice(len(act_probs), p=act_probs)
        next_obs, reward, is_done, ext_info = env.step(action)
        episode_reward += reward
        # NOTE(review): this stores a reference to `obs`; it assumes the
        # environment hands out a fresh observation object on every step
        # rather than mutating one shared buffer -- TODO confirm.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done or ext_info[3] > max_steps:
            batch.append(Episode(reward=episode_reward, steps=episode_steps, info=ext_info))
            print("round_mean=%.1f, distance_mean=%.1f, steps=%d "% \
              (ext_info[1], ext_info[2], ext_info[3]))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                return batch
        obs = next_obs

In [5]:
def filter_batch(batch, percentile, device=None):
    """Keep the best episodes of ``batch`` and turn them into training tensors.

    Episodes whose total reward is strictly below the ``percentile``-th
    percentile of all rewards in the batch are dropped; the observations and
    actions of the remaining episodes are concatenated step by step.

    Args:
        batch: non-empty sequence of objects with ``reward``, ``steps`` and
            ``info`` attributes; every step has ``observation`` and ``action``.
        percentile: percentile (0-100) used as the reward cut-off.
        device: target device of the returned tensors.  Defaults to CUDA when
            it is available and to the CPU otherwise.

    Returns:
        ``(train_obs_v, train_act_v, reward_bound, reward_mean, info_mean)``
        where ``train_obs_v`` is a float32 tensor, ``train_act_v`` an int64
        tensor, and ``info_mean`` the element-wise mean of all ``info`` values.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("filter_batch() needs at least one episode")
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    rewards = [episode.reward for episode in batch]
    infos = np.array([episode.info for episode in batch])
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = float(np.mean(rewards))
    info_mean = np.mean(infos, axis=0)

    train_obs = []
    train_act = []
    for episode in batch:
        if episode.reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in episode.steps)
        train_act.extend(step.action for step in episode.steps)

    # Go through a single NumPy array: building a tensor directly from a
    # Python list of arrays converts element by element and is very slow.
    train_obs_v = torch.from_numpy(np.asarray(train_obs, dtype=np.float32)).to(device)
    train_act_v = torch.from_numpy(np.asarray(train_act, dtype=np.int64)).to(device)
    return train_obs_v, train_act_v, reward_bound, reward_mean, info_mean

In [6]:
def get_init_batch(env_input, batch_size):
    """Collect ``batch_size`` episodes played with uniformly random actions.

    Works on a private deep copy of ``env_input``.  Actions are drawn from
    ``range(env.action_num)``; when the environment has no ``action_num``
    attribute the previous fixed range of 100 actions (0..99) is used.
    An episode ends only when the environment reports ``is_done``.

    Args:
        env_input: environment exposing ``reset()`` and ``step(action)``;
            ``step`` must return ``(obs, reward, is_done, ext_info)``.
        batch_size: number of finished episodes to collect.

    Returns:
        list of ``Episode`` tuples, of length ``batch_size``.
    """
    env = copy.deepcopy(env_input)
    # Take the size of the action space from the environment instead of
    # hard-coding it, falling back to the historical value.
    n_actions = getattr(env, "action_num", 100)

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    while True:
        action = random.randrange(n_actions)
        next_obs, reward, is_done, ext_info = env.step(action)
        episode_reward += reward
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps, info=ext_info))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                return batch
        obs = next_obs

In [7]:
if __name__ == "__main__":
    # GPU selection, if needed, is done outside this cell (for example with
    # the CUDA_DEVICE_ORDER / CUDA_VISIBLE_DEVICES environment variables).
    env = TestEnv_v2()

    # Worker pool used by the following cells to generate episodes in
    # parallel.  It is created before the model is moved to the GPU.
    # Re-running this cell creates a new pool; close the previous one first.
    pool = mp.Pool(processes=USE_CORES)

    obs_size, n_actions = env.observation_size, env.action_num
    net = Net(obs_size, HIDDEN_SIZE, n_actions)

    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print("Let's use", gpu_count, "GPUs!")
    # The model is wrapped in DataParallel regardless of the GPU count, so the
    # state_dict keys written by torch.save() always carry the wrapper prefix.
    net = nn.DataParallel(net).cuda()

    objective = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)
    writer = SummaryWriter(comment="-test")

Let's use 4 GPUs!
round_mean=18.0, distance_mean=66.7, steps=2242 
round_mean=24.0, distance_mean=79.3, steps=2547 
round_mean=19.0, distance_mean=73.3, steps=2520 
round_mean=22.0, distance_mean=69.3, steps=2717 
round_mean=25.0, distance_mean=75.4, steps=2612 
round_mean=22.0, distance_mean=75.8, steps=2814 
round_mean=21.0, distance_mean=69.0, steps=3219 
round_mean=25.0, distance_mean=68.7, steps=3263 
round_mean=26.0, distance_mean=79.3, steps=3317 
round_mean=29.0, distance_mean=80.6, steps=3226 
round_mean=20.0, distance_mean=68.7, steps=3708 
round_mean=25.0, distance_mean=75.6, steps=3824 
round_mean=21.0, distance_mean=71.1, steps=1955 
round_mean=16.0, distance_mean=67.4, steps=2303 
round_mean=23.0, distance_mean=76.5, steps=2268 
round_mean=26.0, distance_mean=77.5, steps=3156 
round_mean=22.0, distance_mean=69.9, steps=1797 
round_mean=21.0, distance_mean=67.5, steps=2519 
round_mean=21.0, distance_mean=70.6, steps=2619 
round_mean=22.0, distance_mean=69.9, steps=3315 
ro

Process ForkPoolWorker-10:
Process ForkPoolWorker-12:
Process ForkPoolWorker-6:
Process ForkPoolWorker-11:
Process ForkPoolWorker-2:
Process ForkPoolWorker-8:
Process ForkPoolWorker-5:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-7:
Process ForkPoolWorker-9:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ddli/.pyenv/versions/3

  File "<ipython-input-4-f05d04763aa1>", line 12, in produce_batches
    act_probs = act_probs_v.cpu().data.numpy()[0]
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 113, in forward
    replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/site-packages/torch/nn/modules/module.py", line 491, in __call__
    result = self.forward(*input, **kwargs)
  File "/home/ddli/delivery/test_env.py", line 230, in _get_state
    self.state[i][j] = self.point_state[i].state()[j]
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py", line 121, in scatter
    return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
KeyboardInterrupt
  File "/home/ddli/.pyenv/versions/3.6.7/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py", line 68, in gather
    return gather_map(outputs)
  File "/home/ddli/.pyenv/

In [8]:
    # Warm-up phase: fit the network on episodes played with random actions
    # (get_init_batch), keeping every episode at or above the 10th reward
    # percentile.  A dedicated optimizer with a smaller learning rate is used.
    optimizer = optim.Adam(params=net.parameters(), lr=0.0005)
    for round_no in range(3):
        print("test round:%d" % round_no)

        # Submit one rollout task per worker, then consume them in order.
        multi_res = [
            pool.apply_async(get_init_batch, (env, BATCH_SIZE))
            for _ in range(USE_CORES)
        ]
        for pending in multi_res:
            batch = pending.get()
            obs_v, acts_v, reward_b, reward_m, info_m = filter_batch(batch, 10)

            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

        # Only the statistics of the last batch of the round are reported.
        print("loss=%.3f, reward_mean=%.1f, reward_bound=%.1f, round_mean=%.1f, distance_mean=%.1f" % (
            loss_v.item(), reward_m, reward_b, info_m[1], info_m[2]))

    # Checkpoint of the warm-up weights.
    torch.save(net.state_dict(), 'net_params_init.pkl')

test round:0
loss=6.283, reward_mean=295.5, reward_bound=266.9, round_mean=30.5, distance_mean=84.6
test round:1
loss=4.669, reward_mean=322.9, reward_bound=288.9, round_mean=28.7, distance_mean=80.7
test round:2
loss=4.607, reward_mean=325.9, reward_bound=291.4, round_mean=29.1, distance_mean=80.3
test round:3
loss=4.605, reward_mean=324.6, reward_bound=279.3, round_mean=30.5, distance_mean=83.5
test round:4
loss=4.605, reward_mean=322.9, reward_bound=275.5, round_mean=30.1, distance_mean=82.7


In [8]:
    # Main cross-entropy-method training loop.
    #
    # Every iteration submits USE_CORES rollout tasks (produce_batches) to the
    # pool and performs one optimizer step per returned batch, using only the
    # episodes at or above the PERCENTILE reward percentile.  Training stops
    # as soon as a batch reaches a mean reward above 500.
    optimizer = optim.Adam(params=net.parameters(), lr=LEARNING_RATE)
    print("start train!")
    writer = SummaryWriter(comment="-test")
    iter_no = 0
    train_no = 0
    t0 = time.time()

    # Resume from the latest training checkpoint when there is one, otherwise
    # from the warm-up checkpoint; with neither file present the weights
    # currently held in memory are used instead of crashing on a fresh run.
    if os.path.exists('net_params.pkl'):
        net.load_state_dict(torch.load('net_params.pkl'))
    elif os.path.exists('net_params_init.pkl'):
        net.load_state_dict(torch.load('net_params_init.pkl'))

    solved = False
    try:
        # A flag is required here: a bare `break` in the batch loop would only
        # leave that inner loop and training would never terminate.
        while not solved:
            multi_res = [
                pool.apply_async(produce_batches, (env, net, BATCH_SIZE))
                for _ in range(USE_CORES)
            ]

            for i, pending in enumerate(multi_res):
                batch = pending.get()

                obs_v, acts_v, reward_b, reward_m, info_m = filter_batch(batch, PERCENTILE)
                optimizer.zero_grad()
                action_scores_v = net(obs_v)
                loss_v = objective(action_scores_v, acts_v)
                loss_v.backward()
                optimizer.step()

                train_no = iter_no * USE_CORES + i
                if iter_no%5 == 0:
                    torch.save(net.state_dict(), 'net_params.pkl')
                print("%d: loss=%.3f, reward_mean=%.1f, reward_bound=%.1f, round_mean=%.1f, distance_mean=%.1f" % (
                    train_no, loss_v.item(), reward_m, reward_b, info_m[1], info_m[2]))
                writer.add_scalar("loss", loss_v.item(), train_no)
                writer.add_scalar("reward_bound", reward_b, train_no)
                writer.add_scalar("reward_mean", reward_m, train_no)
                writer.add_scalar("round_mean", info_m[1], train_no)
                writer.add_scalar("distance_mean", info_m[2], train_no)

                if reward_m > 500:
                    print("Solved!")
                    # Keep the weights that reached the target even when this
                    # is not a regular checkpoint iteration.
                    torch.save(net.state_dict(), 'net_params.pkl')
                    solved = True
                    break

            iter_no += 1
            t1 = time.time()
            localtime = time.asctime( time.localtime(time.time()) )
            print("iter time: %d s, localtime: "% (int(t1 - t0)), localtime)
            t0 = t1
            # Drop references to the per-iteration tensors before asking the
            # CUDA allocator to release its cached blocks.
            del loss_v, acts_v, obs_v, action_scores_v, info_m, multi_res, reward_m
            torch.cuda.empty_cache()
    finally:
        # Flush the TensorBoard events even when training is interrupted.
        writer.close()

    # Reached only after a regular finish; after an interrupt the pool stays
    # open so that this cell can be run again.
    pool.close()
    pool.join()
    

start train!


KeyboardInterrupt: 