In [30]:
import sys
sys.path.append('../../')
import torch
import torch.nn as nn
import torch.optim as optim
from Code.envs.GPEnv import MultiEnv
from wip.Code.train_q import make_dataset, backward_one
import time
from collections import OrderedDict



In [2]:

# --- Run configuration ------------------------------------------------------
BATCH_SIZE = 512            # alternative value noted by the author: 64
MAX_ITER = 16               # iteration count handed to MultiEnv and make_dataset
DIMS = 2                    # passed to MultiEnv as `dims`; also sizes the policy output
NUM_ACTIONS = 32            # forwarded to make_dataset during training
SIM_TIME = 1                # not referenced in the visible cells
USE_JIT = False             # forwarded to every OuterWrapper
MEM_SIZE = MAX_ITER + 128   # input width of the state and policy networks

# Device handed to the environment, the model wrappers and make_dataset.
device = torch.device('cuda')

env = MultiEnv(BATCH_SIZE, MAX_ITER, device, dims=DIMS)

#torch.backends.cudnn.enabled = False




In [3]:
from Code.Networks import OuterWrapper

# Load the pretrained network from disk (another checkpoint that was tried:
# '../../models/rsnn_gppred4').
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model = torch.load('../../models/rsnn_gppred2d1')

# Pull the inner memory sub-module out of the loaded network and wrap it.
# The full `model` itself is deliberately left unwrapped.
model_memory = OuterWrapper(
    model.layers.mem_loop.model.model,
    device,
    USE_JIT,
    two_dim=True,
)

In [4]:
from Code.Networks import Selector, DynNetwork, OuterWrapper, LSTMWrapper, ReLuWrapper, DummyNeuron, make_SequenceWrapper
from Code.NewNeurons import SeqOnlySpike, CooldownNeuron

# Every architecture is an ordered mapping from layer name to its spec.
# 'input' holds the input width; every other entry is a list of
# [source layer names, neuron wrapper, connection class]
# (presumably interpreted that way by DynNetwork -- confirm against its source).

# State network: MEM_SIZE -> 256 -> 64
s_architecture = OrderedDict()
s_architecture['input'] = MEM_SIZE  # author's note: 128
s_architecture['layer1'] = [['input'], ReLuWrapper(256), nn.Linear]
s_architecture['output'] = [['layer1'], ReLuWrapper(64), nn.Linear]

# Q network: (64 + DIMS) -> 128 -> 64 -> 1
q_architecture = OrderedDict()
q_architecture['input'] = 64+DIMS
q_architecture['layer1'] = [['input'], ReLuWrapper(128), nn.Linear]
q_architecture['layer2'] = [['layer1'], ReLuWrapper(64), nn.Linear]
q_architecture['output'] = [['layer2'], DummyNeuron(1), nn.Linear]

# Policy network: MEM_SIZE -> 256 -> 128 -> DIMS
p_architecture = OrderedDict()
p_architecture['input'] = MEM_SIZE  # author's note: 128
p_architecture['layer1'] = [['input'], ReLuWrapper(256), nn.Linear]
p_architecture['layer2'] = [['layer1'], ReLuWrapper(128), nn.Linear]
p_architecture['output'] = [['layer2'], DummyNeuron(DIMS), nn.Linear]


In [5]:
#144, 150, 137, 150

# Build one DynNetwork per architecture and wrap each one for the target
# device. The networks are created in the order s, q, p.
s_model, q_model, p_model = [
    OuterWrapper(DynNetwork(arch), device, USE_JIT)
    for arch in (s_architecture, q_architecture, p_architecture)
]

#model = (OuterWrapper(DynNetwork(architecturelstm), device, True))



In [37]:
# Only the policy network's parameters are handed to the optimizer.
# Variant that would optimise all three networks jointly:
#params = list(s_model.parameters())+list(p_model.parameters())+list(q_model.parameters())
params = p_model.parameters()
optimizer = optim.Adam(params=params, lr=1e-4)  # learning rate 1e-4



In [38]:
def train_dataset(num_batches, num_epochs, gamma, rand_prob):
    """Collect a fresh dataset with make_dataset and train on it.

    The dataset is generated once and then revisited for `num_epochs` epochs,
    each epoch in a new random order and in minibatches of BATCH_SIZE samples.
    After every epoch the mean of the two values returned by backward_one and
    the elapsed wall-clock time are printed.

    Args:
        num_batches: number of batches requested from make_dataset; each
            epoch performs num_batches*MAX_ITER optimizer steps.
        num_epochs: number of passes over the collected dataset.
        gamma: forwarded unchanged to make_dataset.
        rand_prob: forwarded unchanged to make_dataset.

    Raises:
        Exception: 'Corrupted Model' if, after an epoch, any parameter of
            s_model, q_model or p_model contains NaN.
    """
    obs, mem, value, actions, targets, avg_reward, _ = make_dataset(num_batches, BATCH_SIZE, MAX_ITER, s_model, q_model, p_model, model_memory, device, env, gamma, NUM_ACTIONS, DIMS, rand_prob)
    print('Avarage Reward: ', avg_reward)
    print('Action Variance: ', actions.var(0).mean())
    print('Avarage Start Value: ', value[0].mean())

    steps_per_epoch = num_batches*MAX_ITER
    num_samples = steps_per_epoch*BATCH_SIZE

    # Flatten every tensor to one row per sample once; the views are
    # loop-invariant and share storage with the originals.
    flat_mem = mem.view(-1, MEM_SIZE)
    flat_value = value.view(-1, 1)
    flat_target = targets.view(-1, DIMS)
    flat_action = actions.view(-1, DIMS)

    for e in range(num_epochs):
        start = time.time()
        # Fresh shuffle of all sample indices for this epoch.
        idc = torch.randperm(num_samples, device=device)
        sum_v = 0
        sum_p = 0
        for i in range(steps_per_epoch):
            base = i*BATCH_SIZE
            batch_idx = idc[base:base + BATCH_SIZE]
            batch_mem = flat_mem[batch_idx]
            batch_value = flat_value[batch_idx]
            batch_target = flat_target[batch_idx]
            batch_action = flat_action[batch_idx]
            # Clear stale gradients on all three networks before the backward pass.
            s_model.zero_grad()
            q_model.zero_grad()
            p_model.zero_grad()
            lossv, lossp = backward_one(batch_action, batch_target, batch_mem, batch_value, s_model, q_model, p_model, device)
            sum_v += lossv
            sum_p += lossp
            optimizer.step()
        # Divergence guard: inspect the networks that take part in training.
        # The pretrained `model` is never stepped here, so checking it (as the
        # previous version did) could not detect a corrupted update.
        for net in (s_model, q_model, p_model):
            for p in net.parameters():
                if torch.isnan(p).any():
                    raise Exception('Corrupted Model')
        print(sum_v / steps_per_epoch, sum_p / steps_per_epoch, time.time()-start)

In [39]:
# Outer training loop: every "bigstep" samples a fresh dataset of 200 batches
# and trains on it for 5 epochs (gamma=1, rand_prob=0.05).
for bigstep in range(100):
    print('Bigstep: ', bigstep)
    train_dataset(num_batches=200, num_epochs=5, gamma=1, rand_prob=0.05)
    #env.render()

Bigstep:  0
Avarage Reward:  tensor(1.6712, device='cuda:0')
Action Variance:  tensor(0.1102, device='cuda:0')
Avarage Start Value:  tensor(1.7157, device='cuda:0')
0.027358374819741585 0.05745071545476094 19.047433137893677
0.027358374771429227 0.05628209276823327 19.119175910949707
0.027358374804607594 0.055789627300109716 18.997759342193604
0.02735837478976464 0.05543964232201688 19.086837768554688
0.027358374827890657 0.05515270865871571 19.161126136779785
Bigstep:  1
Avarage Reward:  tensor(1.6704, device='cuda:0')
Action Variance:  tensor(0.1095, device='cuda:0')
Avarage Start Value:  tensor(1.7144, device='cuda:0')
0.027397648101323283 0.05427356669562869 19.101285696029663
0.027397648014011792 0.053851604112423956 18.965389013290405
0.02739764810161432 0.053563567093806344 19.0363609790802
0.027397648071637377 0.053317585923941806 19.009596824645996
0.027397648075711913 0.05312126914737746 19.13011646270752
Bigstep:  2
Avarage Reward:  tensor(1.6648, device='cuda:0')
Action Var

KeyboardInterrupt: 

In [42]:
# Evaluation run: 100 freshly generated batches with gamma=1, rand_prob=0 and
# a reduced action count of 8.
nacs = 8
obs, mem, value, actions, targets, avg_reward, p_ratio = make_dataset(
    100, BATCH_SIZE, MAX_ITER,
    s_model, q_model, p_model, model_memory,
    device, env,
    1, nacs, DIMS, 0,
)
for label, stat in (
    ('Avarage Reward: ', avg_reward),
    ('Action Variance: ', actions.var(0).mean()),
    ('Avarage Start Value: ', value[0].mean()),
    ('Ratio of Actions taken from policy: ', p_ratio),
):
    print(label, stat)
# Release the references to the large dataset tensors; `actions` is kept.
obs = mem = value = targets = None

Avarage Reward:  tensor(1.6697, device='cuda:0')
Action Variance:  tensor(0.0963, device='cuda:0')
Avarage Start Value:  tensor(1.7141, device='cuda:0')
Ratio of Actions taken from policy:  tensor(0.4216, device='cuda:0')


In [None]:
# Show the shape of `actions` as the cell output.
actions.shape

In [12]:
# Drop the notebook's reference to `actions` (presumably to release memory -- confirm).
actions=None

In [None]:
# Call render on the training environment.
# NOTE(review): what is drawn depends on MultiEnv.render, which is not visible here.
env.render()

In [None]:
# Echo the wrapped memory model as the cell output.
model_memory

In [None]:
# Reset the training environment; the value returned by reset() becomes the cell output.
env.reset()

In [None]:
# Separate environment for the baseline checks; `dims` is not passed here,
# so MultiEnv falls back to its own default.
testenv = MultiEnv(BATCH_SIZE, MAX_ITER, device)

In [None]:
# Baseline on testenv: at step k every batch element takes the same scalar
# action k/(MAX_ITER-1) (a linear sweep from 0 to 1), and the batch-mean
# reward of every step is summed.
cobs = testenv.reset()
# NOTE(review): cmem, h and v_old are assigned but never read in this cell.
cmem, h = model_memory(cobs, None)
r_sum = 0
v_old = 0
for k in range(MAX_ITER):
    sweep_value = k / (MAX_ITER-1)
    action = torch.tensor(
        [sweep_value], dtype=torch.float, device=device
    ).expand(BATCH_SIZE, 1)
    cobs, reward, _ = testenv.step(action)
    r_sum += reward.mean()
print(r_sum)

In [None]:
# Call render on the test environment.
testenv.render()

In [None]:
# Random-policy baseline: run `num_batches` episodes of `max_iter` steps with
# actions drawn by torch.rand, and print the summed batch-mean reward
# averaged over episodes.
num_batches = 1000
max_iter = 16
testenv = MultiEnv(BATCH_SIZE, max_iter, device, dims=DIMS)

r_sum = 0
for i in range(num_batches):
    cobs = testenv.reset()
    for k in range(max_iter):
        # The action width follows DIMS so it always matches the `dims` the
        # environment was built with (previously hard-coded to 2).
        action = torch.rand([BATCH_SIZE, DIMS], dtype=torch.float, device=device)
        cobs, reward, _ = testenv.step(action)
        r_sum += reward.mean()
print(r_sum/num_batches)

In [None]:

# Fixed-grid baseline: every episode visits the 16 centres of a 4x4 grid over
# the unit square, first coordinate varying fastest, and the summed batch-mean
# reward is averaged over episodes.
num_batches = 1000
max_iter = 16
testenv = MultiEnv(BATCH_SIZE, max_iter, device, dims=DIMS)

r_sum = 0
for i in range(num_batches):
    cobs = testenv.reset()
    for cell in range(4*4):
        # p indexes the second coordinate (slow), k the first (fast).
        p, k = divmod(cell, 4)
        action = torch.tensor(
            [k/4+1/8, p/4+1/8], dtype=torch.float, device=device
        ).expand(BATCH_SIZE, 2)
        cobs, reward, _ = testenv.step(action)
        r_sum += reward.mean()
print(r_sum/num_batches)

In [28]:
import wip.Code.train_q as train_q
import importlib

# importlib.reload re-executes the module, but names that were copied out of
# it earlier with `from wip.Code.train_q import make_dataset, backward_one`
# keep pointing at the old function objects. Rebind them explicitly so the
# notebook actually picks up the edited code.
train_q = importlib.reload(train_q)
make_dataset, backward_one = train_q.make_dataset, train_q.backward_one

# Keep the module as the last expression so the cell still displays it.
train_q

<module 'wip.Code.train_q' from '../../wip/Code/train_q.py'>