In [1]:
import os
import sys
import argparse
import time
import datetime
import json
import random
import numpy as np
import torch

from bnp_options import *
from utils import *
from eval import *
from train import *
from env.toy_env import ToyEnv
from env.grid_env import GridEnv
from env.line_env import LineEnv
from env.roboturk_env import RoboturkEnv
from env.atari_env import AtariEnv

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# Experiment configuration, kept as a plain dict (one entry per line so that
# diffs stay readable). In the cells shown here only 'random_seed',
# 'demo_file' and 'max_steps' are read back from it.
args = {
    'K': 1,
    'tolerance': 0.1,
    'hidden_layer_sizes_policy': [32, 32],
    'hidden_layer_sizes_termination': [32, 32],
    'LSTM_hidden_layer_size': 32,
    'LSTM_MLP_hidden_layer_sizes': [32, 32],
    'action_space': 'discrete',
    'learning_rate': 0.001,
    'clip': 5.0,
    'batch_size': 128,
    'max_epochs': 20,
    'random_seed': 0,
    'relaxation_type': 'GS',
    'temperature': 1.0,
    'temperature_ratio': 0.95,
    'env_type': 'atari',
    'nb_rooms': 5,
    'nb_traj': 1000,
    'noise_level': 0.0,
    'max_steps': 200,
    'demo_file': '../datasets/atari/seaquest/trajectories.npy',
    'baseline': True,
}

In [4]:
# --- Reproducibility -------------------------------------------------------
# A single configured seed drives everything: it seeds the global NumPy,
# `random` and PyTorch generators directly, and a master RandomState from
# which the per-purpose child seeds are drawn.
random_seed = args['random_seed']
np.random.seed(random_seed)  # global NumPy RNG (added after reproducibility problems)
random.seed(random_seed)
torch.manual_seed(random_seed)

rng_master = np.random.RandomState(random_seed)
# Draw order matters: env, then action, then split. Changing the order (or
# dropping one draw) would change every seed that follows it.
env_seed, action_seed, split_seed = (
    rng_master.randint(100000) for _ in range(3)
)
rng_env = np.random.RandomState(env_seed)
rng_split = np.random.RandomState(split_seed)

# --- Environment and demonstrations ---------------------------------------
demo_file = args['demo_file']
max_steps = args['max_steps']
# The game name is the directory that contains the trajectory file.
env_name = demo_file.split('/')[-2]
env = AtariEnv(
    f'{env_name.capitalize()}-ramNoFrameskip-v4',
    path=demo_file,
)
data = env.get_expert_trajectories(max_steps=max_steps)
train_data, test_data = split_train_test(data, rng_split)

Loading 1000 trajectories...


In [5]:
# Extend the import path with the sibling `multilevel_discovery` directory;
# the `models.*` / `inference.*` imports below presumably resolve through it.
# NOTE(review): the path is relative to the kernel's working directory, so
# this likely only works when the notebook is run from its own folder — confirm.
# NOTE(review): of the names imported here, only AtariRamModel is used in the
# cells visible in this notebook excerpt.
sys.path.append('../multilevel_discovery')
from models.GridWorldModelNN import GridWorldNNModel
from models.AtariRamModel import AtariRamModel
from models.AtariVisionModel import AtariVisionModel
from inference.HDQN import NNHDQN

In [6]:
# Re-pack the training split as a list of trajectories, each a list of
# (state, action) pairs cast to float64. train_data[0] and train_data[1] are
# zipped together trajectory by trajectory, then step by step.
# NOTE(review): this rebinds `train_data` from its own previous value, so the
# cell cannot be re-run without first re-running the split above.
train_data = [
    [
        (step_state.astype("float64"), step_action.astype("float64"))
        for step_state, step_action in zip(traj_states, traj_actions)
    ]
    for traj_states, traj_actions in zip(train_data[0], train_data[1])
]

In [18]:
# Build the RAM-based model.
#   5        -> presumably the number of options (the evaluation cell scores
#               options 0..4) — TODO confirm against AtariRamModel.__init__
#   statedim -> 1024 features per state (states are reshaped to (1, 1024) below)
#   actiondim-> 18 features per action (actions are reshaped to (1, 18) below)
model = AtariRamModel(
    5,
    statedim=(1024,),
    actiondim=(18,),
)

In [8]:
# Fit the model on the converted training trajectories.
# NOTE(review): the meaning of the positional arguments 1000 and 0 is defined
# by AtariRamModel.train, which is not visible here — confirm and name them.
model.train(train_data, 1000, 0)

  np.log(self.P[hp, h]) + \
  Xm = torch.tensor(Xm).float()
  Am = torch.tensor(Am).float()


In [12]:
# Persist the model to 'checkpoint.pth' in the current working directory.
# NOTE(review): what exactly gets serialized is decided by AtariRamModel.save — confirm.
model.save('checkpoint.pth')

In [22]:
# Restore the model from the 'checkpoint.pth' file written by model.save above.
# NOTE(review): presumably requires `model` to have been constructed with the
# same arguments as when the checkpoint was saved — confirm.
model.load('checkpoint.pth')

In [23]:
# Warm the emulator up so that the observation used below is not the very
# first frame: reset, repeat one fixed action for a fixed number of steps,
# then read the emulator RAM and unpack it into individual bits.
WARMUP_STEPS = 200   # number of environment steps taken before sampling
WARMUP_ACTION = 10   # action index repeated on every warm-up step

_ = env.reset()
for i in range(WARMUP_STEPS):
    obs, reward, done, info = env.step(WARMUP_ACTION)
    if done:
        # Stepping an episode that has already ended is undefined in the Gym
        # API, so start a fresh episode and keep warming up from there.
        _ = env.reset()
# Replace the step observation with the bit-unpacked RAM of the wrapped env.
obs = np.unpackbits(env.env.unwrapped._get_ram())

In [27]:
# Query the model with option index 3 (hard-coded) on the bit-unpacked RAM
# observation; the call returns a pair that is unpacked as
# (sub_action, termination).
# NOTE(review): the exact meaning/type of `termination` is not visible here — confirm.
sub_action, termination = model.play_from_observation(3, obs)

In [28]:
# Display the first value returned by play_from_observation.
sub_action

13

In [9]:
# Display the model's list of policy networks (presumably one per option — confirm).
model.policy_networks

[atariRAMModel(
   (dense1): Linear(in_features=1024, out_features=32, bias=True)
   (dense2): Linear(in_features=32, out_features=32, bias=True)
   (activation): Sigmoid()
   (output): Linear(in_features=32, out_features=18, bias=True)
 ),
 atariRAMModel(
   (dense1): Linear(in_features=1024, out_features=32, bias=True)
   (dense2): Linear(in_features=32, out_features=32, bias=True)
   (activation): Sigmoid()
   (output): Linear(in_features=32, out_features=18, bias=True)
 ),
 atariRAMModel(
   (dense1): Linear(in_features=1024, out_features=32, bias=True)
   (dense2): Linear(in_features=32, out_features=32, bias=True)
   (activation): Sigmoid()
   (output): Linear(in_features=32, out_features=18, bias=True)
 ),
 atariRAMModel(
   (dense1): Linear(in_features=1024, out_features=32, bias=True)
   (dense2): Linear(in_features=32, out_features=32, bias=True)
   (activation): Sigmoid()
   (output): Linear(in_features=32, out_features=18, bias=True)
 ),
 atariRAMModel(
   (dense1): Linear(

In [53]:
# Evaluate the trained model on the held-out trajectories: for every
# (state, action) step, take the best value model.evalpi assigns to the pair
# over all options, then report the mean over all steps.

# Number of options scored per step (option indices 0 .. NUM_OPTIONS - 1).
# This names the 0..4 range that used to be hard-coded in the loop.
NUM_OPTIONS = 5

score = 0.  # not read in this cell; kept in case a later cell expects it

# Change data format to fit multilevel discovery: a list of trajectories,
# each a list of (state, action) pairs cast to float64.
test_data2 = [[(state.astype("float64"), action.astype("float64"))
               for state, action in zip(states, actions)]
              for states, actions in zip(test_data[0], test_data[1])]

all_probs = []  # one list of per-step best values per trajectory
for traj in test_data2:
    traj_probs = []
    for (state, action) in traj:
        # evalpi is called on a one-step trajectory; keep the best option.
        max_prob = max(float(model.evalpi(option, [(state, action)]))
                       for option in range(NUM_OPTIONS))
        traj_probs.append(max_prob)
    all_probs.append(traj_probs)

# Average over individual steps. Flattening first keeps this well-defined
# when trajectories have different lengths (a ragged list of lists cannot be
# turned into a regular array); for equal-length trajectories the value is
# the same as the mean of the 2-D array.
print(np.mean([prob for probs in all_probs for prob in probs]))

0.34790477349455634


In [29]:
# Dump the parameter tensors of policy network 0 for visual inspection.
model.policy_networks[0].state_dict()

OrderedDict([('dense1.weight',
              tensor([[-0.0002,  0.0168, -0.0257,  ..., -0.0228, -0.0112,  0.0138],
                      [-0.0022, -0.0295, -0.0180,  ...,  0.0065,  0.0007, -0.0131],
                      [-0.0163, -0.0090, -0.0202,  ...,  0.0063,  0.0249,  0.0172],
                      ...,
                      [ 0.0268, -0.0201,  0.0290,  ...,  0.0025,  0.0172,  0.0145],
                      [-0.0220,  0.0131, -0.0269,  ...,  0.0319, -0.0130,  0.0046],
                      [-0.0001,  0.0221,  0.0296,  ...,  0.0086, -0.0364, -0.0158]])),
             ('dense1.bias',
              tensor([ 0.0072, -0.0152,  0.0193,  0.0197,  0.0359,  0.0387, -0.0053,  0.0299,
                      -0.0049, -0.0296, -0.0018,  0.0074,  0.0084, -0.0402, -0.0058,  0.0079,
                       0.0176, -0.0264, -0.0132, -0.0090,  0.0317, -0.0201, -0.0147, -0.0383,
                       0.0158, -0.0225, -0.0210,  0.0328,  0.0309,  0.0074,  0.0224,  0.0101])),
             ('dense2.weigh

In [28]:
# Dump the parameter tensors of policy network 1 for visual inspection
# (to compare against network 0's weights).
model.policy_networks[1].state_dict()

OrderedDict([('dense1.weight',
              tensor([[-0.0071, -0.0250,  0.0050,  ...,  0.0488,  0.0508, -0.0271],
                      [ 0.0156, -0.0078,  0.0002,  ..., -0.0189, -0.0384,  0.0224],
                      [-0.0161, -0.0152,  0.0037,  ..., -0.0062, -0.0498, -0.0100],
                      ...,
                      [ 0.0194, -0.0074,  0.0109,  ...,  0.0178, -0.0298,  0.0104],
                      [ 0.0125, -0.0283,  0.0254,  ...,  0.0183,  0.0548, -0.0311],
                      [ 0.0296, -0.0015, -0.0129,  ...,  0.0293, -0.0465, -0.0148]])),
             ('dense1.bias',
              tensor([ 0.0068, -0.0265, -0.0081, -0.0047,  0.0193,  0.0115,  0.0463,  0.0077,
                       0.0425, -0.0100,  0.0033, -0.0080,  0.0563, -0.0321, -0.0409,  0.0075,
                      -0.0061,  0.0752, -0.0193, -0.0416, -0.0348,  0.0383,  0.0400, -0.0157,
                       0.0417,  0.0523,  0.0438, -0.0599,  0.0500,  0.0588,  0.0378,  0.0291])),
             ('dense2.weigh

In [46]:
# Take the first step of the first test trajectory as a probe input; each
# step in test_data2 is a (state, action) pair.
state, action = test_data2[0][0]

In [47]:
# Score the probe (state, action) pair with policy network 0.
# The module instance is called directly instead of `.forward(...)`, so that
# nn.Module.__call__ runs (including any registered hooks); the computation
# itself is unchanged. Inputs are converted to float32 batches of size 1.
model.policy_networks[0](
    torch.tensor(state).float().reshape(1, 1024),
    torch.tensor(action).float().reshape(1, 18),
)

tensor([-0.0941], grad_fn=<SumBackward1>)

In [48]:
# Score the probe (state, action) pair with policy network 1.
# The module instance is called directly instead of `.forward(...)`, so that
# nn.Module.__call__ runs (including any registered hooks); the computation
# itself is unchanged. Inputs are converted to float32 batches of size 1.
model.policy_networks[1](
    torch.tensor(state).float().reshape(1, 1024),
    torch.tensor(action).float().reshape(1, 18),
)

tensor([-4.3658], grad_fn=<SumBackward1>)