In [14]:
from mingpt.utils import set_seed
import numpy as np
import scipy as sp
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from torch.utils.data import Dataset, DataLoader
from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN
import time

import routegym.env
import networkx as nx

In [2]:
node_number = 12

env = None
while env is None:
    try:
        A = sp.sparse.random(node_number, node_number, density=0.5, format='csr')
        A.data[:] = 1
        A = A.todense()
        A = np.ma.array(A, mask=np.eye(node_number)).filled(fill_value=0).astype(int)
        print("sparsity = %.2f" % (1 - np.sum(A)/A.size))
        G = nx.from_numpy_array(A)
        env = routegym.env.ShortestRouteEnv(G, 0, 5, random_weights=(1,10))
    except:
        pass
# env.render()
print(A)
print(env.graph.adj_mat)
# print(env.get_dijkstra())

sparsity = 0.53
[[0 1 1 1 1 0 0 0 1 0 1 1]
 [0 0 1 1 0 0 0 0 1 1 0 1]
 [1 0 0 1 1 1 0 0 0 1 0 1]
 [1 1 1 0 0 0 1 1 1 0 1 0]
 [1 1 0 0 0 1 0 1 0 0 0 1]
 [1 1 1 1 1 0 0 0 1 1 1 0]
 [0 1 1 1 1 1 0 1 0 0 1 0]
 [0 0 1 0 1 1 0 0 1 0 1 0]
 [0 1 1 0 0 0 0 1 0 1 0 1]
 [0 1 0 0 0 0 1 1 0 0 1 1]
 [0 0 0 0 1 0 0 0 0 1 0 1]
 [0 1 1 1 0 0 0 1 0 0 0 0]]
[[-1  1  1  1  1  1 -1 -1  1 -1  1  1]
 [ 1 -1  1  1  1  1  1 -1  1  1 -1  1]
 [ 1  1 -1  1  1  1  1  1  1  1 -1  1]
 [ 1  1  1 -1 -1  1  1  1  1 -1  1  1]
 [ 1  1  1 -1 -1  1  1  1 -1 -1  1  1]
 [ 1  1  1  1  1 -1  1  1  1  1  1 -1]
 [-1  1  1  1  1  1 -1  1 -1  1  1 -1]
 [-1 -1  1  1  1  1  1 -1  1  1  1  1]
 [ 1  1  1  1 -1  1 -1  1 -1  1 -1  1]
 [-1  1  1 -1 -1  1  1  1  1 -1  1  1]
 [ 1 -1 -1  1  1  1  1  1 -1  1 -1  1]
 [ 1  1  1  1  1 -1 -1  1  1  1  1 -1]]


In [3]:
for _ in range(30):
    env.reset()

In [4]:
rew = 0
position = 0
env.reset()
done = False
print("Position: {", end='')
while not done:
    action = np.random.choice(np.arange(0, env.graph.adj_mat.shape[0]))
    position, reward, done, _ = env.step(action)
    # env.render()
    print("%d, " % position, end='')
    rew += reward
print("}\n")
print("Final reward: %.2f" % rew)
env.reset()
done

Position: {1, 4, 4, 4, 6, 6, 6, 6, 7, 10, 6, 4, 10, 10, 7, 10, 6, 6, 7, 6, 4, 11, 11, 11, 0, 5, }

Final reward: -25.00


True

In [5]:
env.reset()

num_steps = 10_000

def create_dataset():
    obss = []
    actions = []
    returns = [0]
    done_idxs = []
    stepwise_returns = []

    # simulate to create trajectories
    transitions_per_buffer = np.zeros(50, dtype=int)
    num_trajectories = 0
    while len(obss) < num_steps:
        done = False
        for _ in range(10):
            ac = np.random.choice(np.arange(0, env.graph.adj_mat.shape[0]))
            state, reward, done, _ = env.step(ac)
            obss += [state]
            actions += [ac]
            stepwise_returns += [reward]
            returns[-1] += reward
        # done = False
        env.reset()
        done_idxs += [len(obss)]
        returns += [0]

    actions = np.array(actions)
    returns = np.array(returns)
    stepwise_returns = np.array(stepwise_returns)
    done_idxs = np.array(done_idxs)

    # create reward-to-go dataset
    start_index = 0
    rtg = np.zeros_like(stepwise_returns)
    for i in done_idxs:
        i = int(i)
        curr_traj_returns = stepwise_returns[start_index:i]
        for j in range(i-1, start_index-1, -1): # start from i-1
            rtg_j = curr_traj_returns[j-start_index:i-start_index]
            rtg[j] = sum(rtg_j)
        start_index = i
    print('max rtg is %d' % max(rtg))

    # create timestep dataset
    start_index = 0
    timesteps = np.zeros(len(actions)+1, dtype=int)
    for i in done_idxs:
        i = int(i)
        timesteps[start_index:i+1] = np.arange(i+1 - start_index)
        start_index = i+1
    print('max timesteps is %d' % max(timesteps))

    return obss, actions, returns, done_idxs, rtg, timesteps

In [6]:
obss, actions, returns, done_idxs, rtgs, timesteps = create_dataset()
print(returns[:-1].max())
timesteps[25:41]

max rtg is 0
max timesteps is 10
-6.0


array([4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
class StateActionReturnDataset(Dataset):

    @staticmethod
    def get_default_config():
        C = CN()
        C.block_size = 10 * 3
        return C

    def __init__(self, data, block_size, actions, done_idxs, rtgs, timesteps):
        self.block_size = block_size
        self.vocab_size = max(actions) + 1
        self.data = data
        self.actions = actions
        self.done_idxs = done_idxs
        self.rtgs = rtgs
        self.timesteps = timesteps

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        block_size = self.block_size // 3
        done_idx = idx + block_size
        for i in self.done_idxs:
            if i > idx: # first done_idx is greater than idx
                done_idx = min(int(i), done_idx)
                break
        idx = done_idx - block_size
        states = torch.tensor(np.array(self.data[idx:done_idx]), dtype=torch.float32).reshape(block_size, -1) # (block_size, state_dim)
        actions = torch.tensor(self.actions[idx:done_idx], dtype=torch.long).unsqueeze(1) # (block_size, 1)
        rtgs = torch.tensor(self.rtgs[idx:done_idx], dtype=torch.float32).unsqueeze(1) # (block_size, 1)
        timesteps = torch.tensor(self.timesteps[idx:idx+1], dtype=torch.int64).unsqueeze(1) # (block_size, 1)

        return states, actions, rtgs, timesteps

In [8]:
C = CN()

# system
C.system = CN()
C.system.seed = 3407
C.system.work_dir = './out/decgpt'

# data
C.data = StateActionReturnDataset.get_default_config()

# model 
C.model = GPT.get_default_config()
C.model.model_type = 'gpt-mini'

# trainer
C.trainer = Trainer.get_default_config()
C.trainer.learning_rate = 5e-4

In [9]:
train_dataset = StateActionReturnDataset(obss, 10 * 3, actions, done_idxs, rtgs, timesteps)


In [10]:

C.model.vocab_size = train_dataset.vocab_size
C.model.block_size = train_dataset.block_size
C.model.max_timestep = max(timesteps)
C.model.max_timestep

10

In [11]:
model = GPT(C.model)

number of parameters: 2.68M


In [12]:
x, y, r, t = train_dataset[np.random.randint(0, 10_000, 1)]
t

tensor([[9]])

In [1]:
loader = DataLoader(train_dataset, shuffle=True, pin_memory=True, batch_size=4)

SyntaxError: unmatched ')' (642224348.py, line 1)

In [13]:
model(x, y, y, r, t)

torch.Size([10, 192])


RuntimeError: The expanded size of the tensor (1) must match the existing size (10) at non-singleton dimension 1.  Target sizes: [10, 1, 192].  Tensor sizes: [10, 192]