In [1]:
from mingpt.utils import set_seed
import numpy as np
import scipy as sp
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from torch.utils.data import Dataset, DataLoader
from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN
import time

import routegym.env
import networkx as nx

In [2]:
node_number = 12

env = None
while env is None:
    try:
        A = sp.sparse.random(node_number, node_number, density=0.5, format='csr')
        A.data[:] = 1
        A = A.todense()
        A = np.ma.array(A, mask=np.eye(node_number)).filled(fill_value=0).astype(int)
        print("sparsity = %.2f" % (1 - np.sum(A)/A.size))
        G = nx.from_numpy_array(A)
        env = routegym.env.ShortestRouteEnv(G, 0, 5, random_weights=(1,10))
    except:
        pass
# env.render()
print(A)
print(env.graph.adj_mat)
# print(env.get_dijkstra())

sparsity = 0.56
[[0 0 1 1 0 0 0 1 1 0 0 0]
 [1 0 0 0 0 1 0 0 1 0 1 0]
 [1 1 0 1 0 1 0 1 1 1 1 1]
 [0 1 1 0 0 1 0 0 0 1 1 0]
 [1 1 1 0 0 1 0 1 0 1 1 0]
 [0 0 0 1 1 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 1 1]
 [1 1 0 0 1 1 0 0 1 0 0 0]
 [1 1 1 0 1 0 1 0 0 0 0 1]
 [1 0 1 0 1 0 0 0 0 0 0 0]
 [1 0 1 0 1 0 1 0 1 0 0 1]
 [1 1 1 1 1 0 1 1 1 0 1 0]]
[[-1  1  1  1  1 -1 -1  1  1  1  1  1]
 [ 1 -1  1  1  1  1  1  1  1 -1  1  1]
 [ 1  1 -1  1  1  1 -1  1  1  1  1  1]
 [ 1  1  1 -1 -1  1 -1 -1 -1  1  1  1]
 [ 1  1  1 -1 -1  1  1  1  1  1  1  1]
 [-1  1  1  1  1 -1 -1  1 -1 -1 -1 -1]
 [-1  1 -1 -1  1 -1 -1 -1  1 -1  1  1]
 [ 1  1  1 -1  1  1 -1 -1  1 -1 -1  1]
 [ 1  1  1 -1  1 -1  1  1 -1 -1  1  1]
 [ 1 -1  1  1  1 -1 -1 -1 -1 -1 -1 -1]
 [ 1  1  1  1  1 -1  1 -1  1 -1 -1  1]
 [ 1  1  1  1  1 -1  1  1  1 -1  1 -1]]


In [3]:
for _ in range(30):
    env.reset()

In [4]:
rew = 0
position = 0
env.reset()
done = False
print("Position: {", end='')
while not done:
    action = np.random.choice(np.arange(0, env.graph.adj_mat.shape[0]))
    position, reward, done, _ = env.step(action)
    # env.render()
    print("%d, " % position, end='')
    rew += reward
print("}\n")
print("Final reward: %.2f" % rew)
env.reset()
done

Position: {10, 8, 4, 8, 11, 6, 6, 6, 4, 5, }

Final reward: -9.00


True

In [5]:
env.reset()

num_steps = 10_000

def create_dataset():
    obss = []
    actions = []
    returns = [0]
    done_idxs = []
    stepwise_returns = []

    # simulate to create trajectories
    transitions_per_buffer = np.zeros(50, dtype=int)
    num_trajectories = 0
    while len(obss) < num_steps:
        done = False
        for _ in range(10):
            ac = np.random.choice(np.arange(0, env.graph.adj_mat.shape[0]))
            state, reward, done, _ = env.step(ac)
            obss += [state]
            actions += [ac]
            stepwise_returns += [reward]
            returns[-1] += reward
        # done = False
        env.reset()
        done_idxs += [len(obss)]
        returns += [0]

    actions = np.array(actions)
    returns = np.array(returns)
    stepwise_returns = np.array(stepwise_returns)
    done_idxs = np.array(done_idxs)

    # create reward-to-go dataset
    start_index = 0
    rtg = np.zeros_like(stepwise_returns)
    for i in done_idxs:
        i = int(i)
        curr_traj_returns = stepwise_returns[start_index:i]
        for j in range(i-1, start_index-1, -1): # start from i-1
            rtg_j = curr_traj_returns[j-start_index:i-start_index]
            rtg[j] = sum(rtg_j)
        start_index = i
    print('max rtg is %d' % max(rtg))

    # create timestep dataset
    start_index = 0
    timesteps = np.zeros(len(actions)+1, dtype=int)
    for i in done_idxs:
        i = int(i)
        timesteps[start_index:i+1] = np.arange(i+1 - start_index)
        start_index = i+1
    print('max timesteps is %d' % max(timesteps))

    return obss, actions, returns, done_idxs, rtg, timesteps

In [6]:
obss, actions, returns, done_idxs, rtgs, timesteps = create_dataset()
print(returns[:-1].max())
timesteps[25:41]

max rtg is 0
max timesteps is 10
-7.0


array([4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
class StateActionReturnDataset(Dataset):

    @staticmethod
    def get_default_config():
        C = CN()
        C.block_size = 10 * 3
        return C

    def __init__(self, data, block_size, actions, done_idxs, rtgs, timesteps):
        self.block_size = block_size
        self.vocab_size = max(actions) + 1
        self.data = data
        self.actions = actions
        self.done_idxs = done_idxs
        self.rtgs = rtgs
        self.timesteps = timesteps

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        block_size = self.block_size // 3
        done_idx = idx + block_size
        for i in self.done_idxs:
            if i > idx: # first done_idx is greater than idx
                done_idx = min(int(i), done_idx)
                break
        idx = done_idx - block_size
        states = torch.tensor(np.array(self.data[idx:done_idx]), dtype=torch.float32).reshape(block_size, -1) # (block_size, state_dim)
        actions = torch.tensor(self.actions[idx:done_idx], dtype=torch.long).unsqueeze(1) # (block_size, 1)
        rtgs = torch.tensor(self.rtgs[idx:done_idx], dtype=torch.float32).unsqueeze(1) # (block_size, 1)
        timesteps = torch.tensor(self.timesteps[idx:idx+1], dtype=torch.int64).unsqueeze(1) # (block_size, 1)

        return states, actions, rtgs, timesteps

In [8]:
C = CN()

# system
C.system = CN()
C.system.seed = 3407
C.system.work_dir = './out/decgpt'

# data
C.data = StateActionReturnDataset.get_default_config()

# model 
C.model = GPT.get_default_config()
C.model.model_type = 'gpt-mini'

# trainer
C.trainer = Trainer.get_default_config()
C.trainer.learning_rate = 5e-4
C.num_workers = 0

In [9]:
train_dataset = StateActionReturnDataset(obss, 10 * 3, actions, done_idxs, rtgs, timesteps)


In [10]:
C.model.vocab_size = train_dataset.vocab_size
C.model.block_size = train_dataset.block_size
C.model.max_timestep = max(timesteps)
C.model.max_timestep

10

In [11]:
model = GPT(C.model)

number of parameters: 2.68M


In [12]:
loader = DataLoader(train_dataset, shuffle=True, pin_memory=True, batch_size=4)

In [13]:
for _ in range(121):
    x, y, r, t = next(iter(loader))

In [14]:
x, y, r, t = next(iter(loader))

In [15]:
model(x, y, y, r, t)

(tensor([[[ 3.8666e-01,  2.1245e-01, -3.4453e-01,  4.4208e-01, -5.8358e-02,
           -2.9310e-01, -2.4434e-01,  3.8065e-01,  2.6474e-02,  4.3546e-03,
            5.3917e-01, -4.1469e-01],
          [ 5.5075e-02,  1.4508e-01, -1.7453e-01,  3.5379e-01, -1.7336e-02,
           -7.8334e-02, -7.2987e-02,  1.1662e-01, -1.3516e-01,  1.5920e-02,
            9.6956e-02, -2.7739e-01],
          [ 1.7238e-01,  1.7261e-01, -1.4925e-01,  5.2335e-01, -7.6339e-02,
           -3.5101e-01, -1.5231e-01,  3.2233e-01,  1.4798e-02,  5.9480e-02,
            5.2817e-01, -1.9191e-01],
          [ 1.4687e-01,  4.4773e-02, -3.4282e-01,  5.4251e-01, -1.3046e-01,
           -3.0370e-01, -1.5873e-01,  4.4630e-01, -3.3291e-01, -2.7482e-01,
            5.3937e-01, -4.4995e-01],
          [-2.9000e-02,  1.3836e-01, -2.4196e-01,  4.3282e-01, -2.7540e-01,
           -3.6383e-01, -1.4357e-01,  2.5607e-01, -1.4950e-01, -3.7607e-02,
            3.2920e-01, -2.4455e-01],
          [ 9.7448e-02,  4.7449e-02, -2.3938e-01, 

In [16]:
trainer = Trainer(model, train_dataset, C.trainer)

running on device cuda


In [17]:
trainer.run()

  self.pid = os.fork()
  self.pid = os.fork()
epoch 1 iter 155: train loss 1.02512. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 61.34it/s]
epoch 2 iter 155: train loss 1.07145. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.63it/s]
epoch 3 iter 155: train loss 0.84732. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.63it/s]
epoch 4 iter 155: train loss 0.83736. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.95it/s]
epoch 5 iter 155: train loss 0.90317. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.39it/s]
epoch 6 iter 155: train loss 0.66238. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.33it/s]
epoch 7 iter 155: train loss 0.59755. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.55it/s]
epoch 8 iter 155: train loss 0.58859. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.36it/s]
epoch 9 iter 155: train loss 0.58411. lr 5.000000e-04: 100%|██████████| 156/156 [00:02<00:00, 66.43it/s]
epoch 10 