In [1]:
from mingpt.utils import set_seed
import numpy as np
import scipy as sp
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
from torch.utils.data import Dataset
from mingpt.model import GPT
from mingpt.trainer import Trainer
from mingpt.utils import set_seed, setup_logging, CfgNode as CN

import routegym.env
import networkx as nx

In [2]:
node_number = 12
max_attempts = 1_000  # upper bound on graph re-sampling before giving up

env = None
for attempt in range(1, max_attempts + 1):
    # Sample a sparse random 0/1 adjacency pattern and drop self-loops.
    A = sp.sparse.random(node_number, node_number, density=0.1, format='csr')
    A.data[:] = 1
    A = A.toarray().astype(int)
    np.fill_diagonal(A, 0)
    print("sparsity = %.2f" % (1 - np.sum(A)/A.size))
    G = nx.from_numpy_array(A)
    try:
        # The constructor may reject a sampled graph (the recorded run needed
        # two samples), so only this call is retried; failures in the matrix
        # generation above are real bugs and are allowed to propagate.
        env = routegym.env.ShortestRouteEnv(G, 0, 5, random_weights=(1,10))
        break
    except Exception as exc:
        # Report why the sample was rejected instead of failing silently, and
        # let KeyboardInterrupt / SystemExit through (a bare `except` would not).
        print("attempt %d rejected: %r" % (attempt, exc))
else:
    raise RuntimeError(
        "could not build a ShortestRouteEnv after %d random graphs" % max_attempts
    )
# env.render()
print(A)
print(env.graph.adj_mat)
env.get_dijkstra()

sparsity = 0.91
sparsity = 0.91
[[0 0 1 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0]
 [0 1 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 1 0]]
[[-1 -1  1 -1 -1 -1 -1 -1  1  1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1]
 [ 1 -1 -1 -1 -1  1 -1 -1 -1 -1 -1  1]
 [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1]
 [-1 -1  1 -1 -1 -1 -1  1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1  1]
 [-1  1 -1 -1 -1  1 -1 -1 -1 -1 -1 -1]
 [ 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1  1]
 [ 1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
 [-1 -1 -1 -1 -1 -1  1 -1 -1 -1 -1  1]
 [-1 -1  1 -1  1 -1  1 -1  1 -1  1 -1]]


([0, 2, 5], 2)

In [3]:
# Sanity check: walk the environment with uniformly random actions until the
# episode terminates, printing every position returned by `env.step`.
env.reset()
position, rew, done = 0, 0, False

print("Position: {", end='')
while not done:
    # Any node index may be proposed as the next action.
    action = np.random.choice(np.arange(env.graph.adj_mat.shape[0]))
    position, reward, done, _ = env.step(action)
    # env.render()
    print("%d, " % position, end='')
    rew += reward
print("}\n")

print("Final reward: %.2f" % rew)

# Leave the environment freshly reset for the following cells, and show the
# terminal flag as the cell output.
env.reset()
done

Position: {0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 0, 2, 11, 11, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 8, 8, 8, 8, 8, 11, 11, 10, 11, 11, 11, 11, 11, 8, 8, 8, 8, 8, 11, 4, 4, 4, 11, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 4, 4, 4, 4, 4, 4, 4, 4, 4, 11, 6, 6, 6, 6, 11, 11, 11, 11, 6, 6, 6, 6, 6, 11, 11, 11, 11, 11, 11, 11, 11, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, }

Final reward: -139.00


True

In [47]:
# Minimum number of environment transitions to collect; `create_dataset`
# keeps rolling out whole episodes until at least this many are stored.
num_steps = 135_000

# Start data collection from a fresh episode.
env.reset()

def create_dataset(min_steps=None):
    """Collect random-policy trajectories from the global `env`.

    Whole episodes are rolled out with uniformly random actions until at
    least `min_steps` transitions have been stored (the last episode is always
    completed, so the result is usually slightly longer).

    Args:
        min_steps: minimum number of transitions to collect. When ``None``
            (the default) the module-level `num_steps` is used, which is the
            behaviour of the original zero-argument call.

    Returns:
        A tuple ``(obss, actions, returns, done_idxs, rtg, timesteps)``:

        * ``obss`` - list with the first element returned by ``env.step`` for
          every transition.
        * ``actions`` - array of the action taken at every transition.
        * ``returns`` - array of per-episode reward sums, followed by one
          trailing ``0`` placeholder (so its length is ``n_episodes + 1``).
        * ``done_idxs`` - array of exclusive end indices of each episode.
        * ``rtg`` - array of reward-to-go for every transition.
        * ``timesteps`` - array of length ``len(actions) + 1`` holding the
          within-episode step index of every transition; the final padding
          slot stays ``0``.
    """
    target_steps = num_steps if min_steps is None else min_steps

    obss = []
    actions = []
    returns = [0]
    done_idxs = []
    stepwise_returns = []

    # Every node index is a candidate action; the set does not change between
    # steps, so build it once.
    candidate_actions = np.arange(0, env.graph.adj_mat.shape[0])

    # Do not rely on an earlier cell having left the shared env freshly reset.
    env.reset()

    # simulate to create trajectories
    while len(obss) < target_steps:
        done = False
        while not done:
            ac = np.random.choice(candidate_actions)
            state, reward, done, _ = env.step(ac)
            # NOTE(review): `state` is what env.step returns *after* applying
            # `ac`, so obss[t] is paired with the action that produced it, not
            # the action chosen from it - confirm this is the intended alignment.
            obss.append(state)
            actions.append(ac)
            stepwise_returns.append(reward)
            returns[-1] += reward
        env.reset()
        done_idxs.append(len(obss))
        returns.append(0)

    actions = np.array(actions)
    returns = np.array(returns)
    stepwise_returns = np.array(stepwise_returns)
    done_idxs = np.array(done_idxs)

    print(len(stepwise_returns))

    # create reward-to-go dataset
    # A reversed cumulative sum gives sum(rewards[j:end]) for every j in one
    # pass instead of re-summing a slice per step.
    rtg = np.zeros_like(stepwise_returns)
    start_index = 0
    for end_index in done_idxs:
        end_index = int(end_index)
        episode_rewards = stepwise_returns[start_index:end_index]
        rtg[start_index:end_index] = np.cumsum(episode_rewards[::-1])[::-1]
        start_index = end_index
    print('max rtg is %d' % rtg.max())

    print(len(actions))

    # create timestep dataset
    # done_idxs are exclusive episode ends (same convention as the
    # reward-to-go loop above), so each episode occupies [start, end) and
    # restarts its counter at 0. Using end+1 here would shift every later
    # episode by one slot.
    timesteps = np.zeros(len(actions)+1, dtype=int)
    start_index = 0
    for end_index in done_idxs:
        end_index = int(end_index)
        timesteps[start_index:end_index] = np.arange(end_index - start_index)
        start_index = end_index
    print('max timesteps is %d' % timesteps.max())

    return obss, actions, returns, done_idxs, rtg, timesteps

In [52]:
# Roll out random-action episodes and unpack the pieces of the offline dataset.
obss, actions, returns, done_idxs, rtgs, timesteps = create_dataset()
# Cell output: shape of the per-episode return array.
returns.shape

135166
max rtg is 0
135166
max timesteps is 1877


(1077,)

In [13]:
# Experiment configuration tree, built in one expression through the
# CfgNode keyword constructor.
C = CN(
    # system
    system=CN(
        seed=3407,
        work_dir='./out/decgpt',
    ),
)

# data
# C.data = StateActionReturnDataset()
