This is an implementation of PPO-clip for path selection in symbolic execution.
Each epoch we communicate with a jar file for data gathering and with wandb for logging.

### Imports, meta

In [None]:
# %%capture

from IPython.display import Javascript


def resize_colab_cell():
    """Ask the Colab front end to limit the output frame (maxHeight: 600)."""
    resize_script = Javascript(
        'google.colab.output.setIframeHeight(0, true, {maxHeight: 600})'
    )
    display(resize_script)


# Run the resize hook before every cell execution.
get_ipython().events.register('pre_run_cell', resize_colab_cell)

import numpy as np
from numpy import random
import copy
import inspect
import torch
from torch import nn
import torch.onnx
import json
from tqdm import tqdm, trange
from time import time
import os
import sklearn
from sklearn import tree
import math

# !pip install wandb
import wandb

# !pip install onnx==1.12
# import onnx

# Record the algorithm name in the config file next to the jar
# (presumably read by the jar at start-up -- confirm on the Java side).
with open('../Game_env/jar_config.txt', 'w') as jar_config:
    json.dump({"algorithm": "PPO"}, jar_config)

### Args (potentially immutable), login

In [None]:
# %%capture
# Authenticate with Weights & Biases before any run is created.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Currently logged in as: [33mandrey_podivilov[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Default batch size: number of state ids drawn per batch and upper bound on the
# total number of queue rows packed into one batch (see Trajectories.sample_batch).
batch_size = 4096
# Run on GPU when available, otherwise on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Discount factor used for Monte-Carlo returns and TD targets.
td_gamma=0.99
# Dataset file loaded by Trajectories (presumably rewritten by each jar run -- confirm).
json_path = '../Data/current_dataset.json'
# If True, the per-step discount exponent is read from the 'is_cfg_fork' field
# of each step instead of being 1 for every step.
use_fork_discount = False

# Shell command that runs the usvm jar; its stdout is redirected to ../Game_env/jar_log.txt.
# NOTE(review): the Java binary path is absolute and machine-specific.
jar_command = '/home/st-andrey-podivilov/java16/usr/lib/jvm/bellsoft-java16-amd64/bin/java -jar ../Game_env/usvm-jvm/build/libs/usvm-jvm-new.jar > ../Game_env/jar_log.txt'

# Notebook cell output: show the selected device.
device

<IPython.core.display.Javascript object>

'cuda'

### Models, modules

In [None]:
class FFM_layer(torch.nn.Module):
    """
    Sin/cos feature expansion: maps x to the concatenation [x, sin(x), cos(x)]
    along the last dimension, tripling its size.

    A frozen, normally-initialised square linear map (`fourier_matrix`) is
    created and registered as a submodule, but the forward pass currently
    bypasses it and applies sin/cos to the raw input.
    """

    def __init__(self, input_dim):
        super().__init__()
        assert input_dim%2 == 0, 'even input_dim is more convenient'
        projection = torch.nn.Linear(input_dim, int(input_dim), bias=False)
        # Overwrite the default init with N(0, 1/input_dim) and freeze the weights.
        nn.init.normal_(projection.weight, std=1/np.sqrt(input_dim))
        projection.weight.requires_grad_(False)
        self.fourier_matrix = projection

    def forward(self, x):
        # The random projection is not applied here (its call is disabled),
        # so sin/cos act directly on the input.
        return torch.cat([x, torch.sin(x), torch.cos(x)], dim=-1)

def get_model_setup(use_FFM=False,
                    lr = 3e-4,
                    wd=0.1,
                    ):
    """
    Build an MLP with a single scalar output and its AdamW optimizer.

    Args:
        use_FFM: if True, insert an FFM_layer (sin/cos feature expansion) after
            the LayerNorm; otherwise an identity is used in its place.
        lr: AdamW learning rate.
        wd: AdamW weight decay. (Previously this argument was ignored and 0.1
            was always used.)

    Returns:
        (mlp, mlp_opt): the network, moved to `device`, and its optimizer.
    """
    mlp = nn.Sequential(
        # Lazy layers infer their input width on the first forward pass, so the
        # same definition works with and without the FFM expansion.
        nn.LazyLinear(512),
        nn.ReLU(),
        nn.Linear(512,256),
        nn.LayerNorm(256),
        FFM_layer(256) if use_FFM else nn.Identity(),
        nn.LazyLinear(1024),
        nn.ReLU(),
        nn.Linear(1024,1024),
        nn.ReLU(),
        nn.Linear(1024,1),
    ).to(device)
    mlp_opt = torch.optim.AdamW(mlp.parameters(), lr=lr, weight_decay=wd, betas=(0.9, 0.99))
    return mlp, mlp_opt

# to check features' strength
# (in the visible part of the notebook it is only referenced from the
# commented-out "Fit a tree" cell)
r_tree = tree.DecisionTreeRegressor(max_depth=1000, )

<IPython.core.display.Javascript object>

### Data

In [None]:
class Trajectories:
    """
    Loads a JSON dataset of trajectories and exposes it as tensors.

    Attributes:
        realized_tensors: list of 5 tensors [features, features_next, rewards,
            Returns, is_last], one row per visited (chosen) state of every
            trajectory that is NOT selected by `eval_condition`.
        queues: list with one tensor per visited state holding the feature rows
            of the queue at the following step (a single all-zero row for the
            last step of a trajectory).
        feature_names / feature_names2ids: names of the state features
            (`scheme[0]` of the JSON file) and their column indices.

    Action and state embeddings are effectively the same, fyi.
    """

    def __init__(self,
                 path=json_path,
                 td_gamma=td_gamma,
                 eval_condition = (lambda x: x%5==0),
                ):
        """
        Args:
            path: JSON dataset file to load.
            td_gamma: discount factor used when computing returns.
            eval_condition: predicate applied to the first element of every
                path entry; paths for which it is true are held out of the
                training tensors and used by `evaluate_data` by default.
        """
        self.eval_condition = eval_condition
        self.td_gamma = td_gamma
        # Context manager so the file handle is closed right after parsing.
        with open(path) as dataset_file:
            self.j_file = json.load(dataset_file)
        self.feature_names = self.j_file['scheme'][0]
        self.feature_names2ids = {name: i for i, name in enumerate(self.feature_names)}
        # list of 5 tensors f, f_n, r, R, is_last + list of queue tensors
        self.realized_tensors, self.queues = self.j2torch(self.j_file)

    def j2torch(self, j_file):
        """
        Transform the parsed JSON into data tensors.

        Returns:
            ([features, features_next, rewards, Returns, is_last], queues);
            the five tensors live on `device`, the queue tensors are left where
            torch.Tensor creates them.
        """
        scheme = j_file['scheme']
        chosen_idx = scheme.index('chosenStateId')
        reward_idx = scheme.index('reward')
        fork_idx = scheme.index('is_cfg_fork') if use_fork_discount else None

        features, features_next, rewards, Returns, is_last, queues = [], [], [], [], [], []
        for path in j_file['paths']:
            if self.eval_condition(path[0]):
                continue  # held-out trajectory
            steps = path[1]
            if not steps:
                continue  # nothing to learn from an empty trajectory

            tr_rewards = [step[reward_idx] for step in steps]
            if use_fork_discount:
                do_discount = [step[fork_idx] for step in steps]
            else:
                # Plain Python exponents keep the return arithmetic in floats
                # (same as the fork branch) instead of per-step 0-dim tensors.
                do_discount = [1] * len(steps)
            rewards += tr_rewards
            Returns += self.tr_rewards_to_returns(tr_rewards, do_discount)

            # Features of the state that was picked from the queue at each step.
            tr_features = [step[0][step[chosen_idx]] for step in steps]
            features += tr_features
            # The last state has no successor: use a row of -1 as a placeholder.
            features_next += tr_features[1:] + [[-1]*len(tr_features[0])]
            is_last += [0]*(len(tr_features)-1) + [1]

            # Queue stored for step i is the queue observed at step i+1; the
            # last step gets a single all-zero row.
            queues += [torch.Tensor(step[0]) for step in steps[1:]]
            queues.append(torch.zeros_like(torch.Tensor([steps[0][0][0]])))

        rewards = torch.Tensor(rewards).to(device)
        features = torch.Tensor(features).to(device)
        features_next = torch.Tensor(features_next).to(device)
        Returns = torch.Tensor(Returns).to(device)
        is_last = torch.Tensor(is_last).to(device)
        return [features, features_next, rewards, Returns, is_last], queues

    @property
    def n_states(self):
        """Number of visited states in the training tensors."""
        return len(self.realized_tensors[-1])

    def get_properties(self):
        """Return a small dict of dataset statistics (queue length, sizes)."""
        longest_queue_ids = np.argmax(np.array([q.shape[0] for q in self.queues]))
        prop = {
            'queue max length, idx': (self.queues[longest_queue_ids].shape[0], longest_queue_ids),
            'total number of states': self.n_states,
            'number of traj-s': len(self.j_file['paths']),
            'number of validation traj-s': sum([self.eval_condition(tr[0]) for tr in self.j_file['paths']]),
            }
        # assert self.queues[longest_queue_ids].shape[0] < batch_size/5, 'not that i.i.d sampling, better rewrite?'
        return prop

    def tr_rewards_to_returns(self, tr_rewards, do_discount):
        """
        Discounted returns of one trajectory:
        R[i] = r[i] + td_gamma ** do_discount[i] * R[i+1], with R[last] = r[last].

        Returns an empty list for an empty trajectory.
        """
        if not tr_rewards:
            return []
        tr_R = list(tr_rewards)
        for i in range(len(tr_rewards)-2, -1, -1):
            tr_R[i] = tr_rewards[i] + (self.td_gamma**do_discount[i]) * tr_R[i+1]
        return tr_R

    def sample_batch(self, batch_size=batch_size):
        """
        Queues are diverse in length, but we want them in one batch still.

        Draws `batch_size` random state ids and keeps them, in order, until the
        concatenated queues would exceed `batch_size` rows.

        Returns:
            (features, features_next, rewards, Returns, is_last, queues, bins):
            the five per-state tensors restricted to the kept ids, all kept
            queues concatenated row-wise, and `bins`, the cumulative row
            offsets of each queue inside the concatenation (starting at 0).

        Raises:
            ValueError: if the very first sampled queue is already longer than
                `batch_size`, so the batch would be empty.
        """
        candidate_ids = random.choice(self.n_states, size=batch_size).tolist()
        ids, sampled_queues, bins = [], [], [0]
        # naive: stop at the first queue that does not fit
        for idx in candidate_ids:
            queue = self.queues[idx]
            if bins[-1] + queue.shape[0] > batch_size:
                break
            bins.append(bins[-1] + queue.shape[0])
            ids.append(idx)
            sampled_queues.append(queue)
        if not sampled_queues:
            raise ValueError('sampled queue is longer than batch_size; cannot build a batch')

        ids = torch.LongTensor(ids).to(device)
        bins = torch.LongTensor(bins).to(device)
        sampled_queues = torch.cat(sampled_queues).to(device)

        sampled_realized = [t[ids] for t in self.realized_tensors]
        return (*sampled_realized, sampled_queues, bins)

    def update_data_on_path(self, path, model):
        """
        Communication with jar file on a server.

        Exports `model` to '../Game_env/model.onnx' and runs `jar_command`.

        NOTE: `path` is accepted but not used here; where the jar writes its
        output is decided by the jar itself. The model is left in eval mode.
        """
        x = torch.randn(1, self.realized_tensors[0][0].shape[0], requires_grad=True).to(device)
        torch_model = model.eval()
        # Run one forward pass before exporting (this also initialises lazy
        # layers such as nn.LazyLinear, if the model has any).
        torch_model(x)
        torch.onnx.export(torch_model,
                          x,
                          '../Game_env/model.onnx',
                          opset_version=13,
                          export_params=True,
                          input_names = ['input'],   # the model's input names
                          output_names = ['output'],
                          dynamic_axes={'input' : {0 : 'batch_size'},    # variable length axes
                                        'output' : {0 : 'batch_size'},
                                        },
                          )

        exit_status = os.system(jar_command)
        if exit_status != 0:
            # Do not abort the run, but make a failed data-gathering step visible.
            print(f'Warning: jar command exited with status {exit_status}')

    def evaluate_data(self,
             factors=torch.Tensor([1, 0.99, 0.95]),
             eval_condition = None,
             verbose=True,
             wandb_prefix = 'val',
             ):
        """
        Compute per-trajectory discounted returns for every discount factor and
        optionally log summary statistics to wandb.

        Args:
            factors: iterable of discount factors.
            eval_condition: predicate on the first element of a path entry that
                selects the trajectories to evaluate; defaults to
                `self.eval_condition`.
            verbose: if True, log the statistics of every factor to wandb.
            wandb_prefix: prefix of the logged keys.

        Returns:
            The log dict of the LAST factor, extended with the raw 'Returns'
            and trajectory lengths ({} if `factors` is empty).
        """
        if eval_condition is None:
            eval_condition = self.eval_condition
        scheme = self.j_file['scheme']
        rewards_idx = scheme.index('reward')
        fork_idx = scheme.index('is_cfg_fork') if use_fork_discount else None

        selected = [path[1] for path in self.j_file['paths'] if eval_condition(path[0])]
        tr_lengths = [len(tr) for tr in selected]
        size = sum(tr_lengths)

        log = {}
        for f in factors:
            trs_R = []
            for tr in selected:
                tr_return = 0
                for step in reversed(tr):
                    exponent = step[fork_idx] if use_fork_discount else 1
                    tr_return = step[rewards_idx] + (f ** exponent) * tr_return
                trs_R.append(tr_return)
            log = {}
            log[f'{wandb_prefix} size'] = size
            log[f'{wandb_prefix}_eval/mean {f:.2f} discount '] = torch.Tensor(trs_R).mean()
            log[f'{wandb_prefix}_eval/median {f:.2f} discount '] = torch.Tensor(trs_R).median()
            # log[f'{wandb_prefix}_eval/95_quanile {f:.2f} discount'] = torch.Tensor(trs_R).quantile(q=0.95)
            log[f'{wandb_prefix} Return by trjs {f:.2f} hist (previous epoch)'] = wandb.Histogram(np_histogram=np.histogram(trs_R, bins=40, ))
            # log[f'{wandb_prefix} lengths hist'] = wandb.Histogram(np_histogram=np.histogram(tr_lengths, bins=30, ))
            if verbose:
                wandb.log(log.copy())
            # Raw values are returned to the caller but not sent to wandb.
            log['Returns'] = trs_R
            log[f'{wandb_prefix} lengths'] = tr_lengths
        return log

<IPython.core.display.Javascript object>

### Logger

In [None]:
class Logger:
    """
    Supporting class, to be expanded.
    Stores logging methods and relevant data: references to the actor/critic
    and their optimizers, snapshots of their gradients and weights, and a
    `timer` dict used by the trainer to accumulate wall-clock timings.
    """
    def __init__(
        self,
        NN_setup,
        batch_size=batch_size,
        between_logs = 10,
    ):
        """
        Args:
            NN_setup: dict with keys 'actor', 'actor_opt', 'critic', 'critic_opt'.
            batch_size: accepted for signature compatibility; not used here.
            between_logs: stored as `self.between_logs`; read by the trainer to
                decide how often to log.
        """
        self.actor = NN_setup['actor']
        self.actor_opt = NN_setup['actor_opt']
        self.critic = NN_setup['critic']
        self.critic_opt = NN_setup['critic_opt']
        self.grad_a = None
        self.grad_c = None
        self.weight_a = None
        self.weight_c = None
        # self.running_grad_mean = torch.zeros(len([p for p in self.model.parameters() if p.requires_grad])).to(device)
        # self.running_grad2_mean = torch.zeros(len([p for p in self.model.parameters() if p.requires_grad])).to(device)
        self.log_gamma = torch.tensor(0.95).to(device)
        self.between_logs = between_logs
        self.timer = {}

    @staticmethod
    def _trainable_grads(model):
        """Detached gradients of trainable parameters that already have one."""
        return [p.grad.detach() for p in model.parameters()
                if p.requires_grad and p.grad is not None]

    @torch.no_grad()
    def link_models(self,):
        """
        Refresh the gradient and weight snapshots of the actor and the critic.

        Trainable parameters whose `.grad` is still None (no backward pass has
        reached them yet) are skipped instead of raising.
        """
        self.grad_a = self._trainable_grads(self.actor)
        self.weight_a = [p.detach() for p in self.actor.parameters()]
        self.grad_c = self._trainable_grads(self.critic)
        self.weight_c = [p.detach() for p in self.critic.parameters()]

    @torch.no_grad()
    def list_norm(self, l, p=2):
        """Return the p-norm of a list of tensors viewed as one long vector (0.0 for an empty list)."""
        if not l:
            return 0.0
        n = 0
        for t in l:
            n += t.detach().norm(p) ** p
        return n.item() ** (1/p)

    @torch.no_grad()
    def list_cos_dist(self, a, b):
        """
        Cosine of the angle between two lists of tensors viewed as vectors.

        NOTE: despite the name, this returns the cosine similarity
        (dot / (|a| * |b|)), not 1 minus it. Raises ZeroDivisionError if either
        norm is zero.
        """
        a_norm = self.list_norm(a, 2)
        b_norm = self.list_norm(b, 2)
        product = sum([torch.dot(torch.flatten(a[i]), torch.flatten(b[i])).item() for i in range(len(a))])
        return product/(a_norm*b_norm)

    @torch.no_grad()
    def on_list(self, a, b, operation):
        """Apply a binary `operation` element-wise to two equally long lists."""
        assert len(a) == len(b), 'lists lengths differ'
        return [operation(x, y) for x, y in zip(a, b)]

    @torch.no_grad()
    def running_mean(self, a, b):
        """Exponential moving average update: log_gamma * a + (1 - log_gamma) * b, per element."""
        return [a[i].mul(self.log_gamma) + b[i].mul(1 - self.log_gamma) for i in range(len(a))]

    # Disabled running-statistics helpers (they refer to self.model / self.grad,
    # which this class does not define):
    # @torch.no_grad()
    # def step(self):
    #   self.running_grad_mean = self.running_mean(self.running_grad_mean, self.grad)
    #   self.running_grad2_mean = self.running_mean(self.running_grad2_mean, [g**2 for g in self.grad])

    # @torch.no_grad()
    # def grad_stdev(self):
    #   dev = [self.running_grad2_mean[i] - m**2 for i, m in enumerate(self.running_grad_mean)]
    #   return [torch.sqrt(torch.maximum(torch.tensor(0), d)) for d in dev]

<IPython.core.display.Javascript object>

### Trainer


In [None]:
class NN_Trainer:
    """
    PPO-clip trainer for an actor/critic pair over a fixed `Trajectories` dataset.

    The actor scores every state in a queue (one logit per queue row); the
    policy is the softmax over a queue. The critic estimates state values and
    is trained with a TD(0) loss against a periodically refreshed target copy.
    """

    def __init__(
        self,
        NN_setup,
        logger=None,
        trajectories=None,
        batch_size=batch_size,
        n_batches=1000,
        target_update_steps = 20,
        td_gamma=td_gamma,
        ):
        """
        Args:
            NN_setup: dict with keys 'actor', 'actor_opt', 'critic', 'critic_opt'.
            logger: object providing `timer`, `between_logs`, `link_models`,
                `list_norm` and the grad/weight snapshots; required by
                `get_each_loss` and `learn_new_policy` despite the None default.
            trajectories: dataset providing `sample_batch(batch_size)`.
            batch_size: passed to `trajectories.sample_batch`.
            n_batches: number of optimisation steps per `learn_new_policy` call.
            target_update_steps: the target critic is re-copied from the critic
                every this many batches.
            td_gamma: discount factor of the TD target.
        """
        self.n_batches = n_batches
        self.batch_number = -1
        self.td_gamma = td_gamma
        self.clip_eps = 2e-1
        self.actor = NN_setup['actor'].train()
        self.actor_opt = NN_setup['actor_opt']
        self.prev_actor = copy.deepcopy(self.actor).eval()
        self.critic = NN_setup['critic'].train()
        self.target_critic = copy.deepcopy(self.critic).eval()
        self.critic_opt = NN_setup['critic_opt']
        self.trajectories = trajectories
        self.batch_size = batch_size
        self.target_update_steps = target_update_steps
        self.logger = logger
        self.log = {}

    def get_each_loss(self,
                 features,
                 features_next,
                 rewards,
                 Returns,
                 is_last,
                 queues,
                 bins,
      ):
        """
        Computes losses for actor, critic and exploration (loss_ent) within PPO algorithm.
        Decisions were made to avoid python loops at all costs --
        varying action space is not particularly batch-friendly.

        The arguments are the tuple returned by `Trajectories.sample_batch`:
        per-state tensors, all queues concatenated row-wise, and `bins`, the
        cumulative row offsets that delimit each state's queue.

        Side effects: writes statistics into `self.log` and accumulates
        wall-clock timings into `self.logger.timer` (its keys must already
        exist; `learn_new_policy` initialises them).

        Returns:
            (loss_a, loss_c, loss_ent): clipped PPO actor loss, mean squared TD
            error of the critic, and minus the mean normalised policy entropy.
        """
        logger = self.logger
        self.log['Returns mean'] = Returns.mean().item()
        self.log['Return std'] = Returns.std().item()
        self.log['rewards mean'] = rewards.mean().item()

        # squeeze(-1) only drops the trailing size-1 output dimension, so a
        # batch with a single row keeps its batch dimension.
        t_logits = time()
        logits = self.actor(queues).squeeze(-1)
        with torch.no_grad():
            prev_logits = self.prev_actor(queues).squeeze(-1) # can do once in epoch
        logger.timer['logits'] += time()-t_logits

        t_values = time()
        values = self.critic(features).squeeze(-1)
        with torch.no_grad():
            next_values = self.target_critic(features_next).squeeze(-1)
        logger.timer['values'] += time()-t_values

        # Logit of the state that was actually chosen from each queue.
        t_logits_chosen = time()
        logits_chosen = self.actor(features_next).squeeze(-1) # can export
        with torch.no_grad():
            prev_logits_chosen = self.prev_actor(features_next).squeeze(-1) # can export
        logger.timer['logits chosen'] += time()-t_logits_chosen

        self.log['V-func mean'] = torch.mean(values.detach()).item()
        self.log['V-func stdev'] = torch.std(values.detach()).item()
        hist = wandb.Histogram(np_histogram=np.histogram(values.detach().to('cpu'), bins=40, ))
        self.log['V-func hist'] = hist

        # critic loss
        TD = values - (rewards + next_values * self.td_gamma * (1-is_last))
        MC = (values - Returns).abs().mean()/10  # logged only, not part of the loss
        loss_c = (TD**2).mean() # + MC

        self.log['TD loss'] = (TD**2).mean().item()
        self.log['MC loss'] = MC.item()
        # hist = wandb.Histogram(np_histogram=np.histogram(TD.detach().to('cpu'), bins=20, ))
        # self.log['TD hist'] = hist

        t_splitting = time()
        queue_lengths = bins[1:] - bins[:-1]
        # One bulk transfer to Python ints instead of converting a list of
        # 0-dim device tensors one by one inside torch.split.
        split_sizes = queue_lengths.tolist()
        logits_by_state = torch.split(logits, split_sizes)
        prev_logits_by_state = torch.split(prev_logits, split_sizes)
        logger.timer['splitting'] += time()-t_splitting

        # Pad every queue to the longest one with -inf so padded slots get
        # probability 0 under softmax / logsumexp.
        t_padding = time()
        logits_pad = torch.nn.utils.rnn.pad_sequence(logits_by_state,
                                                     padding_value=-float('inf'),
                                                     batch_first=True)
        prev_logits_pad = torch.nn.utils.rnn.pad_sequence(prev_logits_by_state,
                                                          padding_value=-float('inf'),
                                                          batch_first=True)
        logger.timer['padding'] += time()-t_padding

        # entropy loss
        t_entropy_loss = time()

        probs_by_state = nn.functional.softmax(logits_pad, dim=-1)
        # The floor inside the log keeps log() finite for zero-probability
        # (padded) slots; do not replace it with p*log(p) on the raw probabilities.
        entropies = - probs_by_state * torch.log(torch.max(torch.tensor(1e-40), probs_by_state))
        # Normalise by log(queue length + 1); the +1 avoids dividing by log(1) = 0.
        entropy_by_state_reg = torch.sum(entropies, dim=-1) / torch.log(queue_lengths + 1).to(device)

        loss_ent = -entropy_by_state_reg.mean()
        logger.timer['entropy loss'] += time()-t_entropy_loss

        # actor loss
        t_actor_loss = time()

        logsexp = torch.logsumexp(logits_pad, dim=-1)
        prev_logsexp = torch.logsumexp(prev_logits_pad, dim=-1)
        probs_chosen = (logits_chosen - logsexp).exp()
        prev_probs_chosen = (prev_logits_chosen - prev_logsexp).exp()

        ratios = probs_chosen / (prev_probs_chosen.detach()+1e-9)
        clipped = torch.clip(ratios, min=1-self.clip_eps, max=1+self.clip_eps)
        # Advantage = TD target - value; zeroed on terminal steps.
        Adv = - (TD * (1-is_last)).detach()
        loss_a = - torch.min(ratios*Adv, clipped*Adv).mean()
        logger.timer['actor loss'] += time()-t_actor_loss

        return loss_a, loss_c, loss_ent

    def learn_new_policy(self, ):
        """
        Implements one learning cycle over collected dataset.

        Freezes a copy of the current actor as the "previous" policy, then runs
        `n_batches` optimisation steps on sampled batches, logging to wandb
        whenever the batch number is a multiple of `logger.between_logs + 1`.
        Timings in `logger.timer` are plain wall-clock differences.
        """
        self.prev_actor = copy.deepcopy(self.actor).eval()
        logger=self.logger
        logger.timer = {'logits': 0,
                        'values': 0,
                        'logits chosen': 0,
                        'entropy loss': 0,
                        'actor loss': 0,
                        'total loss': 0,
                        'optimizers step': 0,
                        'optimizers no step': 0,
                        'slice (x2)': 0,
                        'logsexp': 0,
                        'sample batch': 0,
                        'splitting': 0,
                        'padding': 0,
                        }
        for i in trange(self.n_batches):
            self.batch_number = i
            if self.batch_number % self.target_update_steps == 0:
                self.target_critic = copy.deepcopy(self.critic).eval()
            t_total_loss = time()

            t_sample_batch = time()
            sampled_batch = self.trajectories.sample_batch(self.batch_size)
            logger.timer['sample batch'] += time() - t_sample_batch

            loss_a, loss_c, loss_ent = self.get_each_loss(*sampled_batch)
            loss = loss_a + loss_c + loss_ent/50
            logger.timer['total loss'] += time()-t_total_loss

            self.critic_opt.zero_grad()
            self.actor_opt.zero_grad()

            t_optimizers_no_step = time()
            loss.backward()
            logger.timer['optimizers no step'] += time()-t_optimizers_no_step

            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 20)
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 20)
            t_optimizers_step = time()
            self.actor_opt.step()
            self.critic_opt.step()
            logger.timer['optimizers step'] += time()-t_optimizers_step

            # Refresh the logger's gradient/weight snapshots for the norms below.
            logger.link_models()

            if self.batch_number % (logger.between_logs+1) == 0:
                self.log.update({
                    'loss actor': loss_a.item(),
                    'loss critic': loss_c.item(),
                    'entropy by log(n)': loss_ent.item(),
                    'grad actor L2': logger.list_norm(logger.grad_a, 2),
                    'grad critic L2': logger.list_norm(logger.grad_c, 2),
                    'weight actor L2': logger.list_norm(logger.weight_a, 2),
                    'weight critic L2': logger.list_norm(logger.weight_c, 2),
                    # 'some_feature_mean': features[:, self.trajectories.feature_names2ids['some_feature']].mean()
                })
                wandb.log(self.log)
                self.log = {}

<IPython.core.display.Javascript object>

### Procedure

In [None]:
epochs = 30
# NOTE: the wandb run name below is parsed from the source text of this line
# (inspect.getsourcelines on the lambda), so keep its formatting stable.
eval_conditions=[(lambda x: x%5==0)]
td_gammas = [0.99]


def _load_trajectories(eval_condition, td_gamma):
    """Load the dataset currently stored at `json_path`."""
    return Trajectories(json_path,
                        eval_condition=eval_condition,
                        td_gamma=td_gamma,
                        )


def _log_dataset_quality(trajectories):
    """Log return statistics of the validation split and of the training split."""
    trajectories.evaluate_data()
    trajectories.evaluate_data(wandb_prefix='train',
                               eval_condition=(lambda x: not trajectories.eval_condition(x)),
                               )


for (eval_condition, td_gamma) in zip(eval_conditions, td_gammas):
    run = wandb.init(
          project="PS PPO",
          name=f'val {inspect.getsourcelines(eval_condition)[0][0].split("x")[-1].split(")")[0]}',
          config={
              'algorithm': 'PPO-clip',
              'models': 'mlp',
          }
    )

    # first we evaluate BFS heuristic (not to be confused with naive BFS)
    # gather dataset: the exported model is removed before this jar run
    time_before = time()
    os.system('rm -f ../Game_env/model.onnx')
    os.system(jar_command)
    print('BFS data gathering time:', time() - time_before)

    trajectories = _load_trajectories(eval_condition, td_gamma)
    _log_dataset_quality(trajectories)

    # then collect new json data file using randomly initialized policy neural network
    actor, actor_opt = get_model_setup(use_FFM=True, wd=0.001)
    critic, critic_opt = get_model_setup(use_FFM=True,)
    nn_setup = {'actor': actor, 'actor_opt': actor_opt,
                'critic': critic, 'critic_opt': critic_opt,}
    trajectories.update_data_on_path(path=json_path, model=actor)

    for epoch in range(epochs):
        # update dataset: reload what the latest jar run produced
        trajectories = _load_trajectories(eval_condition, td_gamma)
        print(trajectories.get_properties())
        logger = Logger(NN_setup=nn_setup,
                        batch_size=batch_size,
                        between_logs = 50,
                        )
        trainer = NN_Trainer(NN_setup=nn_setup,
                             logger=logger,
                             trajectories=trajectories,
                             batch_size=batch_size,
                             n_batches=1000,
                             td_gamma=td_gamma,
                             )

        _log_dataset_quality(trajectories)
        trainer.learn_new_policy()
        print(logger.timer)

        wandb.log({'epoch': epoch,
                })

        time_before = time()
        trajectories.update_data_on_path(path=json_path, model=trainer.actor)
        print('Data gathering time: ', time()-time_before)

    # The dataset on disk was already gathered with the final actor (last
    # iteration above, or the random-policy run when epochs == 0), so no extra
    # jar run is needed. Reload it before the final evaluation: the in-memory
    # `trajectories` still holds the data loaded before that last gathering.
    trajectories = _load_trajectories(eval_condition, td_gamma)
    trajectories.evaluate_data()

    checkpoint = {
    'actor': actor,
    'critic':critic,
    }
    # torch.save(checkpoint, os.path.join(wandb.run.dir, f'mlp for TD multistep'))
    wandb.finish()

<IPython.core.display.Javascript object>



verbose: False, log level: Level.ERROR



247 [main] INFO org.jooq.aR - 
                                      
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@  @@        @@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@        @@@@@@@@@@
@@@@@@@@@@@@@@@@  @@  @@    @@@@@@@@@@
@@@@@@@@@@  @@@@  @@  @@    @@@@@@@@@@
@@@@@@@@@@        @@        @@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@        @@        @@@@@@@@@@
@@@@@@@@@@    @@  @@  @@@@  @@@@@@@@@@
@@@@@@@@@@    @@  @@  @@@@  @@@@@@@@@@
@@@@@@@@@@        @@  @  @  @@@@@@@@@@
@@@@@@@@@@        @@        @@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  Thank you for using jOOQ 3.14.16
                                      


{'queue max length, idx': (111, 4666), 'total number of states': 29145, 'number of traj-s': 574, 'number of validation traj-s': 111}


100%|███████████████████████████████████████| 1000/1000 [00:47<00:00, 21.22it/s]


{'logits': 0.7530734539031982, 'values': 0.5986373424530029, 'logits chosen': 0.5661764144897461, 'entropy loss': 0.5391933917999268, 'actor loss': 0.3875000476837158, 'total loss': 29.308507919311523, 'optimizers step': 0.7897698879241943, 'optimizers no step': 13.039979457855225, 'slice (x2)': 0, 'logsexp': 0, 'sample batch': 8.324571132659912, 'splitting': 7.892240524291992, 'padding': 6.2383856773376465}
verbose: False, log level: Level.ERROR



252 [main] INFO org.jooq.aR - 
                                      
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@  @@        @@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@        @@@@@@@@@@
@@@@@@@@@@@@@@@@  @@  @@    @@@@@@@@@@
@@@@@@@@@@  @@@@  @@  @@    @@@@@@@@@@
@@@@@@@@@@        @@        @@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@        @@        @@@@@@@@@@
@@@@@@@@@@    @@  @@  @@@@  @@@@@@@@@@
@@@@@@@@@@    @@  @@  @@@@  @@@@@@@@@@
@@@@@@@@@@        @@  @  @  @@@@@@@@@@
@@@@@@@@@@        @@        @@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  Thank you for using jOOQ 3.14.16
                                      


In [None]:
# exit()

### Side Utils and commented

In [None]:
# def R_dif_max():
#     TrsBFS = Trajectories(path='../Data/BFS_dataset.json', eval_condition=(lambda x:False))
#     TrsNN = Trajectories(eval_condition=(lambda x:False))

#     Returns_BFS = TrsBFS.evaluate_data(factors=[1], eval_condition=(lambda x: True), verbose=False)['Returns']
#     Returns_NN = TrsNN.evaluate_data(factors=[1], eval_condition=(lambda x: True), verbose=False)['Returns']

#     Returns_dif = torch.Tensor(Returns_BFS) - torch.Tensor(Returns_NN)
#     max_idx = torch.argmax(Returns_dif)
#     max_dif = Returns_dif[max_idx]
#     assert TrsBFS.j_file['paths'][max_idx][2]==TrsNN.j_file['paths'][max_idx][2], 'wtf'
#     return TrsNN.j_file['paths'][max_idx][2], TrsBFS.j_file['paths'][max_idx][2], max_idx, max_dif, Returns_BFS[max_idx], Returns_NN[max_idx], len(Returns_dif)

# name_max, *a = R_dif_max()
# R_dif_max()

In [None]:
#@title Fit a tree

# Features, _, _, R, _ =  Trajectories(json_path).trs_tensors
# r_tree.fit(Features.to('cpu'), R.to('cpu'))
# R_prediction = r_tree.predict(Features.to('cpu'))
# print(f'leaves: {r_tree.get_n_leaves()}, number of states: {Trajectories(json_path).n_sarsa_pairs}, depth: {totalr_tree.get_depth()}')
# torch.mean((R.to('cpu') - torch.Tensor(R_prediction))**2)