In [1]:
import argparse
import os
import random
import time
from distutils.util import strtobool
import wandb
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from SFQenv import SFQ, max_sequence_length
from SFQ_calc import reward_calculation #, reward_calculation_slow , u_matrix
import multiprocessing as mp
from pprint import pprint

In [2]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [3]:
# reward_calculation_jit = jit(reward_calculation)

### Hyperparameter setting

In [4]:
# args = {
#     'seed':42,
#     'exp_name': 'Test',
#     'torch_deterministic':True,
#     'cuda':False,
#     'track': True,
#     'wandb_project_name': 'Test',
#     'wandb_entity': None,
#     'capture_video': False,
#     'env_id': "CartPole-v1",
#     'total_timesteps':500000,
#     'learning_rate': 2.5e-4,
#     'num_envs' : 4,
#     'num_steps': 125,
#     'anneal_lr': True,
#     'layer_size':64,
#     'value_layer_size': 64,
#     'gamma': 0.99,
#     'gae_lambda': 0.95,
#     'num_minibatches': 12,
#     'update_epochs': 5,
#     'norm_adv': True,
#     'clip_coef': 0.2,
#     'clip_vloss': True,
#     'ent_coef': 0.01,
#     'vf_coef': 0.5,
#     'max_grad_norm': 0.5,
#     'target_kl': None
    
# }
# args = dotdict(args)


In [5]:
default_config = {
    'torch_deterministic': True,
    'clip_coef': 0.2,
    'anneal_lr': True,
    'num_envs': 200,
    'num_minibatches': 4,
    'num_steps': 128,
    'max_grad_norm': 0.5,
    'clip_vloss' : True,
    'seed': 42,
    'env_id': 'SFQ',
    'track': True,
    'wandb_project_name': 'Test',
    'exp_name': 'hyperparam_search',
    'norm_adv': True,
    'target_kl': None,
    
}
default_config = dotdict(default_config)

### Sweep config

In [6]:
sweep_config = {
    'method': 'random',
    # 'early_terminate':{
    #     'type':'hyperband',
    #     'min_iter':1000,
    # }
    }

metric = {
    'name': "charts/episodic_return",
    'goal': 'maximize',
    # 'target':1
    }

sweep_config['metric'] = metric

In [7]:
parameters_dict = {
    'ent_coef': {
        'values':  [0.00001, 0.0001, 0.001, 0.01]# [0.0001]
    },
    'layer_size': {
        'values':  [64, 128, 256] #[256]
    },
    'vf_coef':{
        # 'values': [0.25,0.5,0.75] #[0.75]
        'distribution': 'q_uniform',
        'min': 0.25,
        'max': 0.75,
        'q': 0.05
    },
    'update_epochs':{
        # 'values': [4,5,7,10,12,15] #[7]
        'distribution': 'q_uniform',
        'min': 3,
        'max': 10,
        'q': 1
    },
    'gamma':{
        # 'values': [0.99,0.95,0.999,0.9] #[0.9]
        'distribution': 'q_uniform',
        'min': 0.9,
        'max': 0.99,
        'q': 0.01
    },
    'gae_lambda':{
        # 'values': [0.95,0.99,0.9] #[0.9]
        'distribution': 'q_uniform',
        'min': 0.9,
        'max': 0.99,
        'q': 0.01
    },
    # 'max_grad_norm' : {
    #     'values': [0.5]
    # },
    # 'num_envs': {
    #     'values':[4]
    # },
    # 'num_steps': {
    #     'value':128
    # },
    # 'anneal_lr':{
    #     'value': True
    # },
    # 'torch_deterministic':{
    #     'value' : False #True
    # },
    # 'clip_coef':{
    #     'value': 0.2
    # },
    #     'clip_vloss': {
    #     'value': True
    # }
}
parameters_dict.update({
    'learning_rate': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0,
        'max': 0.001
        # 'value': 2.3e-4
    },
    # 'batch_size': {
    #     'distribution': 'q_log_uniform_values',
    #     'q': 8,
    #     'min': 100,
    #     'max': 512
    # },
    'total_timesteps':{
        'distribution': 'q_uniform',
        'q': 50000,
        'min': 100000,
        'max': 150000
        # 'value':150000
    },
})

In [8]:
sweep_config['parameters']=parameters_dict

In [9]:
sweep_config

{'method': 'random',
 'metric': {'name': 'charts/episodic_return', 'goal': 'maximize'},
 'parameters': {'ent_coef': {'values': [1e-05, 0.0001, 0.001, 0.01]},
  'layer_size': {'values': [64, 128, 256]},
  'vf_coef': {'distribution': 'q_uniform',
   'min': 0.25,
   'max': 0.75,
   'q': 0.05},
  'update_epochs': {'distribution': 'q_uniform', 'min': 3, 'max': 10, 'q': 1},
  'gamma': {'distribution': 'q_uniform', 'min': 0.9, 'max': 0.99, 'q': 0.01},
  'gae_lambda': {'distribution': 'q_uniform',
   'min': 0.9,
   'max': 0.99,
   'q': 0.01},
  'learning_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.001},
  'total_timesteps': {'distribution': 'q_uniform',
   'q': 50000,
   'min': 100000,
   'max': 150000}}}

In [10]:
sweep_id = wandb.sweep(sweep=sweep_config, project="SFQenv")

Create sweep with ID: z2cwk0n6
Sweep URL: https://wandb.ai/donalexs12/SFQenv/sweeps/z2cwk0n6


In [11]:
def make_env():
    def thunk():
        env = SFQ()
        env = gym.wrappers.RecordEpisodeStatistics(env)
        # env.seed(seed)
        return env

    return thunk

In [12]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


### Agent (neural network) class

In [13]:
class Agent(nn.Module):
    def __init__(self, envs, layer_size):
        super().__init__()
        self.base = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), layer_size)),
            nn.Tanh(),
            layer_init(nn.Linear(layer_size, layer_size)),
            nn.Tanh(),
            
        )
        self.lstm = nn.LSTM(layer_size, layer_size)
        for name, param in self.lstm.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0)
            elif "weight" in name:
                nn.init.orthogonal_(param, 1.0)
        self.critic = layer_init(nn.Linear(layer_size, 1), std=1.0)
        self.actor = layer_init(nn.Linear(layer_size, envs.single_action_space.n), std=0.01)

    def get_states(self, x, lstm_state, done):
        hidden = self.base(x)
        
        batch_size = lstm_state[0].shape[1]
        hidden = hidden.reshape((-1, batch_size, self.lstm.input_size))
        done = done.reshape((-1, batch_size))
        new_hidden = []
        for h, d in zip(hidden, done):
            h, lstm_state = self.lstm(
                h.unsqueeze(0),
                (
                    (1.0 - d).view(1, -1, 1) * lstm_state[0],
                    (1.0 - d).view(1, -1, 1) * lstm_state[1],
                ),
            )
            new_hidden += [h]
        new_hidden = torch.flatten(torch.cat(new_hidden), 0, 1)
        return new_hidden, lstm_state

    def get_value(self, x, lstm_state, done):
        hidden, _ = self.get_states(x, lstm_state, done)
        return self.critic(hidden)

    def get_action_and_value(self, x, lstm_state,done,action=None):
        hidden, lstm_state = self.get_states(x, lstm_state, done)
        logits = self.actor(hidden)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden), lstm_state

In [14]:
run_name = f"{default_config.env_id}__{default_config.exp_name}__{default_config.seed}__{int(time.time())}"

### Env creation and seeding

In [15]:
random.seed(default_config.seed)
np.random.seed(default_config.seed)
torch.manual_seed(default_config.seed)
torch.backends.cudnn.deterministic = default_config.torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
envs = gym.experimental.vector.SyncVectorEnv(
    [make_env() for i in range(default_config.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [17]:
agent = None # will be declared later in the code

### Storage setup

In [18]:
obs = torch.zeros((default_config.num_steps, default_config.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((default_config.num_steps, default_config.num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((default_config.num_steps, default_config.num_envs)).to(device)
rewards = torch.zeros((default_config.num_steps, default_config.num_envs)).to(device)
dones = torch.zeros((default_config.num_steps, default_config.num_envs)).to(device)
values = torch.zeros((default_config.num_steps, default_config.num_envs)).to(device)

### Game start

>ПОПРОБОВАТЬ ВЫНЕСТИ КОНФИГУРИРУЕМЫЕ СВИПОМ ПАРАМЕТРЫ ЗА ПРЕДЕЛЫ ЦИКЛА

### Playing cycle

In [19]:
def train_PPO(config=default_config):
    with wandb.init(config=config,
                        sync_tensorboard=True,
                        project='SFQ_hyperparam'):

        writer = SummaryWriter(f"runs/{run_name}")
        writer.add_text(
            "hyperparameters",
            "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(config).items()])),
        )

        config = wandb.config
        
        config.batch_size = int(config.num_envs * config.num_steps)
        config.minibatch_size = int(config.batch_size // default_config.num_minibatches)
        agent = Agent(envs, config.layer_size).to(device)
        # agent = Agent(envs).to(device)

        global_step = 0
        start_time = time.time()
        
        next_done = torch.zeros(config.num_envs).to(device)
        next_obs = torch.Tensor(envs.reset()[0]).to(device)
        num_updates = config.total_timesteps // config.batch_size
        next_lstm_state = (
        torch.zeros(agent.lstm.num_layers, default_config.num_envs, agent.lstm.hidden_size).to(device),
        torch.zeros(agent.lstm.num_layers, default_config.num_envs, agent.lstm.hidden_size).to(device),)  
        # hidden and cell states (see https://youtu.be/8HyCNIVRbSU)
        optimizer = optim.Adam(agent.parameters(), lr=config.learning_rate, eps=1e-5)

        total_return = 0.9
        for update in range(1, num_updates + 1):
            initial_lstm_state = (next_lstm_state[0].clone(), next_lstm_state[1].clone())
            # Annealing the rate if instructed to do so.
            if default_config.anneal_lr:
                frac = 1.0 - (update - 1.0) / num_updates
                lrnow = frac * config.learning_rate
                optimizer.param_groups[0]["lr"] = lrnow
        
            for step in range(0, config.num_steps):
                global_step += 1 * config.num_envs
                obs[step] = torch.Tensor(next_obs).to(device)
                dones[step] = next_done
        
                # ALGO LOGIC: action logic
                with torch.no_grad():
                    action, logprob, _, value, next_lstm_state = agent.get_action_and_value(next_obs,
                                                                                           next_lstm_state,
                                                                                           next_done)
                    values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob
        
                # TRY NOT TO MODIFY: execute the game and log data.
                next_obs, reward, done,terminated, info = envs.step(action.cpu().numpy())
                next_done = np.logical_or(done,terminated)
                rewards[step] = torch.tensor(reward).to(device).view(-1)
                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(next_done).to(device)

                
                if 'final_info' in info.keys():
                    for item in info['final_info']:
                        if item is not None:
                            #print(f"global_step={global_step}, episodic_return={item['episode']['r']}, episode_length = {max_sequence_length}")
                            writer.add_scalar("charts/fidelity", item['fidelity'], global_step)
                            current_return = item['fidelity']
                            writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                            writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                            if current_return > total_return:
                                total_return = current_return
                                # torch.save(agent.state_dict(),f'model_fid{current_return:.3f}.pt')
                            break
               
                # bootstrap value if not done
            with torch.no_grad():
                next_value = agent.get_value(next_obs,
                                            next_lstm_state,
                                            next_done).reshape(1, -1)
                advantages = torch.zeros_like(rewards).to(device)
                lastgaelam = 0
                for t in reversed(range(config.num_steps)):
                    if t == config.num_steps - 1:
                        nextnonterminal = 1.0 - next_done
                        nextvalues = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        nextvalues = values[t + 1]
                    delta = rewards[t] + config.gamma * nextvalues * nextnonterminal - values[t]
                    advantages[t] = lastgaelam = delta + config.gamma * config.gae_lambda * nextnonterminal * lastgaelam
                returns = advantages + values
    
            # flatten the batch
            b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
            b_logprobs = logprobs.reshape(-1)
            b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
            b_dones = dones.reshape(-1)
            b_advantages = advantages.reshape(-1)
            b_returns = returns.reshape(-1)
            b_values = values.reshape(-1)
    
            # Optimizing the policy and value network
            assert default_config.num_envs % default_config.num_minibatches == 0
            envsperbatch = default_config.num_envs // default_config.num_minibatches
            envinds = np.arange(default_config.num_envs)
            flatinds = np.arange(config.batch_size).reshape(config.num_steps, default_config.num_envs)
            # b_inds = np.arange(config.batch_size)
            clipfracs = []
            for epoch in range(config.update_epochs):
                np.random.shuffle(envinds)
                for start in range(0, default_config.num_envs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mb_inds = flatinds[:, mbenvinds].ravel()  # be really careful about the index
                    
                    _, newlogprob, entropy, newvalue, _ = agent.get_action_and_value(
                        b_obs[mb_inds], 
                        (initial_lstm_state[0][:, mbenvinds],initial_lstm_state[1][:,mbenvinds]),
                        b_dones[mb_inds],
                        b_actions.long()[mb_inds]
                    )
                    logratio = newlogprob - b_logprobs[mb_inds]
                    ratio = logratio.exp()
    
                    with torch.no_grad():
                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
                        old_approx_kl = (-logratio).mean()
                        approx_kl = ((ratio - 1) - logratio).mean()
                        clipfracs += [((ratio - 1.0).abs() > config.clip_coef).float().mean().item()]
    
                    mb_advantages = b_advantages[mb_inds]
                    if config.norm_adv:
                        mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)
    
                    # Policy loss
                    pg_loss1 = -mb_advantages * ratio
                    pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - config.clip_coef, 1 + config.clip_coef)
                    pg_loss = torch.max(pg_loss1, pg_loss2).mean()
    
                    # Value loss
                    newvalue = newvalue.view(-1)
                    if config.clip_vloss:
                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                        v_clipped = b_values[mb_inds] + torch.clamp(
                            newvalue - b_values[mb_inds],
                            -config.clip_coef,
                            config.clip_coef,
                        )
                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                        v_loss = 0.5 * v_loss_max.mean()
                    else:
                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
    
                    entropy_loss = entropy.mean()
                    loss = pg_loss - config.ent_coef * entropy_loss + v_loss * config.vf_coef
    
                    optimizer.zero_grad()
                    loss.backward()
                    nn.utils.clip_grad_norm_(agent.parameters(), config.max_grad_norm)
                    optimizer.step()
    
                if config.target_kl is not None:
                    if approx_kl > config.target_kl:
                        break
        
            y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
    
            # TRY NOT TO MODIFY: record rewards for plotting purposes
            writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
            writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
            writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
            writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
            writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
            writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
            writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
            writer.add_scalar("losses/explained_variance", explained_var, global_step)
            #print("SPS:", int(global_step / (time.time() - start_time)))
            writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
    envs.close()
    writer.close()

In [None]:
wandb.agent(sweep_id, train_PPO, count=10)

[34m[1mwandb[0m: Agent Starting Run: 9zbfu1fu with config:
[34m[1mwandb[0m: 	ent_coef: 0.001
[34m[1mwandb[0m: 	gae_lambda: 0.92
[34m[1mwandb[0m: 	gamma: 0.93
[34m[1mwandb[0m: 	layer_size: 256
[34m[1mwandb[0m: 	learning_rate: 0.0006576864420961041
[34m[1mwandb[0m: 	total_timesteps: 150000
[34m[1mwandb[0m: 	update_epochs: 5
[34m[1mwandb[0m: 	vf_coef: 0.5
[34m[1mwandb[0m: Currently logged in as: [33mdonalexs12[0m. Use [1m`wandb login --relogin`[0m to force relogin


Traceback (most recent call last):
  File "/tmp/ipykernel_30392/1133302147.py", line 65, in train_PPO
    writer.add_scalar("charts/fidelity", item['fidelity'], global_step)
  File "/home/wolvenanthros/anaconda3/envs/SFQ/lib/python3.10/site-packages/torch/utils/tensorboard/writer.py", line 389, in add_scalar
    from caffe2.python import workspace
ModuleNotFoundError: No module named 'caffe2'


VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
global_step,▁

0,1
global_step,0


Run 9zbfu1fu errored: ModuleNotFoundError("No module named 'caffe2'")
[34m[1mwandb[0m: [32m[41mERROR[0m Run 9zbfu1fu errored: ModuleNotFoundError("No module named 'caffe2'")
[34m[1mwandb[0m: Agent Starting Run: uvmd159y with config:
[34m[1mwandb[0m: 	ent_coef: 0.0001
[34m[1mwandb[0m: 	gae_lambda: 0.91
[34m[1mwandb[0m: 	gamma: 0.95
[34m[1mwandb[0m: 	layer_size: 64
[34m[1mwandb[0m: 	learning_rate: 0.00019099562787261348
[34m[1mwandb[0m: 	total_timesteps: 100000
[34m[1mwandb[0m: 	update_epochs: 9
[34m[1mwandb[0m: 	vf_coef: 0.5


class SelfImitation():
    def __init__(self,model_obs,model_values,model_entropy,value,neg_log_prob,action_space
                ,reward,obs,n_envs,batch_size,n_updates,clip,w_value,w_entropy,
                max_steps,gamma,max_nlogp,min_batch_size,stack,alpha,beta):
            self.model_ob = model_obs
            self.model_vf = model_values
            self.model_entropy = model_entropy
            self.fn_value = value
            self.fn_neg_log_prob = neg_log_prob
            self.fn_reward = reward
            self.fn_obs = obs
    
            self.beta = beta
            self.buffer = PrioritizedReplayBuffer(max_steps, alpha)
            self.n_env = n_env
            self.batch_size = batch_size
            self.n_update = n_update
            self.clip = clip
            self.w_loss = 1.0
            self.w_value = w_value
            self.w_entropy = w_entropy
            self.max_steps = max_steps
            self.gamma = gamma
            self.max_nlogp = max_nlogp
            self.min_batch_size = min_batch_size
    
            self.stack = stack
            self.train_count = 0
            self.update_count = 0
            self.total_steps = []
            self.total_rewards = []
            self.running_episodes = [[] for _ in range(n_env)]

            self.build_loss_op()

        def build_loss_op(self, params, optim, lr, max_grad_norm=0.5):
            mask = torch.where(self.R - torch.squeeze(self.model_vf) > 0.0,
                               torch.ones_like(self.R),
                              torch.zeros_like(self.R))
            self.num_valid_samples = torch.sum(mask)
            self.num_samples = torch.max(self.num_valid_samples, self.min_batch_size)
            