In [1]:
import argparse
import os
import random
import time
from distutils.util import strtobool
import wandb
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

In [2]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

### Hyperparameter setting

In [3]:
args = {
    'seed':42,
    'exp_name': 'Test',
    'torch_deterministic':True,
    'cuda':False,
    'track': True,
    'wandb_project_name': 'Test',
    'wandb_entity': None,
    'capture_video': False,
    'env_id': "CartPole-v1",
    'total_timesteps':int(5e3),
    'learning_rate': 2.5e-4,
    'num_envs' : 4,
    'num_steps': 128,
    'anneal_lr': True,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'num_minibatches': 4,
    'update_epochs': 4,
    'norm_adv': True,
    'clip_coef': 0.2,
    'clip_vloss': True,
    'ent_coef': 0.01,
    'vf_coef': 0.5,
    'max_grad_norm': 0.5,
    'target_kl': None
    
}
args = dotdict(args)

args.batch_size = int(args.num_envs * args.num_steps)
args.minibatch_size = int(args.batch_size // args.num_minibatches)

print(args.minibatch_size)

128


### Sweep config

In [4]:
sweep_config = {
    'method': 'random',
    'early_terminate':{
        'type':'hyperband',
        'eta':2,
        'min_iter':90,
        's':3
    }
    }

metric = {
    'name': 'charts/episodic_return',
    'goal': 'maximize',
    'target':40
    }

sweep_config['metric'] = metric

In [5]:
parameters_dict = {
    'ent_coef': {
        'values': [0.00001, 0.0001, 0.001, 0.01]
    },
    # 'policy_layer_size': {
    #     'values': [64, 128, 256]
    # },
    # 'value_layer_size': {
    #     'values': [64, 128, 256]
    # },
    'vf_coef':{
        'values':[0.25,0.5,0.75]
    },
    'n_epochs':{
        'values':[5,10,15]
    }
}
parameters_dict.update({
    'learning_rate': {
        # a flat distribution between 0 and 0.1
        'distribution': 'uniform',
        'min': 0,
        'max': 0.1
    },
    'batch_size': {
        'distribution': 'q_log_uniform_values',
        'q': 8,
        'min': 100,
        'max': 512
    }
})

In [6]:
sweep_config['parameters']=parameters_dict

In [7]:
sweep_config

{'method': 'random',
 'early_terminate': {'type': 'hyperband', 'eta': 2, 'min_iter': 90, 's': 3},
 'metric': {'name': 'charts/episodic_return',
  'goal': 'maximize',
  'target': 40},
 'parameters': {'ent_coef': {'values': [1e-05, 0.0001, 0.001, 0.01]},
  'vf_coef': {'values': [0.25, 0.5, 0.75]},
  'n_epochs': {'values': [5, 10, 15]},
  'learning_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.1},
  'batch_size': {'distribution': 'q_log_uniform_values',
   'q': 8,
   'min': 100,
   'max': 512}}}

In [8]:
sweep_id = wandb.sweep(sweep_config, project="Test")

Create sweep with ID: y7yt9bgv
Sweep URL: https://wandb.ai/donalexs12/test/sweeps/y7yt9bgv


In [9]:
def make_env(env_id, seed, idx, capture_video, run_name):
    def thunk():
        if capture_video and idx == 0:
            env = gym.make(env_id)
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        else:
            env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        # env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk

In [10]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer


In [11]:
class Agent(nn.Module):
    def __init__(self, envs):
        super().__init__()
        self.critic = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, envs.single_action_space.n), std=0.01),
        )

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        logits = self.actor(x)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(x)

In [12]:
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"

In [13]:
# if args.track:
#     import wandb

#     wandb.init(
#         project=args.wandb_project_name,
#         entity=args.wandb_entity,
#         sync_tensorboard=True,
#         config=vars(args),
#         name=run_name,
#         monitor_gym=True,
#         save_code=True,
#     )



### Env creation and seeding

In [14]:
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

In [15]:
envs = gym.vector.SyncVectorEnv(
    [make_env(args.env_id, idx=i, capture_video=args.capture_video, run_name=run_name, seed=args.seed) for i in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

In [16]:
agent = Agent(envs).to(device)

### Storage setup

In [17]:
obs = torch.zeros((args.num_steps, args.num_envs) + envs.single_observation_space.shape).to(device)
actions = torch.zeros((args.num_steps, args.num_envs) + envs.single_action_space.shape).to(device)
logprobs = torch.zeros((args.num_steps, args.num_envs)).to(device)
rewards = torch.zeros((args.num_steps, args.num_envs)).to(device)
dones = torch.zeros((args.num_steps, args.num_envs)).to(device)
values = torch.zeros((args.num_steps, args.num_envs)).to(device)

### Game start

>ПОПРОБОВАТЬ ВЫНЕСТИ КОНФИГУРИРУЕМЫЕ СВИПОМ ПАРАМЕТРЫ ЗА ПРЕДЕЛЫ ЦИКЛА

### Playing cycle

In [18]:
def train_PPO(config=None):
    with wandb.init(config=config,
                        sync_tensorboard=True,
                        project='SFQ_hyperparam'):

        writer = SummaryWriter(f"runs/{run_name}")
        writer.add_text(
            "hyperparameters",
            "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
        )
        
        config = wandb.config
        
        args.batch_size = config.batch_size
        args.minibatch_size = int(args.batch_size // args.num_minibatches)
        print(args.batch_size)
        global_step = 0
        start_time = time.time()
        next_obs = torch.Tensor(envs.reset()[0]).to(device)
        next_done = torch.zeros(args.num_envs).to(device)
        num_updates = args.total_timesteps // args.batch_size
        optimizer = optim.Adam(agent.parameters(), lr=config.learning_rate, eps=1e-5)
        
        for update in range(1, num_updates + 1):
            # Annealing the rate if instructed to do so.
            if args.anneal_lr:
                frac = 1.0 - (update - 1.0) / num_updates
                lrnow = frac * args.learning_rate
                optimizer.param_groups[0]["lr"] = lrnow
        
            for step in range(0, args.num_steps):
                global_step += 1 * args.num_envs
                obs[step] = torch.Tensor(next_obs).to(device)
                dones[step] = next_done
        
                # ALGO LOGIC: action logic
                with torch.no_grad():
                    action, logprob, _, value = agent.get_action_and_value(next_obs)
                    values[step] = value.flatten()
                actions[step] = action
                logprobs[step] = logprob
        
                # TRY NOT TO MODIFY: execute the game and log data.
                next_obs, reward, done,terminated, info = envs.step(action.cpu().numpy())
                rewards[step] = torch.tensor(reward).to(device).view(-1)
                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
        
                if 'final_info' in info.keys():
                    for item in info['final_info']:
                        if item is not None:
                            print(f"global_step={global_step}, episodic_return={item['episode']['r']}, episode_length = {item['episode']['l']}")
                            writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                            writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                            break
               
                # bootstrap value if not done
                with torch.no_grad():
                    next_value = agent.get_value(next_obs).reshape(1, -1)
                    advantages = torch.zeros_like(rewards).to(device)
                    lastgaelam = 0
                    for t in reversed(range(args.num_steps)):
                        if t == args.num_steps - 1:
                            nextnonterminal = 1.0 - next_done
                            nextvalues = next_value
                        else:
                            nextnonterminal = 1.0 - dones[t + 1]
                            nextvalues = values[t + 1]
                        delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                        advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
                    returns = advantages + values
        
                # flatten the batch
                b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
                b_logprobs = logprobs.reshape(-1)
                b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
                b_advantages = advantages.reshape(-1)
                b_returns = returns.reshape(-1)
                b_values = values.reshape(-1)
        
                # Optimizing the policy and value network
                b_inds = np.arange(args.batch_size)
                clipfracs = []
                for epoch in range(args.update_epochs):
                    np.random.shuffle(b_inds)
                    for start in range(0, args.batch_size, args.minibatch_size):
                        end = start + args.minibatch_size
                        mb_inds = b_inds[start:end]
        
                        _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
                        logratio = newlogprob - b_logprobs[mb_inds]
                        ratio = logratio.exp()
        
                        with torch.no_grad():
                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
                            old_approx_kl = (-logratio).mean()
                            approx_kl = ((ratio - 1) - logratio).mean()
                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
        
                        mb_advantages = b_advantages[mb_inds]
                        if args.norm_adv:
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (mb_advantages.std() + 1e-8)
        
                        # Policy loss
                        pg_loss1 = -mb_advantages * ratio
                        pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - args.clip_coef, 1 + args.clip_coef)
                        pg_loss = torch.max(pg_loss1, pg_loss2).mean()
        
                        # Value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
        
                        entropy_loss = entropy.mean()
                        loss = pg_loss - config.ent_coef * entropy_loss + v_loss * config.vf_coef
        
                        optimizer.zero_grad()
                        loss.backward()
                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
                        optimizer.step()
        
                    if args.target_kl is not None:
                        if approx_kl > args.target_kl:
                            break
        
                y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
                var_y = np.var(y_true)
                explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
        
                # TRY NOT TO MODIFY: record rewards for plotting purposes
                writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
                writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
                writer.add_scalar("losses/policy_loss", pg_loss.item(), global_step)
                writer.add_scalar("losses/entropy", entropy_loss.item(), global_step)
                writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
                writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
                writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
                writer.add_scalar("losses/explained_variance", explained_var, global_step)
                #print("SPS:", int(global_step / (time.time() - start_time)))
                writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
    envs.close()
    writer.close()

In [19]:
wandb.agent(sweep_id, train_PPO, count=5)

[34m[1mwandb[0m: Agent Starting Run: gc1omplg with config:
[34m[1mwandb[0m: 	batch_size: 264
[34m[1mwandb[0m: 	ent_coef: 0.01
[34m[1mwandb[0m: 	learning_rate: 0.07505805831385604
[34m[1mwandb[0m: 	n_epochs: 10
[34m[1mwandb[0m: 	vf_coef: 0.5
[34m[1mwandb[0m: Currently logged in as: [33mdonalexs12[0m. Use [1m`wandb login --relogin`[0m to force relogin


264
global_step=44, episodic_return=[11.], episode_length = [11]
global_step=56, episodic_return=[14.], episode_length = [14]
global_step=68, episodic_return=[17.], episode_length = [17]
global_step=72, episodic_return=[18.], episode_length = [18]
global_step=88, episodic_return=[11.], episode_length = [11]
global_step=100, episodic_return=[11.], episode_length = [11]
global_step=120, episodic_return=[12.], episode_length = [12]
global_step=132, episodic_return=[16.], episode_length = [16]
global_step=144, episodic_return=[11.], episode_length = [11]
global_step=148, episodic_return=[15.], episode_length = [15]
global_step=192, episodic_return=[15.], episode_length = [15]
global_step=196, episodic_return=[12.], episode_length = [12]
global_step=232, episodic_return=[28.], episode_length = [28]
global_step=240, episodic_return=[24.], episode_length = [24]
global_step=256, episodic_return=[16.], episode_length = [16]
global_step=288, episodic_return=[14.], episode_length = [14]
global_st

0,1
charts/SPS,▃▃▁▂▂▂▂▂▃▃▄▄▄▄▄▃▃▃▃▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇█
charts/episodic_length,▁▁▁▁▂▁▁▁▁▁▂▁▁▂▄▃▃▃▅▄▄▆▇▄▃▆▂▄▄▄▄▄▅▆▅▅▅▆██
charts/episodic_return,▁▁▁▁▂▁▁▁▁▁▂▁▁▂▄▃▃▃▅▄▄▆▇▄▃▆▂▄▄▄▄▄▅▆▅▅▅▆██
charts/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
losses/approx_kl,█▁▂▂▂▁▂▂▄▂▁▁▂▁▁▂▁▂▂▁▁▂▂▁▁▂▁▁▂▂▁▁▂▁▁▁▁▂▁▁
losses/clipfrac,█▂▃▄▄▂▅▂▄▃▂▂▄▃▂▂▂▃▃▂▂▃▃▂▂▂▂▂▃▂▂▂▂▂▂▂▁▃▁▁
losses/entropy,███▇▇▆▄▅▄▄▄▄▃▃▄▅▆▅▅▄▅▄▅▅▄▅▃▅▄▃▃▃▃▂▁▃▄▃▃▃
losses/explained_variance,▂▂▁▃▄▄▄▁▂▂▅▆▅▅▅▆▅▄▄▅▅▄▆▆▇▆█▇▇▇█▇▇▇▇▇▅▇▆█
losses/old_approx_kl,█▂▂▂▂▂▁▂▂▂▁▂▁▂▂▂▂▂▂▂▂▂▂▂▁▂▂▂▂▂▂▁▂▂▁▂▁▂▂▂

0,1
charts/SPS,65.0
charts/episodic_length,183.0
charts/episodic_return,183.0
charts/learning_rate,1e-05
global_step,9216.0
losses/approx_kl,0.00317
losses/clipfrac,0.06913
losses/entropy,0.54428
losses/explained_variance,0.97032
losses/old_approx_kl,-0.00316


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [1]:
class SelfImitation():
    def __init__(self,model_obs,model_values,model_entropy,value,neg_log_prob,action_space
                ,reward,obs,n_envs,batch_size,n_updates,clip,w_value,w_entropy,
                max_steps,gamma,max_nlogp,min_batch_size,stack,alpha,beta):
            self.model_ob = model_obs
            self.model_vf = model_values
            self.model_entropy = model_entropy
            self.fn_value = value
            self.fn_neg_log_prob = neg_log_prob
            self.fn_reward = reward
            self.fn_obs = obs
    
            self.beta = beta
            self.buffer = PrioritizedReplayBuffer(max_steps, alpha)
            self.n_env = n_env
            self.batch_size = batch_size
            self.n_update = n_update
            self.clip = clip
            self.w_loss = 1.0
            self.w_value = w_value
            self.w_entropy = w_entropy
            self.max_steps = max_steps
            self.gamma = gamma
            self.max_nlogp = max_nlogp
            self.min_batch_size = min_batch_size
    
            self.stack = stack
            self.train_count = 0
            self.update_count = 0
            self.total_steps = []
            self.total_rewards = []
            self.running_episodes = [[] for _ in range(n_env)]

            self.build_loss_op()

        def build_loss_op(self, params, optim, lr, max_grad_norm=0.5):
            mask = torch.where(self.R - torch.squeeze(self.model_vf) > 0.0,
                               torch.ones_like(self.R),
                              torch.zeros_like(self.R))
            self.num_valid_samples = torch.sum(mask)
            self.num_samples = torch.max(self.num_valid_samples, self.min_batch_size)
            