In [97]:
from env_hiv import  *
env = HIVPatient(clipping = True)
def rew(state,action, env) : 
    return-(
                env.Q * state[4]
                + env.R1 * action[0] ** 2
                + env.R2 * action[1] ** 2
                - env.S * state[5]
            )
#low_r, high_r = rew(env.lower, [1,1],env),rew(env.upper, [1,1],env)

import torch
try : 
    torch.multiprocessing.set_start_method('spawn')
except : 
    pass
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
class policyNetwork(nn.Module):
    def __init__(self, env,config):
        super().__init__()
        #self.upper, self.lower =env.upper, env.lower
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state_dim = env.observation_space.shape[0]
        n_action = env.action_space.n
        neurons_list  = [state_dim, config['hsize'],config['hsize'], n_action]
        self.model  =nn.Sequential(
            *[
                nn.Sequential(
                    layer_init(nn.Linear(neurons_list[i], neurons_list[i+1])),
                    nn.LeakyReLU()
                ) 
                for i in range(len(neurons_list)-1)
            ]
        ).to(self.device)

    def forward(self, x):
        if torch.Tensor(x).dim() == 1:
            x = x.unsqueeze(dim=0)
        #x = (x-self.lower)/(self.upper-self.lower)
        action_scores = self.model(torch.Tensor(x).to(self.device, torch.float32))#.log())
        return F.softmax(action_scores,dim=1)

    def sample_action(self, x):
        probabilities = self.forward(x)
        action_distribution = Categorical(probabilities)
        return action_distribution.sample().item()

    def log_prob(self, x, a):
        probabilities = self.forward(x)
        action_distribution = Categorical(probabilities)
        return action_distribution.log_prob(a)

class valueNetwork(nn.Module):
    def __init__(self, env,config):
        super().__init__()
        #self.upper, self.lower =env.upper, env.lower
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state_dim = env.observation_space.shape[0]
        n_action = env.action_space.n
        neurons_list  = [state_dim, config['hsize'],config['hsize'], 1]
        self.model  =nn.Sequential(
            *[
                nn.Sequential(
                    layer_init(nn.Linear(neurons_list[i], neurons_list[i+1])),
                    nn.LeakyReLU()
                ) 
                for i in range(len(neurons_list)-1)
            ]
        ).to(self.device)

    def forward(self, x):
        if torch.Tensor(x).dim() == 1:
            x = x.unsqueeze(dim=0)
        #x = (x-self.lower)/(self.upper-self.lower)
        
        return self.model(torch.Tensor(x).to(self.device, torch.float32))
        
    
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange

class a2c_agent:
    def __init__(self, config, policy_network, value_network):
        self.config = config
        self.device = "cuda" if next(policy_network.parameters()).is_cuda else "cpu"
        self.scalar_dtype = next(policy_network.parameters()).dtype
        self.policy = policy_network.to(self.device)
        self.value = value_network.to(self.device)
        self.gamma = config['gamma'] if 'gamma' in config.keys() else 0.99
        lr = config['learning_rate'] if 'learning_rate' in config.keys() else 0.001
        self.optimizer = torch.optim.AdamW(list(self.policy.parameters()) + list(self.value.parameters()),lr=lr)#, eps = 1e-5, weight_decay= 1e-5)
        self.nb_episodes = config['nb_episodes'] if 'nb_episodes' in config.keys() else 1
        self.entropy_coefficient = config['entropy_coefficient'] if 'entropy_coefficient' in config.keys() else 0.001

    def sample_action(self, x):
        probabilities = self.policy(x)
        action_distribution = Categorical(probabilities)
        action = action_distribution.sample()
        log_prob = action_distribution.log_prob(action)
        entropy = action_distribution.entropy()
        return action, log_prob, entropy
    def one_gradient_step(self, envcreat):
        env = gym.vector.AsyncVectorEnv([envcreat for i in range(self.nb_episodes)])
        episodes_sum_of_rewards = []
        log_probs = [[] for i in range(env.observation_space.shape[0])]
        returns = []
        
        values = [[] for i in range(env.observation_space.shape[0])]
        entropies = [[] for i in range(env.observation_space.shape[0])]
        rewards = [[] for i in range(env.observation_space.shape[0])]
        episode_cum_reward = 0
        x,_ = env.reset()
        for t in range(200) : 
            a, log_prob, entropy = self.sample_action(x)
            y,r,d,trunc,infos = env.step(a.cpu().detach().numpy())
            #r = (r-low_r)/(high_r-low_r)
            V=self.value(torch.from_numpy(x).to(torch.float32))
            for i in range(self.nb_episodes): 
                log_probs[i].append(log_prob[i])
                rewards[i].append(r[i])
                values[i].append(V[i])
                entropies[i].append(entropy[i])
                episode_cum_reward += r
            x=y
            if t == 199 :
                V=self.value(torch.from_numpy(x).to(torch.float32))
                for i in range(self.nb_episodes): 
                    log_probs[i].append(log_prob[i])
                    rewards[i].append(r[i])
                    values[i].append(V[i])
                    entropies[i].append(entropy[i])
                    episode_cum_reward += r
        
        for i in range(self.nb_episodes): 
            # compute returns-to-go
            new_returns = []
            G_t = 0
            for r in reversed(rewards[i]):
                G_t = r + self.gamma * G_t
                new_returns.append(G_t)
            new_returns = list(reversed(new_returns))
            returns.extend(new_returns)
            episodes_sum_of_rewards.append(episode_cum_reward)
        # make loss
        returns = torch.Tensor(returns).to(self.device)
        values = torch.cat([torch.stack(u) for u in values]).to(self.device)
        log_probs = torch.cat([torch.stack(u) for u in log_probs]).to(self.device)
        entropies = torch.cat([torch.stack(u) for u in entropies]).to(self.device)
        advantages = returns - values
        
        #advantages = torch.cat([torch.stack(u) for u in values]).to(self.device)
        #print(advantages.shape, log_probs.shape,values.shape,entropies.shape, returns.shape)
        pg_loss = -(advantages.detach() * log_probs).mean()
        entropy_loss = -entropies.mean()
        critic_loss = advantages.pow(2).mean()
        loss = pg_loss + critic_loss + self.entropy_coefficient * entropy_loss
        # gradient step
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        torch.nn.utils.clip_grad_norm_(self.value.parameters(), 0.5)
        self.optimizer.step()
        return np.mean(episode_cum_reward)/self.nb_episodes, loss.item(),critic_loss, pg_loss

    def train(self, env, nb_rollouts):
        avg_sum_rewards = []
        pbar = trange(nb_rollouts)
        for ep in pbar:
            avg ,loss, critic_loss,pg_loss= self.one_gradient_step(env)
            #avg = avg*(high_r-low_r)+low_r
            pbar.set_postfix(avg_return = avg,loss=loss, critic_loss =critic_loss.item(), pg_loss = pg_loss.item())
            avg_sum_rewards.append(avg)
            frac = 1- (ep-1)/nb_rollouts
            for g in self.optimizer.param_groups:
                g['lr'] =frac * self.config['learning_rate']

        return avg_sum_rewards


In [98]:

import gymnasium as gym
import matplotlib.pyplot as plt
from gymnasium.wrappers import TimeLimit
env = gym.make("LunarLander-v2", render_mode="rgb_array")
#env = gym.make('CartPole-v1', render_mode="rgb_array")
#env = gym.make('Acrobot', render_mode="rgb_array")
#env= HIVPatient()
env = TimeLimit(env , 200)
config = {'gamma': 0.9,
          'learning_rate': 5e-2,
          'nb_episodes': 20,
          'entropy_coefficient': 1e-3,
          "hsize":128
         }

pi = policyNetwork(env,config)
V  = valueNetwork(env,config)
agent = a2c_agent(config, pi, V)
returns = agent.train(lambda : env,300)
plt.plot(returns)

  1%|          | 3/300 [00:05<08:36,  1.74s/it, avg_return=-366, critic_loss=753, loss=741, pg_loss=-12.2]          Traceback (most recent call last):
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "<string>", line 1, in <module>
  File "/home/tordjx/miniconda3/envs/mujoco_py/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
  File "/home/tordjx/miniconda3/envs/mujoco_py/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/tordjx/miniconda3/envs/mujoco_py/lib/python3.8/multiprocessing/spawn.py", line 124, in _main
    preparation_data = reduction.pickle.load(from_parent)
EOFError: Ran out of input
    exitcode = _main(fd, parent_sentinel)
  File "/home/tordjx/miniconda3/envs/mujoco_py/lib/python3.8/multiprocessing/spawn.py", line 124, in _main
    preparation_data = reduction.pickle.load(from_parent)
EOFError: Ran out of input
Process Worker<AsyncVectorEnv>-15:
Process W

KeyboardInterrupt: 

In [67]:
env.observation_space.high

array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)

from ray import train, tune
from ray.tune.search.optuna import OptunaSearch
from gymnasium.wrappers import TimeLimit


def objective(config):
    env = HIVPatient()
    env = TimeLimit(env, 200)
    
    pi = policyNetwork(env, config)
    V = valueNetwork(env, config)
    agent = a2c_agent(config, pi, V)
    while True : 
            ret ,loss, critic_loss,pg_loss= agent.one_gradient_step(lambda : env)
            ret = ret*(high_r-low_r)+low_r
            train.report({"return":ret})
search_space = {
        "gamma": 0.95,
        "learning_rate": tune.loguniform(1e-7, 1e-3),
        "nb_episodes": 5,
        "entropy_coefficient": tune.loguniform(1e-5, 1),
        "hsize": 256
    }
algo = OptunaSearch()  # ②
from ray.train import ScalingConfig

objective_ressources = tune.with_resources(objective, {"gpu":2})
tuner = tune.Tuner(  # ③
    objective_ressources,
    tune_config=tune.TuneConfig(
        metric="return",
        mode="max",
        search_alg=algo,
        num_samples=100,
    ),
    run_config=train.RunConfig(
        stop={"training_iteration": 50},
    ),
    param_space=search_space,
)
results = tuner.fit()
print("Best config is:", results.get_best_result().config)