In [1]:
##classes
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from gymnasium.wrappers import TimeLimit
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer
hsize = 128
class policyNetwork(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state_dim = env.observation_space.shape[1]
        n_action = env.action_space.nvec[0]
        self.fc1 = layer_init(nn.Linear(state_dim, hsize)).to(self.device)
        #self.fc2 = layer_init(nn.Linear(hsize, hsize)).to(self.device)
        self.fc3 = layer_init(nn.Linear(hsize, n_action)).to(self.device)
        self.dropout =nn.Dropout(p=0).to(self.device)
    def forward(self, x):
        if x.dim() == 1:
            x = x.unsqueeze(dim=0)
        x = F.relu(self.fc1(x.to(self.device)))
        x= self.dropout(x)
        #x = F.relu(self.fc2(x))
        action_scores = self.fc3(x)
        return F.softmax(action_scores,dim=1).cpu()

    def sample_action(self, x):
        probabilities = self.forward(x)
        action_distribution = Categorical(probabilities)
        return action_distribution.sample().item()

    def log_prob(self, x, a):
        probabilities = self.forward(x)
        action_distribution = Categorical(probabilities)
        return action_distribution.log_prob(a)
class valueNetwork(nn.Module):
    def __init__(self, env):
        super().__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        state_dim = env.observation_space.shape[1]
        self.fc1 = layer_init(nn.Linear(state_dim, hsize)).to(self.device)
        #self.fc2 = layer_init(nn.Linear(hsize, hsize)).to(self.device)
        self.fc3 = layer_init(nn.Linear(hsize, 1)).to(self.device)
        self.dropout =nn.Dropout(p=0).to(self.device)
    def forward(self, x):
        if x.dim() == 1:
            x = x.unsqueeze(dim=0)
        x = F.relu(self.fc1(x.to(self.device)))
        #x = F.relu(self.fc2(x))
        x= self.dropout(x)
        return self.fc3(x).cpu()
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
from torch.utils.tensorboard import SummaryWriter
maxiter =200
class a2c_agent:
    def __init__(self, config, policy_network,value_network):
        self.device = "cuda" if next(policy_network.parameters()).is_cuda else "cpu"
        self.scalar_dtype = next(policy_network.parameters()).dtype
        self.policy = policy_network
        self.value = value_network
        self.gamma = config['gamma'] if 'gamma' in config.keys() else 0.99
        lr = config['learning_rate'] if 'learning_rate' in config.keys() else 0.001
        self.optimizer = torch.optim.Adam(list(self.policy.parameters()),lr=lr)
        self.entropy_coefficient = config['entropy_coefficient'] if 'entropy_coefficient' in config.keys() else 0.001

    def sample_action(self, x):
        probabilities = self.policy(torch.as_tensor(x))
        action_distribution = Categorical(probabilities)
        action = action_distribution.sample()
        log_prob = action_distribution.log_prob(action)
        entropy = action_distribution.entropy()
        return action, log_prob, entropy
    
    def one_gradient_step(self, env):
        # run trajectories until done
        episodes_sum_of_rewards = []
        log_probs = [[] for i in range(env.observation_space.shape[0])]
        returns = []
        x,_ = env.reset()
        rewards = [[] for i in range(env.observation_space.shape[0])]
        values = [[] for i in range(env.observation_space.shape[0])]
        entropies = [[] for i in range(env.observation_space.shape[0])]
        is_done = torch.zeros(env.observation_space.shape[0])
        is_trunc = torch.zeros(env.observation_space.shape[0])
        episode_cum_reward = 0
        #iters= 0
        while(True):
            #iters+=1
            a, log_prob,entropy = self.sample_action(x)
            y,r,d,trunc,infos = env.step(a.numpy())
            is_done += d
            is_trunc += trunc
            V=self.value(torch.from_numpy(x))
            for i,d in enumerate(is_done): 
                if d ==0 :
                    log_probs[i].append(log_prob[i])
                    rewards[i].append(r[i])
                    values[i].append(V[i])
                    entropies[i].append(entropy[i])
                    episode_cum_reward += r
            x=y
            if all(is_done>0) or all(is_trunc>0) :#or iters>maxiter:
                for i in range(len(is_done)):
                    # compute returns-to-go
                    new_returns = []
                    G_t = 0
                    for r in reversed(rewards[i]):
                        G_t = r + self.gamma * G_t
                        new_returns.append(G_t)
                    new_returns = list(reversed(new_returns))
                    returns.extend(new_returns)
                    episodes_sum_of_rewards.append(episode_cum_reward)
                break
        # make loss
        returns = torch.Tensor(returns)
        values = torch.cat([torch.stack(u) for u in values])
        log_probs = torch.cat([torch.stack(u) for u in log_probs])
        entropies = torch.cat([torch.stack(u) for u in entropies])
        advantages = returns - values
        pg_loss = -(advantages.detach() * log_probs).mean()
        entropy_loss = -entropies.mean()
        critic_loss = advantages.pow(2).mean()
        loss = pg_loss + critic_loss + self.entropy_coefficient * entropy_loss
        # gradient step
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return np.mean(episodes_sum_of_rewards),loss,critic_loss, pg_loss

    def train(self, env, nb_rollouts):
        self.value = self.value.train()
        writer = SummaryWriter("runs/lr1e-2-ortho-128-2layers-entropy-4-timelimit")
        self.policy = self.policy.train()
        avg_sum_rewards = []
        pbar = trange(nb_rollouts)
        for ep in pbar:
            avg ,loss,critic_loss, pg_loss= self.one_gradient_step(env)
            avg = avg/env.observation_space.shape[0]
            pbar.set_postfix(avg_return = avg)
            avg_sum_rewards.append(avg)
            writer.add_scalar("Average return",avg , ep)
            writer.add_scalar("Loss",loss , ep)
            writer.add_scalar("Critic loss",critic_loss , ep)
            writer.add_scalar("PG loss",pg_loss , ep)
        self.value = self.value.eval()
        self.policy = self.policy.eval()
        return avg_sum_rewards


In [2]:
%reload_ext tensorboard
%tensorboard --logdir=runs/

In [3]:
import gymnasium as gym
import matplotlib.pyplot as plt

#env = gym.make("LunarLander-v2", render_mode="rgb_array")
#envs = gym.make_vec("CartPole-v1", num_envs=5)
#envs = gym.make_vec("LunarLander-v2", num_envs=20)
import env_hiv
envs = gym.vector.AsyncVectorEnv([lambda: TimeLimit(env_hiv.HIVPatient(domain_randomization=True),200) for i in range(20)])
#envs = gym.vector.AsyncVectorEnv([lambda: gym.make("Acrobot", render_mode="rgb_array") for i in range(20)])
config = {'gamma': .99,
          'learning_rate': 0.01,
          "entropy_coefficient":1e-4
         }
value = valueNetwork(envs)
pi = policyNetwork(envs)
agent = a2c_agent(config, pi,value)
returns = agent.train(envs,1000)
plt.plot(returns)

  8%|▊         | 82/1000 [10:05<1:52:47,  7.37s/it, avg_return=7.11e+6] 

In [None]:
import gymnasium as gym
from gymnasium.utils.save_video import save_video

test_env = gym.make("Acrobot", render_mode="rgb_array_list")
#test_env = gym.make("CartPole-v1", render_mode="rgb_array_list")
s,_ = test_env.reset()
with torch.no_grad():
    for t in range(1000):
        a = pi.sample_action(torch.as_tensor(s))
        s2,r,d,trunc,_ = test_env.step(a)
        s = s2
        if d:
            break

save_video(test_env.render(), "videos", fps=test_env.metadata["render_fps"], name_prefix="reinforce_policy")
from IPython.display import Video
Video("videos/reinforce_policy-episode-0.mp4")

  logger.warn(


Moviepy - Building video /home/tordjx/rl-class-assignment-Tordjx/src/videos/reinforce_policy-episode-0.mp4.
Moviepy - Writing video /home/tordjx/rl-class-assignment-Tordjx/src/videos/reinforce_policy-episode-0.mp4



                                                    

Moviepy - Done !
Moviepy - video ready /home/tordjx/rl-class-assignment-Tordjx/src/videos/reinforce_policy-episode-0.mp4


