In [None]:
import gym
import os
import mujoco_py
from torch import nn
from torch.distributions import normal
import torch
from torch import device
import cv2
import numpy as np
import torch
import time
from torch.optim import Adam
from torch import from_numpy
from torch.optim.lr_scheduler import LambdaLR

import matplotlib.pyplot as plt




In [None]:
class Actor(nn.Module):
    """Gaussian policy network.

    Two tanh hidden layers of 64 units feed a linear head that produces the
    mean of each action dimension. The standard deviation is taken from a
    learnable ``log_std`` parameter that does not depend on the input.
    """

    def __init__(self, n_states, n_actions):
        """Create the layers and initialise their weights.

        Args:
            n_states: Number of input features (size of a state vector).
            n_actions: Number of action dimensions (size of the mean vector).
        """
        super().__init__()
        self.n_states = n_states
        self.n_actions = n_actions

        self.fc1 = nn.Linear(in_features=n_states, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=64)
        self.mu = nn.Linear(in_features=64, out_features=n_actions)

        # log(std) starts at zero, so the initial std is exactly 1 per action.
        self.log_std = nn.Parameter(torch.zeros(1, n_actions))

        # Every linear layer gets orthogonal weights and a zero bias.
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            nn.init.orthogonal_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, inputs):
        """Return a ``Normal`` distribution over actions for ``inputs``.

        Args:
            inputs: Tensor whose last dimension has ``n_states`` features.

        Returns:
            ``torch.distributions.normal.Normal`` with the network output as
            mean and ``exp(log_std)`` as standard deviation.
        """
        hidden = torch.tanh(self.fc1(inputs))
        hidden = torch.tanh(self.fc2(hidden))
        return normal.Normal(self.mu(hidden), self.log_std.exp())


class Critic(nn.Module):
    """State-value network: two tanh hidden layers of 64 units and a scalar head."""

    def __init__(self, n_states):
        """Create the layers and initialise their weights.

        Args:
            n_states: Number of input features (size of a state vector).
        """
        super().__init__()
        self.n_states = n_states

        self.fc1 = nn.Linear(in_features=n_states, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=64)
        self.value = nn.Linear(in_features=64, out_features=1)

        # Every linear layer gets orthogonal weights and a zero bias.
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            nn.init.orthogonal_(linear.weight)
            nn.init.zeros_(linear.bias)

    def forward(self, inputs):
        """Return the value estimate for ``inputs``.

        Args:
            inputs: Tensor whose last dimension has ``n_states`` features.

        Returns:
            Tensor with a trailing dimension of size 1 holding the estimates.
        """
        hidden = torch.tanh(self.fc1(inputs))
        hidden = torch.tanh(self.fc2(hidden))
        return self.value(hidden)

In [None]:
class Agent:
    """Actor-critic agent bundling the networks, their optimizers and LR schedules.

    The agent owns an ``Actor`` policy and a ``Critic`` value network, one Adam
    optimizer per network, and one linearly decaying ``LambdaLR`` schedule per
    optimizer. It can also save and restore all of that state to/from
    ``<env_name>_weights.pth``.
    """

    def __init__(self, env_name, n_iter, n_states, action_bounds, n_actions, lr):
        """Build networks, optimizers and schedulers on the CPU.

        Args:
            env_name: Prefix of the checkpoint file name.
            n_iter: Number of scheduler steps over which the learning-rate
                multiplier decays linearly from 1 to 0.
            n_states: Size of a state vector.
            action_bounds: Stored as given; not used by the methods of this class.
            n_actions: Size of an action vector.
            lr: Initial learning rate of both optimizers.
        """
        self.env_name = env_name
        self.n_iter = n_iter
        self.action_bounds = action_bounds
        self.n_actions = n_actions
        self.n_states = n_states
        self.device = torch.device("cpu")
        self.lr = lr

        self.current_policy = Actor(n_states=self.n_states,
                                    n_actions=self.n_actions).to(self.device)
        self.critic = Critic(n_states=self.n_states).to(self.device)

        self.actor_optimizer = Adam(self.current_policy.parameters(), lr=self.lr, eps=1e-5)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=self.lr, eps=1e-5)

        self.critic_loss = torch.nn.MSELoss()

        # LR multiplier: 1 at step 0, decreasing linearly, clamped at 0.
        self.scheduler = lambda step: max(1.0 - float(step / self.n_iter), 0)

        self.actor_scheduler = LambdaLR(self.actor_optimizer, lr_lambda=self.scheduler)
        # The critic schedule must drive the *critic* optimizer. It used to be
        # attached to the actor optimizer, so the critic learning rate never decayed.
        self.critic_scheduler = LambdaLR(self.critic_optimizer, lr_lambda=self.scheduler)

    def choose_dist(self, state):
        """Return the policy's action distribution for a single state.

        Args:
            state: NumPy array holding one state (no batch dimension).

        Returns:
            The distribution produced by the policy for a batch of size 1,
            computed without tracking gradients. No scaling or clipping to
            ``action_bounds`` is applied here.
        """
        state = np.expand_dims(state, 0)
        state = from_numpy(state).float().to(self.device)
        with torch.no_grad():
            dist = self.current_policy(state)

        return dist

    def get_value(self, state):
        """Return the critic's value estimate for a single state as a NumPy array.

        Args:
            state: NumPy array holding one state (no batch dimension).
        """
        state = np.expand_dims(state, 0)
        state = from_numpy(state).float().to(self.device)
        with torch.no_grad():
            value = self.critic(state)

        return value.detach().cpu().numpy()

    def optimize(self, actor_loss, critic_loss):
        """Take one gradient step on each network.

        The actor loss is back-propagated and applied first, then the critic
        loss. No gradient clipping is performed.
        """
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def schedule_lr(self):
        """Advance both learning-rate schedules by one step."""
        self.actor_scheduler.step()
        self.critic_scheduler.step()

    def save_weights(self, iteration, state_rms):
        """Write networks, optimizers, schedulers and normalizer stats to disk.

        Args:
            iteration: Stored under the ``"iteration"`` key.
            state_rms: Object exposing ``mean``, ``var`` and ``count``
                attributes, which are stored alongside the model state.
        """
        torch.save({"current_policy_state_dict": self.current_policy.state_dict(),
                    "critic_state_dict": self.critic.state_dict(),
                    "actor_optimizer_state_dict": self.actor_optimizer.state_dict(),
                    "critic_optimizer_state_dict": self.critic_optimizer.state_dict(),
                    "actor_scheduler_state_dict": self.actor_scheduler.state_dict(),
                    "critic_scheduler_state_dict": self.critic_scheduler.state_dict(),
                    "iteration": iteration,
                    "state_rms_mean": state_rms.mean,
                    "state_rms_var": state_rms.var,
                    "state_rms_count": state_rms.count}, self.env_name + "_weights.pth")

    def load_weights(self):
        """Restore the state written by ``save_weights``.

        Returns:
            Tuple ``(iteration, state_rms_mean, state_rms_var)`` read from the
            checkpoint.
        """
        # map_location keeps a checkpoint that was saved on another device
        # loadable on this agent's device.
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        checkpoint = torch.load(self.env_name + "_weights.pth", map_location=self.device)
        self.current_policy.load_state_dict(checkpoint["current_policy_state_dict"])
        self.critic.load_state_dict(checkpoint["critic_state_dict"])
        self.actor_optimizer.load_state_dict(checkpoint["actor_optimizer_state_dict"])
        self.critic_optimizer.load_state_dict(checkpoint["critic_optimizer_state_dict"])
        self.actor_scheduler.load_state_dict(checkpoint["actor_scheduler_state_dict"])
        self.critic_scheduler.load_state_dict(checkpoint["critic_scheduler_state_dict"])
        iteration = checkpoint["iteration"]
        state_rms_mean = checkpoint["state_rms_mean"]
        state_rms_var = checkpoint["state_rms_var"]

        return iteration, state_rms_mean, state_rms_var

    def set_to_eval_mode(self):
        """Put both networks into evaluation mode."""
        self.current_policy.eval()
        self.critic.eval()

    def set_to_train_mode(self):
        """Put both networks into training mode."""
        self.current_policy.train()
        self.critic.train()

In [None]:
# Environment id is derived from ENV_NAME so the probe env, the evaluation env
# and the checkpoint file name (built from ENV_NAME inside Agent) stay in sync.
ENV_NAME = "Walker2d"
ENV_ID = ENV_NAME + "-v2"

# Throw-away environment used only to read the observation/action dimensions.
test_env = gym.make(ENV_ID)

n_states = test_env.observation_space.shape[0]
action_bounds = [test_env.action_space.low[0], test_env.action_space.high[0]]
n_actions = test_env.action_space.shape[0]

# Hyperparameters. Only n_iterations and lr are consumed by Agent; the others
# are kept because code outside the cells shown here may read them.
n_iterations = 500
lr = 3e-4
epochs = 10
clip_range = 0.2
mini_batch_size = 64
T = 5000

env = gym.make(ENV_ID)

# The agent is built once (it used to be constructed twice with identical
# arguments, the first instance being discarded immediately).
agent = Agent(n_states=n_states,
              n_iter=n_iterations,
              env_name=ENV_NAME,
              action_bounds=action_bounds,
              n_actions=n_actions,
              lr=lr)



In [None]:
# Restore the saved networks and pick up the state-normalisation statistics
# (mean and variance) stored in the checkpoint; the saved iteration counter is
# not needed here.
_, state_rms_mean, state_rms_var = agent.load_weights()

# Switch both networks to evaluation mode before running rollouts.
agent.set_to_eval_mode()

In [None]:
# Evaluate the loaded policy: run `num_episodes` rollouts with actions sampled
# from the policy distribution and record each episode's undiscounted return.
num_episodes = 100  # Number of evaluation episodes
results = []
for episode in range(num_episodes):
    # Gym >= 0.26 returns (observation, info) from reset(); older versions
    # return the bare observation. Indexing [0] unconditionally would pick a
    # single scalar out of the observation under the older API.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False
    cumulative_reward = 0

    while not done:
        # Normalise with the statistics from the checkpoint, then clip.
        state = np.clip((state - state_rms_mean) / (state_rms_var ** 0.5 + 1e-8), -5.0, 5.0)
        dist = agent.choose_dist(state)
        action = dist.sample().cpu().numpy()

        # step() returns 5 values (terminated and truncated separately) under
        # Gym >= 0.26 and 4 values before that. Both flags end the episode;
        # slicing [:4] would silently ignore time-limit truncation.
        step_out = env.step(action)
        if len(step_out) == 5:
            next_state, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_out

        state = next_state
        cumulative_reward += reward

    results.append(cumulative_reward)


In [None]:
# Summary statistics of the evaluation returns, one per output line.
print(
    f"Average Return for ppo: {np.mean(results)}",
    f"Standard Deviation of Return for ppo: {np.std(results)}",
    f"Variance of Return for ppo: {np.var(results)}",
    sep="\n",
)

In [None]:
# Robustness check: same rollout as the plain evaluation, but zero-mean
# Gaussian noise with standard deviation `perturbation_scale` is added to
# every sampled action before it is sent to the environment.
perturbation_scale = 0.1
perturbed_rewards = []

for episode in range(num_episodes):
    # Gym >= 0.26 returns (observation, info) from reset(); older versions
    # return the bare observation. Handle both so this cell agrees with the
    # other rollout cells.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False
    cumulative_reward = 0

    while not done:
        # Normalise with the statistics from the checkpoint, then clip.
        state = np.clip((state - state_rms_mean) / (state_rms_var ** 0.5 + 1e-8), -5.0, 5.0)
        dist = agent.choose_dist(state)
        action = dist.sample().cpu().numpy()
        action += np.random.normal(0, perturbation_scale, size=action.shape)  # Perturb the action

        # step() returns 5 values (terminated and truncated separately) under
        # Gym >= 0.26 and 4 values before that. Both flags end the episode.
        step_out = env.step(action)
        if len(step_out) == 5:
            next_state, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_out

        state = next_state
        cumulative_reward += reward

    perturbed_rewards.append(cumulative_reward)

perturbed_rewards = np.array(perturbed_rewards)
print(f"Average Return under Perturbation for ppo: {np.mean(perturbed_rewards)}")
print(f"Standard Deviation under Perturbation for ppo: {np.std(perturbed_rewards)}")


In [None]:
# Training logs saved as NumPy arrays (presumably one entry per training
# episode -- confirm against the training script that wrote them).
rewards = np.load('ppo_reward.npy')
times = np.load('ppo_time.npy')

In [None]:
# Number of entries in the "first 10%" / "last 10%" windows. Clamped to at
# least 1: with fewer than 10 entries int(len * 0.1) is 0, which would make the
# early slice empty (mean -> NaN) and turn rewards[-0:] into the *whole* array.
window = max(1, int(len(rewards) * 0.1))

# Mean reward over the first window of training.
early_performance = np.mean(rewards[:window])
print(f"Early Performance for ppo: {early_performance}")

# Variance of the reward over the last window of training.
late_performance = rewards[-window:]
variance_late_performance = np.var(late_performance)
print(f"Variance of Late Performance for ppo: {variance_late_performance}")

# Wall-clock cost of training, in total and per logged entry.
total_time = sum(times)
average_time_per_episode = np.mean(times)
print(f"Total Training Time for ppo: {total_time} seconds")
print(f"Average Time per Episode for ppo: {average_time_per_episode} seconds")


In [None]:
# Learning curve: logged reward against its index in the training log.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(rewards)
ax.set_xlabel('Episodes')
ax.set_ylabel('Rewards')
ax.set_title('Episode vs Reward')
plt.show()

In [None]:

# Roll out one episode with the loaded policy, capturing a frame before every
# step, then encode the frames into an XVID .avi file at 30 fps.
env = gym.make('Walker2d-v2')

frames = []
cumulative_reward = 0

try:
    # Gym >= 0.26 returns (observation, info) from reset(); older versions
    # return the bare observation.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False

    while not done:
        # Capture frame
        # NOTE(review): render(mode=...) is the classic Gym signature; under
        # Gym >= 0.26 the render mode has to be passed to gym.make instead --
        # confirm which Gym version this notebook targets.
        frame = env.render(mode='rgb_array')
        frames.append(frame)

        # Normalise with the statistics from the checkpoint, then clip.
        state = np.clip((state - state_rms_mean) / (state_rms_var ** 0.5 + 1e-8), -5.0, 5.0)
        dist = agent.choose_dist(state)
        action = dist.sample().cpu().numpy()

        # step() returns 5 values (terminated and truncated separately) under
        # Gym >= 0.26 and 4 values before that. Both flags end the episode.
        step_out = env.step(action)
        if len(step_out) == 5:
            next_state, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_out

        state = next_state
        cumulative_reward += reward
finally:
    # Release the simulator/renderer even if the rollout raised.
    env.close()

output_video_path = 'ppo_output_video.avi'
frame_height, frame_width, _ = frames[0].shape
fourcc = cv2.VideoWriter_fourcc(*'XVID')
# OpenCV expects the frame size as (width, height).
video_writer = cv2.VideoWriter(output_video_path, fourcc, 30, (frame_width, frame_height))

# A writer that failed to open (e.g. codec unavailable) drops frames silently;
# fail loudly instead of reporting a video that was never written.
if not video_writer.isOpened():
    raise RuntimeError(f"Could not open video writer for {output_video_path}")

try:
    for frame in frames:
        # Rendered frames are RGB; OpenCV writes BGR.
        video_writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
finally:
    video_writer.release()

print(f"Video saved to {output_video_path}")
