# PPO (Proximal Policy Optimization)

<img src="ppo1.png">
<img src="ppo2.png">

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optimizers
from torch.distributions import Categorical

### Задание 1. Заполните пропуски в методе `train_net`

In [None]:
class PPO(nn.Module):
    """Actor-critic network plus rollout buffer for the PPO exercise.

    A shared hidden layer (``fc1``) feeds two heads: ``fc_pi`` produces action
    logits and ``fc_v`` produces a single state-value estimate. Transitions
    are accumulated with :meth:`put_data` and consumed by :meth:`train_net`.

    :param learning_rate: learning rate handed to the Adam optimizer
    :param gamma: stored as ``self.gamma`` (intended as the discount factor
        for the TD target the exercise asks for)
    :param eps_clip: stored as ``self.eps_clip`` (intended as the clipping
        range of the surrogate objective)
    :param state_dim: length of the observation vector; defaults to the
        original hard-coded value 4
    :param n_actions: number of discrete actions; defaults to the original
        hard-coded value 2
    :param hidden_dim: width of the shared hidden layer; defaults to the
        original hard-coded value 256
    """

    def __init__(self, learning_rate, gamma, eps_clip, state_dim=4, n_actions=2, hidden_dim=256):
        super().__init__()
        # Rollout buffer: list of (s, a, r, s_prime, prob_a, done) tuples.
        self.data = []

        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc_pi = nn.Linear(hidden_dim, n_actions)
        self.fc_v = nn.Linear(hidden_dim, 1)
        # Created after the layers so that self.parameters() already contains them.
        self.optimizer = optimizers.Adam(self.parameters(), lr=learning_rate)
        self.gamma = gamma
        self.eps_clip = eps_clip

    def pi(self, x, softmax_dim=0):
        """
        Policy head: action probabilities for the given state(s).

        :param x: state tensor; a single state or a batch of states
        :param softmax_dim: dimension that indexes the actions
            (0 for a single state, 1 for a batch with one state per row)
        :return: tensor of action probabilities, normalised along ``softmax_dim``
        """
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        """
        Value head: state-value estimate for the given state(s).

        :param x: input state tensor; a single state or a batch of states
        :return: tensor whose last dimension has size 1
        """
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        """
        Append one transition to the rollout buffer.

        :param transition: tuple ``(s, a, r, s_prime, prob_a, done)``
        :return: None
        """
        self.data.append(transition)

    def make_batch(self):
        """
        Convert the buffered transitions into tensors and empty the buffer.

        Scalars are wrapped in one-element lists so that ``a``, ``r``,
        ``done_mask`` and ``prob_a`` come out as column tensors with one row
        per transition. ``done_mask`` is 0 where ``done`` was true and 1
        otherwise.

        :return: tuple ``(s, a, r, s_prime, done_mask, prob_a)``
        """
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []

        for s, a, r, s_prime, prob_a, done in self.data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_lst.append([0 if done else 1])

        # Explicit dtypes: without them integer rewards become an int64 tensor
        # and float64 rewards a float64 tensor, which then leaks into the TD
        # target and the loss. Actions must be int64 to be usable with gather().
        s = torch.tensor(s_lst, dtype=torch.float)
        a = torch.tensor(a_lst, dtype=torch.long)
        r = torch.tensor(r_lst, dtype=torch.float)
        s_prime = torch.tensor(s_prime_lst, dtype=torch.float)
        done_mask = torch.tensor(done_lst, dtype=torch.float)
        prob_a = torch.tensor(prob_a_lst, dtype=torch.float)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a

    def train_net(self, epochs):
        """
        Run ``epochs`` optimisation passes over the transitions collected so far.

        Exercise template: each section marked "Your code here" raises
        ``NotImplementedError`` until it is replaced with an implementation.
        The rollout buffer is emptied by ``make_batch`` before the first
        pass, including when a placeholder subsequently raises.

        :param epochs: number of passes over the same batch
        :return: None
        :raises NotImplementedError: while any placeholder is still in place
            and ``epochs`` is at least 1
        """
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for _ in range(epochs):
            # compute td
            # td_target =
            # (done_mask is 0 for terminal transitions and 1 otherwise)
            ###### Your code here ##########
            raise NotImplementedError
            ################################
            delta = td_target - self.v(s)
            delta = delta.detach().numpy()

            advantage_lst = []
            advantage = 0.0
            # compute advantage_lst (pay attention to the order)
            # delta has one row per transition; append one-element lists so that
            # the tensor built below has the same (N, 1) shape as pi_a
            ###### Your code here ##########
            raise NotImplementedError
            ################################
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            # getting pi_a: probability the current policy assigns to the taken action
            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1, a)

            # computes first part of surrogate function
            # surr1 =
            ###### Your code here ##########
            raise NotImplementedError
            ################################

            # second part
            # surr2 = torch.clamp(pi_a/prob_a, 1 - eps, 1 + eps) * adv
            # (torch.clamp takes the lower bound first, then the upper bound)
            # pi_a/prob_a == exp(log(pi_a) - log(prob_a))
            ###### Your code here ##########
            raise NotImplementedError
            ################################
            # The surrogate term is per-sample, so `loss` is not a scalar:
            # reduce it (e.g. loss.mean()) before calling backward().
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s), td_target.detach())

            # make optimizer step, just copy/paste it from previous seminar :)
            ###### Your code here ##########
            raise NotImplementedError
            ################################



In [None]:
class RewardScalingWrapper(gym.RewardWrapper):
    """Reward wrapper that multiplies every reward by a fixed factor."""

    def __init__(self, env, scale):
        """
        :param env: environment to wrap
        :param scale: factor applied to each reward
        """
        super().__init__(env)
        self.scale = scale

    def reward(self, reward):
        """
        :param reward: reward to transform
        :return: ``reward`` multiplied by ``self.scale``
        """
        scaled_reward = reward * self.scale
        return scaled_reward


def run(episodes_number=10000, print_interval=20, reward_scale_f=0.01, epochs=3, time_horizon=20,
        learning_rate=0.0005, gamma=0.98, eps_clip=0.1):
    """
    Train a PPO model on CartPole-v1 and periodically print the average score.

    Each episode is split into chunks of at most ``time_horizon`` steps; after
    every chunk the collected transitions are passed to ``model.train_net``.

    :param episodes_number: number of episodes to play
    :param print_interval: print the average score every this many episodes
    :param reward_scale_f: factor applied to rewards by RewardScalingWrapper;
        the printed score is divided by it again to undo the scaling
    :param epochs: number of passes ``train_net`` makes over each chunk
    :param time_horizon: maximum number of steps collected before an update
    :param learning_rate: forwarded to PPO
    :param gamma: forwarded to PPO
    :param eps_clip: forwarded to PPO
    :raises ValueError: if ``time_horizon`` is smaller than 1
    """
    if time_horizon < 1:
        # With no steps per chunk the episode could never finish.
        raise ValueError("time_horizon must be at least 1")

    # adding reward scaling via wrapper
    env = RewardScalingWrapper(gym.make('CartPole-v1'), scale=reward_scale_f)
    model = PPO(learning_rate, gamma, eps_clip)
    score = 0.0
    episodes_in_window = 0  # episodes accumulated into `score` since the last print

    try:
        for n_epi in range(episodes_number):
            # Newer gym versions return (observation, info) from reset(),
            # older ones return the bare observation.
            reset_result = env.reset()
            s = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            done = False
            while not done:
                for t in range(time_horizon):
                    # Rollout only needs numbers, not an autograd graph.
                    with torch.no_grad():
                        # noinspection PyUnresolvedReferences
                        prob = model.pi(torch.from_numpy(s).float())
                    m = Categorical(prob)
                    a = m.sample().item()

                    # Newer gym versions return a 5-tuple with separate
                    # terminated/truncated flags; older ones a 4-tuple.
                    step_result = env.step(a)
                    if len(step_result) == 5:
                        s_prime, r, terminated, truncated, info = step_result
                        done = terminated or truncated
                    else:
                        s_prime, r, done, info = step_result

                    # noinspection PyUnresolvedReferences
                    model.put_data((s, a, r, s_prime, prob[a].item(), done))
                    s = s_prime

                    score += r
                    if done:
                        break

                # noinspection PyUnresolvedReferences
                model.train_net(epochs)

            episodes_in_window += 1
            if n_epi % print_interval == 0 and n_epi != 0:
                # Divide by the number of episodes actually accumulated: the
                # first window spans print_interval + 1 episodes (0..print_interval).
                print("# of episode :{}, avg score : {:.1f}".format(
                    n_epi, score / episodes_in_window / reward_scale_f))
                score = 0.0
                episodes_in_window = 0
    finally:
        # Release the environment even if training raises.
        env.close()

In [None]:
# Launch training with the default hyperparameters declared in run().
run()

### Задание 2. Проверьте работу алгоритма с различными гиперпараметрами (epochs, time_horizon, learning_rate, eps_clip) и постройте графики.

### Бонус. Добавьте entropy exploration bonus
<img src="entropy.png">

### Дополнительные материалы
[Лекция 8. Оптимизация градиента стратегии](https://yadi.sk/i/gTNlK0m4_A1U7Q)

[Bootcamp 2017 - Shulman - NPG, TRPO, PPO](https://yadi.sk/i/E0Ua9lmEzUPsrQ)

[Openai baselines ppo](https://openai.com/blog/openai-baselines-ppo/)

[Proximal policy optimization with sonic the hedgehog 2](https://towardsdatascience.com/proximal-policy-optimization-ppo-with-sonic-the-hedgehog-2-and-3-c9c21dbed5e)
