# CPR appropriation with policy gradient

This notebook runs full training sessions on Harvest for each implemented policy gradient method (VPG, TRPO and PPO). The environment in use is a custom implementation of Harvest.

## Pre-requisites

The cells below install and import the libraries needed to run the notebook.

In [None]:
import sys
# Add the parent directory to the module search path so that `from src import ...` resolves
sys.path.append('../')

In [None]:
%%capture
!pip install -r ../init/requirements.txt
!pip install ../src/gym_cpr_grid

In [None]:
import time

import numpy as np
import gym
import torch
import matplotlib.pyplot as plt
from IPython import display

from src import memory, models, policies

%load_ext autoreload
%autoreload 2

## Utilities

The cells below define the environment, along with common variables used throughout the notebook.

In [None]:
# Keyword options forwarded to the custom CPR grid environment constructor:
# a single agent on a 25x7 grid, with tagging and gifting disabled.
env_options = dict(
    n_agents=1,
    grid_width=25,
    grid_height=7,
    tagging_ability=False,
    gifting_mechanism=None,
    initial_resource_probability=0.2,
    fov_squares_front=21,
    global_obs=True,
)
env = gym.make('gym_cpr_grid:CPRGridEnv-v0', **env_options)

In [None]:
# Input/output sizes of the networks are queried from the environment
observation_space_size = env.observation_space_size()
action_space_size = env.action_space_size()

# Training schedule, passed positionally to every policy's `train` call
epochs = 1000
steps_per_epoch = 1000
minibatch_size = 100

# Checkpointing and rendering options
save_every = 200
checkpoints_path = "../checkpoints"
render_every = None

# Network and optimisation settings shared by all the methods
hidden_sizes = [32, 32]
std_returns = True
std_advs = False
clip_gradient_norm = None
policy_lr = 3e-4
baseline_lr = 1e-3

# Read the Weights & Biases API key from a local file, closing the file
# handle as soon as the key has been read.
with open("../wandb_api_key_file", "r") as api_key_file:
    wandb_api_key = api_key_file.read().strip()

wandb_config = {
    "api_key": wandb_api_key,
    "project": "cpr-appropriation",
    "entity": "wadaboa",
}

## VPG

This section deals with training a set of Harvest agents using our custom Vanilla Policy Gradient implementation.

In [None]:
# Policy network (one output per action) and baseline network (single output)
vpg_policy_nn = models.MLP(observation_space_size, hidden_sizes, action_space_size)
vpg_baseline_nn = models.MLP(observation_space_size, hidden_sizes, 1, log_softmax=False)
vpg_policy = policies.VPGPolicy(env, vpg_policy_nn, baseline_nn=vpg_baseline_nn)

# Keyword options for the training call; the W&B config is tagged with the "VPG" group
vpg_train_options = dict(
    enable_wandb=True,
    wandb_config={**wandb_config, "group": "VPG"},
    save_every=save_every,
    checkpoints_path=checkpoints_path,
    render_every=render_every,
    std_returns=std_returns,
    std_advs=std_advs,
    clip_gradient_norm=clip_gradient_norm,
    policy_lr=policy_lr,
    baseline_lr=baseline_lr,
)
vpg_policy.train(epochs, steps_per_epoch, minibatch_size, **vpg_train_options)

## TRPO

This section deals with training a set of Harvest agents using our custom Trust Region Policy Optimization implementation.

In [None]:
# TRPO-specific hyperparameters, passed to `TRPOPolicy` as `beta` and `kl_target`
beta, kl_target = 1.0, 0.01

In [None]:
# Policy network (one output per action) and baseline network (single output)
trpo_policy_nn = models.MLP(observation_space_size, hidden_sizes, action_space_size)
trpo_baseline_nn = models.MLP(observation_space_size, hidden_sizes, 1, log_softmax=False)
trpo_policy = policies.TRPOPolicy(
    env,
    trpo_policy_nn,
    trpo_baseline_nn,
    beta=beta,
    kl_target=kl_target,
)

# Keyword options for the training call; the W&B config is tagged with the "TRPO" group
trpo_train_options = dict(
    enable_wandb=True,
    wandb_config={**wandb_config, "group": "TRPO"},
    save_every=save_every,
    checkpoints_path=checkpoints_path,
    render_every=render_every,
    std_returns=std_returns,
    std_advs=std_advs,
    clip_gradient_norm=clip_gradient_norm,
    policy_lr=policy_lr,
    baseline_lr=baseline_lr,
)
trpo_policy.train(epochs, steps_per_epoch, minibatch_size, **trpo_train_options)

## PPO

This section deals with training a set of Harvest agents using our custom Proximal Policy Optimization implementation.

In [None]:
# PPO-specific hyperparameters, passed to `PPOPolicy` as `alpha`, `beta` and `eps`.
# NOTE: `beta` rebinds the name also set in the TRPO section, so the TRPO cells
# must not be re-run after this one without re-running their own hyperparameter cell.
alpha, beta, eps = 0.5, 0.01, 0.2

In [None]:
# Policy network (one output per action) and baseline network (single output)
ppo_policy_nn = models.MLP(observation_space_size, hidden_sizes, action_space_size)
ppo_baseline_nn = models.MLP(observation_space_size, hidden_sizes, 1, log_softmax=False)
ppo_policy = policies.PPOPolicy(
    env,
    ppo_policy_nn,
    ppo_baseline_nn,
    alpha=alpha,
    beta=beta,
    eps=eps,
)

# Keyword options for the training call; the W&B config is tagged with the "PPO" group
ppo_train_options = dict(
    enable_wandb=True,
    wandb_config={**wandb_config, "group": "PPO"},
    save_every=save_every,
    checkpoints_path=checkpoints_path,
    render_every=render_every,
    std_returns=std_returns,
    std_advs=std_advs,
    clip_gradient_norm=clip_gradient_norm,
    policy_lr=policy_lr,
    baseline_lr=baseline_lr,
)
ppo_policy.train(epochs, steps_per_epoch, minibatch_size, **ppo_train_options)

## Evaluation

In this section we evaluate one of the trained models on the Harvest environment and visualize the results through custom rendering functions.

In [None]:
# Policy object to evaluate (swap in `vpg_policy` or `trpo_policy` to evaluate another method)
policy = ppo_policy
# Path of the checkpoint to restore; left empty here and must be filled in before running
checkpoint = ""
policy.load(checkpoint)

In [None]:
# Roll out the selected policy greedily for one episode, rendering every step.
# Both networks are switched to evaluation mode first.
policy.policy_nn.eval()
policy.baseline_nn.eval()

# Pairs of actions the rollout refuses to alternate between.
# NOTE(review): presumably these are mutually-undoing moves (e.g. left/right);
# confirm against the environment's action definition.
opposite_actions = {0: 1, 1: 0, 2: 3, 3: 2, 4: 5, 5: 4}

observations = env.reset()
prev_actions = {h: None for h in range(env.n_agents)}
for _ in range(env._max_episode_steps):
    env.render()
    action_dict = dict()
    for agent_handle in range(env.n_agents):
        legal_actions = env.get_legal_actions(agent_handle)
        # The network output is exponentiated before use
        # (assumes the policy network emits log-probabilities -- TODO confirm).
        # Use `policy`, not `ppo_policy`, so that the model chosen for evaluation
        # is the one that is actually queried.
        probs = np.exp(
            policy.policy_nn(
                torch.tensor(observations[agent_handle], dtype=torch.float32),
                torch.tensor(legal_actions, dtype=torch.int64)
            ).cpu().detach().numpy()
        )
        action = np.argmax(probs)

        # Discard the greedy action when it would either undo or repeat the
        # previous one, then fall back to the next most likely action.
        # (The repeat case was previously written as a no-op `==` comparison.)
        previous = prev_actions[agent_handle]
        if previous is not None and (
            action == previous or action == opposite_actions.get(int(previous))
        ):
            probs[action] = 0
            action = np.argmax(probs)

        prev_actions[agent_handle] = action
        # Record the action of every agent, not only of agent 0
        action_dict[agent_handle] = action

    print(env.agent_positions)
    print(action_dict)
    observations, rewards, dones, infos = env.step(action_dict)
    print(infos)
    display.clear_output(wait=True)
    # Insert a `time.sleep(...)` call here to slow the rendering down
env.close()