# Reinforcement
In this notebook, we are going to be focusing on the QAgent and the PQAgent in the reinforcement module in PAI-Utils.

## Import Packages

In [1]:
import gym
import numpy as np

from paiutils import reinforcement as rl

## Create Environment

We will test the agents on the Taxi-v3 environment. For more information on this environment, see the [Taxi-v3 page](https://gym.openai.com/envs/Taxi-v3/).

In [2]:
# Build the raw Gym environment for the Taxi-v3 task.
genv = gym.make('Taxi-v3')

# Read the per-episode step limit from the environment's public
# registration spec instead of reaching into the private
# `_max_episode_steps` attribute, which is an implementation detail.
max_steps = genv.spec.max_episode_steps
print(max_steps)
print(genv.observation_space, genv.action_space)

# Wrap the Gym environment; the wrapped `env` is what the later cells
# use to play episodes with the agents.
env = rl.GymWrapper(genv)

200
Discrete(500) Discrete(6)


## QAgent

### Create Agent

In [3]:
# Components of the action-selection policy. The meaning of the numeric
# arguments is not visible here -- presumably ExponentialDecay takes
# (initial value, decay rate, minimum value); confirm against PAI-Utils.
greedy_policy = rl.GreedyPolicy()
exploration_decay = rl.ExponentialDecay(1, 0.001, 0.1)
policy = rl.StochasticPolicy(
    greedy_policy, exploration_decay, 0, env.action_size
)

# Tabular Q-learning agent built from the environment's discrete state
# space and action count, the policy above, and a single discount rate.
discounted_rate = .9
agent = rl.QAgent(
    env.discrete_state_space,
    env.action_size,
    policy,
    discounted_rate,
)

### Train the Agent

In [4]:
# Put the agent into training mode with a learning rate of 0.5.
agent.set_playing_data(training=True, learning_rate=.5, verbose=False)

# Play the training episodes with all verbosity and rendering turned off.
num_episodes = 10000
quiet = dict(verbose=False, episode_verbose=False, render=False)
result = env.play_episodes(agent, num_episodes, max_steps, **quiet)

# Save the trained agent; the note embeds the value returned by training.
save_dir = ''
path = agent.save(save_dir, note=f'QAgent_{result}')

### Test the Agent

In [5]:
# Leave training mode, then play and render exactly one episode.
agent.set_playing_data(training=False)
render_only = dict(verbose=False, episode_verbose=False, render=True)
result = env.play_episodes(agent, 1, max_steps, **render_only)
print(result)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| :[43m [0m|B: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m|[43m [0m: |B: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : 

#### Multiple Trials
By testing the agent only once, we are not guaranteed a true representation of how good the agent is. Therefore, we will conduct multiple trials of 100 episodes each.

In [6]:
num_trials = 1000
num_episodes = 100

# Collect one value per trial; each trial plays `num_episodes` episodes
# with verbosity and rendering turned off.
quiet = dict(verbose=False, episode_verbose=False, render=False)
results = []
for _ in range(num_trials):
    result = env.play_episodes(agent, num_episodes, max_steps, **quiet)
    results.append(result)

# Summarize the per-trial values in a single line.
summary = (
    f'Mean: {np.mean(results)} - '
    f'Median: {np.median(results)} - '
    f'Std Dev.: {np.std(results)} - '
    f'Max: {np.max(results)} - '
    f'Min: {np.min(results)}'
)
print(summary)

Mean: 7.914860000000001 - Median: 7.91 - Std Dev.: 0.2511067111807249 - Max: 8.6 - Min: 7.21


## PQAgent

### Create Agent

In [7]:
# Components of the action-selection policy. The meaning of the numeric
# arguments is not visible here -- presumably ExponentialDecay takes
# (initial value, decay rate, minimum value); confirm against PAI-Utils.
greedy_policy = rl.GreedyPolicy()
exploration_decay = rl.ExponentialDecay(1, .1, .3)
policy = rl.StochasticPolicy(
    greedy_policy, exploration_decay, 0, env.action_size
)

# The PQAgent is given several discount rates and learning rates at once;
# later cells select among them by index.
discounted_rates = [.9, .8, .91, .92, .93, .94, .95, .96, .97, .98, .99]
learning_rates = [.4, .1, .2]
agent = rl.PQAgent(
    env.discrete_state_space,
    env.action_size,
    policy,
    discounted_rates,
    learning_rates,
)

### Train the Agent

In [8]:
# Put the agent into training mode (learning rates were supplied when the
# agent was constructed, so none is passed here).
agent.set_playing_data(training=True, verbose=False)

# Play the training episodes with all verbosity and rendering turned off.
num_episodes = 20000
quiet = dict(verbose=False, episode_verbose=False, render=False)
result = env.play_episodes(agent, num_episodes, max_steps, **quiet)

# Save the trained agent; the note embeds the value returned by training.
save_dir = ''
path = agent.save(save_dir, note=f'PQAgent_{result}')

### Test the Agent

In [9]:
# Leave training mode, then play and render exactly one episode with
# per-step details (step number, reward, action) printed alongside.
agent.set_playing_data(training=False)
render_steps = dict(verbose=False, episode_verbose=True, render=True)
result = env.play_episodes(agent, 1, max_steps, **render_steps)
print(result)

+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+

Step: 1 - Reward: -1 - Action: 4
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (Pickup)
Step: 2 - Reward: -1 - Action: 0
+---------+
|R: | : :G|
|[42m_[0m: | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
Step: 3 - Reward: -1 - Action: 0
+---------+
|R: | : :G|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
Step: 4 - Reward: -1 - Action: 0
+---------+
|R: | : :G|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|[35mY[0m| : |B: |
+---------+
  (South)
Step: 5 - Reward: -1 - Action: 0
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (South)
Step: 6 - Reward: 20 - Action: 5
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
15

#### Multiple Trials
By testing the agent only once, we are not guaranteed a true representation of how good the agent is. Therefore, we will conduct multiple trials of 100 episodes each.

In [10]:
num_trials = 100
num_episodes = 100
quiet = dict(verbose=False, episode_verbose=False, render=False)

# Every (discounted rate index, learning rate index) combination, ordered
# with the learning rate index varying fastest.
ndx_pairs = [
    (d_ndx, l_ndx)
    for d_ndx in range(len(discounted_rates))
    for l_ndx in range(len(learning_rates))
]

for dndx, lndx in ndx_pairs:
    # Select which discounted rate / learning rate combination the agent
    # plays with, in non-training mode.
    agent.set_playing_data(
        training=False, discounted_rate_ndx=dndx, learning_rate_ndx=lndx
    )

    # Collect one value per trial for this combination.
    results = []
    for _ in range(num_trials):
        result = env.play_episodes(
            agent, num_episodes, max_steps, **quiet
        )
        results.append(result)

    # Report the index pair followed by its summary statistics.
    print(dndx, lndx)
    summary = (
        f'Mean: {np.mean(results)} - '
        f'Median: {np.median(results)} - '
        f'Std Dev.: {np.std(results)} - '
        f'Max: {np.max(results)} - '
        f'Min: {np.min(results)}'
    )
    print(summary)

0 0
Mean: 7.898299999999998 - Median: 7.93 - Std Dev.: 0.24607338336358114 - Max: 8.49 - Min: 7.26
0 1
Mean: -64.0718 - Median: -63.845 - Std Dev.: 9.903752054650802 - Max: -36.93 - Min: -86.89
0 2
Mean: -3.3857999999999997 - Median: -2.2249999999999996 - Std Dev.: 4.727937008886646 - Max: 4.07 - Min: -17.21
1 0
Mean: 7.482500000000001 - Median: 7.855 - Std Dev.: 1.0020113522311012 - Max: 8.72 - Min: 3.89
1 1
Mean: -148.78570000000002 - Median: -149.26 - Std Dev.: 9.365496169984803 - Max: -113.33 - Min: -170.45
1 2
Mean: -93.8293 - Median: -93.065 - Std Dev.: 11.819510671343377 - Max: -65.72 - Min: -128.48
2 0
Mean: 7.960599999999999 - Median: 7.925 - Std Dev.: 0.22539662819128417 - Max: 8.43 - Min: 7.46
2 1
Mean: -49.073199999999986 - Median: -48.44 - Std Dev.: 8.442863954843759 - Max: -24.47 - Min: -70.51
2 2
Mean: 1.9801000000000004 - Median: 1.815 - Std Dev.: 3.3045815151089863 - Max: 8.29 - Min: -8.61
3 0
Mean: 7.9664 - Median: 7.97 - Std Dev.: 0.23389536121949916 - Max: 8.55 - Mi