# PPO
---

### 1. Import the Necessary Packages

In [1]:
%matplotlib inline
import wandb
import torch
from model.ppo_3 import PPO
import numpy as np
from env.wrappers import LunarContinuous, LunarLanderWithUnknownWind,LunarLanderWithKnownWind
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 0 ,  
    'val_freq': 10,
    'val_iter': 10,
    'env': LunarLanderWithKnownWind
}

Initialise wandb session in the code cell below.

In [3]:
VAL_ITER = 100
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'mean_validation_reward',
        'goal': 'maximize'
    },
    'parameters': {
        'actor_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.009
        },
        # 'critic_lr': {
        #     "distribution": "uniform",
        #     "min": 1e-5,
        #     "max": 0.1
        # },
        'adp_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.009
        },
        # 'gamma': {
        #     'min': 0.9,
        #     'max': 1.
        # },
        # 'lam': {
        #     'min': 0.9,
        #     'max': 1.
        # },
        'max_grad_norm': {
            "min": 0.1,
            "max": 1., 
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'num_envs': {
            'values': list(range(1, 50))
        },
        'anneal_lr': {
            'values': [True, False]
        },
        # 'num_steps': {
        #     'distribution': 'q_uniform',
        #     'min': 300,
        #     'max': 4000,
        #     'q': 100
        # },
        # 'adp_num_steps': {
        #     'distribution': 'q_uniform',
        #     'min': 200,
        #     'max': 1000,
        #     'q': 10
        # },
        'num_steps': {
            'distribution': 'int_uniform',
            'min': 100,
            'max': 1500
        },
        'adp_num_steps': {
            'distribution': 'int_uniform',
            'min': 100,
            'max': 800
        },
        'anneal_discount': {
            'distribution': 'q_uniform',
            'min': 1,
            'max': 1000,
            'q': 10
        },
        'n_sgd_batches': {
            'distribution': 'q_uniform',
            "min": 1,     # 2^0
            "max": 1024,  # 2^10
            "q": 2 
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = PPO(logger,**misc_hyperparameters) if wandb.config is None else PPO(summary_writter=logger, **wandb.config, **misc_hyperparameters)
    ppo.train()

    base_val_rews, base_val_dur = ppo.validate(VAL_ITER, False, False)
    adp_val_rews, adp_val_dur = ppo.validate(VAL_ITER, False, True)
    wind_vals, base_z, adpt_z = ppo.validate_encoders()
    ppo.env.close()

    # hist, bin_edges = np.histogram(val_rews, bins=20)
    # print(f"Len bin_edges: {len(bin_edges)}")
    # Debugging prints
    wandb.log({
        "validation_rewards": base_val_rews,
        "mean_validation_reward": np.mean(base_val_rews),
        "validation_duration": base_val_dur,
        "maximum_base_validation_reward": np.max(base_val_rews),
        "adp_validation_rewards": adp_val_rews,
        "adp_validation_duration": adp_val_dur,
        "adp_maximum_base_validation_reward": np.max(adp_val_rews),

        "Encoder Outputs": wandb.plot.line_series(
                xs=wind_vals,
                ys=[base_z, adpt_z],
                keys=["Base Encoder", "Adaptive Encoder"],
                title="Encoder Outputs vs Wind",
                xname="Wind Value"
            )

        # "max_reward_video": wandb.Video(f"./videos/rl-video{np.argmax(adp_val_rews)}-episode-{np.argmax(adp_val_rews)}.mp4", fps=4, format="mp4")
    })
    logger.save_histogram(base_val_rews, "Base Validation Rewards")
    logger.save_histogram(base_val_dur, "Base Validation Duration")
    logger.save_histogram(adp_val_rews, "Adaptive Validation Rewards")
    logger.save_histogram(adp_val_dur, "Adaptive Validation Duration")
    logger.save_model(ppo.policy, "base_model")
    logger.save_model(ppo.adapt_policy, "adp_model")

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [5]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: xoxts7ry
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/xoxts7ry


[34m[1mwandb[0m: Agent Starting Run: 96hsmq2q with config:
[34m[1mwandb[0m: 	actor_lr: 0.002843456264033954
[34m[1mwandb[0m: 	adp_lr: 0.0013946050760227774
[34m[1mwandb[0m: 	adp_num_steps: 162
[34m[1mwandb[0m: 	anneal_discount: 810
[34m[1mwandb[0m: 	anneal_lr: False
[34m[1mwandb[0m: 	max_grad_norm: 0.400312904518112
[34m[1mwandb[0m: 	n_sgd_batches: 1006
[34m[1mwandb[0m: 	n_updates_per_iteration: 7
[34m[1mwandb[0m: 	num_envs: 12
[34m[1mwandb[0m: 	num_steps: 633
[34m[1mwandb[0m: Currently logged in as: [33mmohamedrostom[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Return: -318.14
Average Actor Loss: -0.16976
Average Critic Loss: 537.7633963299121
Average KL Divergence: 0.057129782401028506
Iteration took: 35.05 secs, of which rollout took 2.78 secs and gradient updates took 32.27 secs
Current actor learning rate: 0.002843456264033954
Current critic learning rate: 0.002843456264033954
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -466.43
Average Actor Loss: -0.19381
Average Critic Loss: 656.180673928231
Average KL Divergence: 0.034101993665184356
Iteration took: 35.51 secs, of which rollout took 2.71 secs and gradient updates took 32.79 secs
Current actor learning rate: 0.002843456264033954
Current critic learning rate: 0.002843456264033954
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -275.38
Av

0,1
actor_learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
adp_learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
adp_maximum_base_validation_reward,▁
average_actor_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
average_adapt_loss,██▆▆▅▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁
average_critic_loss,█▆▆▅▅▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
average_episode_rewards,▁▅▅▅▆▆▆▆▇▆▇▇▇▇▇▇███▇█▇█▇████▇█▇▇▇▇▇▇▇█▇▇
critic_learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
iteration_compute,▅▅▅▅▅▅▆█▆█▇▆▆▆▆▆▆▅▆▆▆▇▆▆▆▅▅▅▇▅▅▇▁▁▁▁▂▁▂▁
maximum_base_validation_reward,▁

0,1
actor_learning_rate,0.00284
adp_learning_rate,0.00139
adp_maximum_base_validation_reward,230.90439
average_actor_loss,-0.20505
average_adapt_loss,31.64977
average_critic_loss,74.88878
average_episode_rewards,123.21326
critic_learning_rate,0.00284
iteration_compute,22.59974
maximum_base_validation_reward,222.15281


[34m[1mwandb[0m: Agent Starting Run: 8fim0vr6 with config:
[34m[1mwandb[0m: 	actor_lr: 0.0056686456129156765
[34m[1mwandb[0m: 	adp_lr: 0.0006122672225982947
[34m[1mwandb[0m: 	adp_num_steps: 233
[34m[1mwandb[0m: 	anneal_discount: 590
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	max_grad_norm: 0.6483368782972401
[34m[1mwandb[0m: 	n_sgd_batches: 58
[34m[1mwandb[0m: 	n_updates_per_iteration: 14
[34m[1mwandb[0m: 	num_envs: 32
[34m[1mwandb[0m: 	num_steps: 288



-------------------- Iteration #1 --------------------
Average Episodic Return: -214.56
Average Actor Loss: -0.14719
Average Critic Loss: 778.9089217930789
Average KL Divergence: 0.03680762658700036
Iteration took: 5.89 secs, of which rollout took 2.12 secs and gradient updates took 3.77 secs
Current actor learning rate: 0.0056686456129156765
Current critic learning rate: 0.0056686456129156765
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -178.82
Average Actor Loss: -0.18147
Average Critic Loss: 647.2260967095838
Average KL Divergence: 0.02438050328448588
Iteration took: 6.15 secs, of which rollout took 2.19 secs and gradient updates took 3.95 secs
Current actor learning rate: 0.005660430184491161
Current critic learning rate: 0.005660430184491161
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -160.76
Avera

0,1
actor_learning_rate,███▇▇▇▇▆▆▆▄▄▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
adp_learning_rate,██████▇▇▇▇▆▆▆▆▅▅▄▄▄▃▃▂▂▁▁
adp_maximum_base_validation_reward,▁
average_actor_loss,█▅▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
average_adapt_loss,█▆▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁
average_critic_loss,█▆▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
average_episode_rewards,▁▁▂▂▂▄▅▆▇▇▆▇▇▇▇▇▇███████████████████████
critic_learning_rate,█████▇▇▇▆▆▅▄▃▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
iteration_compute,▂▃▃▃▅▆▆▇▇▆▇▇▆▇▆▇▆▆█▆▇▆▆▆▆▆▆▆▅▆▆▆▆▆▆█▁▁▁▁
maximum_base_validation_reward,▁

0,1
actor_learning_rate,0.0
adp_learning_rate,0.00037
adp_maximum_base_validation_reward,208.41161
average_actor_loss,-0.21462
average_adapt_loss,88.36157
average_critic_loss,47.23237
average_episode_rewards,178.06905
critic_learning_rate,0.0
iteration_compute,4.71582
maximum_base_validation_reward,210.87888


[34m[1mwandb[0m: Agent Starting Run: wluc7sls with config:
[34m[1mwandb[0m: 	actor_lr: 0.0020207050083377357
[34m[1mwandb[0m: 	adp_lr: 0.0005221344588285969
[34m[1mwandb[0m: 	adp_num_steps: 523
[34m[1mwandb[0m: 	anneal_discount: 260
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	max_grad_norm: 0.23441601810215737
[34m[1mwandb[0m: 	n_sgd_batches: 754
[34m[1mwandb[0m: 	n_updates_per_iteration: 7
[34m[1mwandb[0m: 	num_envs: 48
[34m[1mwandb[0m: 	num_steps: 1300



-------------------- Iteration #1 --------------------
Average Episodic Return: -450.87
Average Actor Loss: -0.18977
Average Critic Loss: 1173.4915296094262
Average KL Divergence: 0.0165544277466302
Iteration took: 37.12 secs, of which rollout took 14.81 secs and gradient updates took 22.3 secs
Current actor learning rate: 0.0020207050083377357
Current critic learning rate: 0.0020207050083377357
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -438.23
Average Actor Loss: -0.20375
Average Critic Loss: 998.6993634643721
Average KL Divergence: 0.011855490853965533
Iteration took: 37.57 secs, of which rollout took 15.0 secs and gradient updates took 22.57 secs
Current actor learning rate: 0.002015091938870131
Current critic learning rate: 0.002015091938870131
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -365.37


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


In [None]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Best Hyperparameters: {'adp_lr': 0.09632103099393384, 'batches': 570, 'actor_lr': 0.09512195454699128, 'num_envs': 38, 'anneal_lr': True, 'num_steps': 3, 'adp_num_steps': 7, 'max_grad_norm': 0.2936539784109834, 'anneal_discount': 850, 'n_updates_per_iteration': 12}
Best Metrics: {'_wandb': {'runtime': 0}}


In [7]:
import numpy as np
import matplotlib.pyplot as plt

# Example NumPy array
data = np.random.randint(0, 100, size=100)  # Random integers between 0 and 100

# Calculate the median
median_value = np.median(data)

# Create histogram
num_bins = 10  # Number of buckets
plt.hist(data, bins=num_bins, edgecolor='black', alpha=0.7, label='Data')

# Add a vertical line for the median
plt.axvline(median_value, color='red', linestyle='dashed', linewidth=2, label=f'Median = {median_value}')

# Add labels, title, and legend
plt.xlabel('Value Range')
plt.ylabel('Frequency')
plt.title('Histogram with Buckets and Median')
plt.legend()

# Show the plot
plt.show()

KeyboardInterrupt: 