# PPO
---

### 1. Import the Necessary Packages

In [1]:
%matplotlib inline
import wandb

from model.ppo_3 import PPO
import numpy as np
from env.wrappers import LunarContinuous, LunarLanderWithUnknownWind,LunarLanderWithKnownWind
from utils.logger import WandbSummaryWritter

### 2. Instantiate the Model

Setup the hyperparameters in the code cell below.

In [2]:
misc_hyperparameters = {
    'save_freq': 0 ,  
    'val_freq': 10,
    'val_iter': 10,
    'env': LunarLanderWithUnknownWind
}

Initialise wandb session in the code cell below.

In [3]:
MAX_TOTAL_TIMESTEPS_TO_TRAIN = 500
VAL_ITER = 30
MAX_RUN_COUNT = 30
sweep_config = {
    'method': 'bayes', 
    'metric': {
        'name': 'validation_rewards',
        'goal': 'maximize'
    },
    'parameters': {
        'actor_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'critic_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'adp_lr': {
            "distribution": "uniform",
            "min": 1e-5,
            "max": 0.1
        },
        'gamma': {
            'min': 0.9,
            'max': 1.
        },
        'lam': {
            'min': 0.9,
            'max': 1.
        },
        'max_grad_norm': {
            "min": 0.1,
            "max": 10.0, 
        },
        'n_updates_per_iteration': {
            'values': list(range(1, 21))
        },
        'num_envs': {
            'values': list(range(1, 100))
        },
        'anneal_lr': {
            'values': [True, False]
        },
        'num_steps': {
            'distribution': 'q_uniform',
            'min': 300,
            'max': 4000,
            'q': 100
        },
        'adp_num_steps': {
            'distribution': 'q_uniform',
            'min': 200,
            'max': 1000,
            'q': 10
        },
        'anneal_discount': {
            'distribution': 'q_uniform',
            'min': 1,
            'max': 1000,
            'q': 10
        },
        'batches': {
            'distribution': 'q_uniform',
            "min": 1,     # 2^0
            "max": 1024,  # 2^10
            "q": 2 
        }
    }
}

In [4]:
def train_model(config = None):
    logger = WandbSummaryWritter(project='lunar', config =config)
    ppo = PPO(logger,**misc_hyperparameters) if config is None else PPO(summary_writter=logger, **config, **misc_hyperparameters)
    ppo.train()

    base_val_rews, base_val_dur = ppo.validate(VAL_ITER, False, False)
    adp_val_rews, adp_val_dur = ppo.validate(VAL_ITER, False, True)
    # wind_vals, base_z, adpt_z = ppo.validate_encoders()
    ppo.env.close()

    # hist, bin_edges = np.histogram(val_rews, bins=20)
    # print(f"Len bin_edges: {len(bin_edges)}")
    # Debugging prints

    wandb.log({
        "validation_rewards": base_val_rews,
        "validation_duration": base_val_dur,
        "maximum_base_validation_reward": np.max(base_val_rews),
        "adp_validation_rewards": adp_val_rews,
        "adp_validation_duration": adp_val_dur,
        "adp_maximum_base_validation_reward": np.max(adp_val_rews),

        # "Encoder Outputs": wandb.plot.line_series(
        #         xs=wind_vals,
        #         ys=[base_z, adpt_z],
        #         keys=["Base Encoder", "Adaptive Encoder"],
        #         title="Encoder Outputs vs Wind",
        #         xname="Wind Value"
        #     )

        # "max_reward_video": wandb.Video(f"./videos/rl-video{np.argmax(adp_val_rews)}-episode-{np.argmax(adp_val_rews)}.mp4", fps=4, format="mp4")
    })

Initialise the model fo the desired timestamps. Alternatively can specify a checkpoint to continue training.

In [None]:
sweep_id = wandb.sweep(sweep_config, project="lunar")
wandb.agent(sweep_id, function=train_model, count=MAX_RUN_COUNT)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: c16i0n78
Sweep URL: https://wandb.ai/pmsaraiva2712-tum/lunar/sweeps/c16i0n78


[34m[1mwandb[0m: Agent Starting Run: z4tp8pww with config:
[34m[1mwandb[0m: 	actor_lr: 0.06612883447165088
[34m[1mwandb[0m: 	adp_lr: 0.09561611903425288
[34m[1mwandb[0m: 	adp_num_steps: 640
[34m[1mwandb[0m: 	anneal_discount: 500
[34m[1mwandb[0m: 	anneal_lr: False
[34m[1mwandb[0m: 	batches: 816
[34m[1mwandb[0m: 	critic_lr: 0.013570714391609753
[34m[1mwandb[0m: 	gamma: 0.9664221045735084
[34m[1mwandb[0m: 	lam: 0.912244905312102
[34m[1mwandb[0m: 	max_grad_norm: 3.432086547927549
[34m[1mwandb[0m: 	n_updates_per_iteration: 12
[34m[1mwandb[0m: 	num_envs: 82
[34m[1mwandb[0m: 	num_steps: 1500
[34m[1mwandb[0m: Currently logged in as: [33mmohamedrostom[0m ([33mpmsaraiva2712-tum[0m). Use [1m`wandb login --relogin`[0m to force relogin



-------------------- Iteration #1 --------------------
Average Episodic Return: -171.7
Average Actor Loss: -0.20855
Average Critic Loss: 1202.0674167339619
Average KL Divergence: 0.013641167901313075
Iteration took: 2.71 secs, of which rollout took 2.29 secs and gradient updates took 0.42 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -199.24
Average Actor Loss: -0.20985
Average Critic Loss: 902.277753008329
Average KL Divergence: 0.012839652945219245
Iteration took: 2.82 secs, of which rollout took 2.4 secs and gradient updates took 0.42 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -160.12
Average Actor Loss: -0.21144
Average Critic Loss: 737.74773690639

0,1
actor_learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
adp_maximum_base_validation_reward,▁
adp_validation_duration,▁
adp_validation_rewards,▁
average_actor_loss,▁▂▃▄▄▅▅▅▅▆▆▆▆▆▆▆▅▆▆▆▆▆▆▆▆▇▇▇▇▇██████████
average_critic_loss,█▅▅▄▄▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
average_episode_rewards,▁▂▃▃▃▄▄▄▃▄▇▆▇█▆▆▇▆▇▇▆▄▇▇█▇▇▇▇█▇▇▇▇▆▇██▇█
critic_learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
iteration_compute,▁▁▂▂▄▃▃▄▃█▄▇▃▄▄▃▃▂▆▃▂▂▂▂▂▂▂▂▆▂▂▂▂▂▂▂▂▂▂▂
maximum_base_validation_reward,▁

0,1
actor_learning_rate,0.005
adp_maximum_base_validation_reward,115.70015
adp_validation_duration,762.5
adp_validation_rewards,115.70015
average_actor_loss,-0.21213
average_critic_loss,39.53461
average_episode_rewards,173.09468
critic_learning_rate,0.0075
iteration_compute,16.93064
maximum_base_validation_reward,148.94477


[34m[1mwandb[0m: Agent Starting Run: 0xw4f9pi with config:
[34m[1mwandb[0m: 	actor_lr: 0.0628528868914129
[34m[1mwandb[0m: 	adp_lr: 0.02345918585495461
[34m[1mwandb[0m: 	adp_num_steps: 320
[34m[1mwandb[0m: 	anneal_discount: 10
[34m[1mwandb[0m: 	anneal_lr: True
[34m[1mwandb[0m: 	batches: 670
[34m[1mwandb[0m: 	critic_lr: 0.04247580573381598
[34m[1mwandb[0m: 	gamma: 0.9330846124396596
[34m[1mwandb[0m: 	lam: 0.9023386292236067
[34m[1mwandb[0m: 	max_grad_norm: 1.1219318278013384
[34m[1mwandb[0m: 	n_updates_per_iteration: 18
[34m[1mwandb[0m: 	num_envs: 61
[34m[1mwandb[0m: 	num_steps: 600



-------------------- Iteration #1 --------------------
Average Episodic Return: -535.24
Average Actor Loss: -0.21089
Average Critic Loss: 5587.854203913762
Average KL Divergence: 0.007861548659946913
Iteration took: 2.83 secs, of which rollout took 2.43 secs and gradient updates took 0.4 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #2 --------------------
Average Episodic Return: -612.09
Average Actor Loss: -0.2113
Average Critic Loss: 4395.4199697641225
Average KL Divergence: 0.008010701135096426
Iteration took: 2.67 secs, of which rollout took 2.26 secs and gradient updates took 0.4 secs
Current actor learning rate: 0.005
Current critic learning rate: 0.0075
------------------------------------------------------


-------------------- Iteration #3 --------------------
Average Episodic Return: -267.88
Average Actor Loss: -0.21145
Average Critic Loss: 3626.6699775108

In [6]:
api = wandb.Api()

# Fetch the sweep object
sweep = api.sweep(f"pmsaraiva2712-tum/lunar/{sweep_id}")

# Fetch all runs from the sweep
runs = sweep.runs

# Sort runs by the metric you are optimizing for, e.g., 'val_loss'
best_run = sorted(runs, key=lambda run: run.summary.get('val_rewards', float('-inf')), reverse=True)[0]

# Extract best hyperparameters and metrics
best_params = best_run.config
best_metrics = best_run.summary

# Print the best hyperparameters and metrics
print("Best Hyperparameters:", best_params)
print("Best Metrics:", best_metrics)

Exception in thread Exception in threading.excepthook:
Exception ignored in thread started by: <bound method Thread._bootstrap of <Thread(Thread-26 (_run_job), stopped 140260891666112)>>
Traceback (most recent call last):
  File "/usr/lib/python3.11/threading.py", line 995, in _bootstrap
    self._bootstrap_inner()
  File "/usr/lib/python3.11/threading.py", line 1040, in _bootstrap_inner
    self._invoke_excepthook(self)
  File "/usr/lib/python3.11/threading.py", line 1352, in invoke_excepthook
    local_print("Exception in threading.excepthook:",
  File "/home/mohamedrostom62/ADLR/tum-adlr-ws25-16/.venv/lib/python3.11/site-packages/ipykernel/iostream.py", line 604, in flush
    self.pub_thread.schedule(self._flush)
  File "/home/mohamedrostom62/ADLR/tum-adlr-ws25-16/.venv/lib/python3.11/site-packages/ipykernel/iostream.py", line 267, in schedule
    self._event_pipe.send(b"")
  File "/home/mohamedrostom62/ADLR/tum-adlr-ws25-16/.venv/lib/python3.11/site-packages/zmq/sugar/socket.py", l

Best Hyperparameters: {'lam': 0.972831062901546, 'gamma': 0.9762351550544416, 'adp_lr': 0.09927788245027704, 'batches': 332, 'actor_lr': 0.09110289758146364, 'num_envs': 57, 'anneal_lr': False, 'critic_lr': 0.07641949481794698, 'num_steps': 400, 'adp_num_steps': 800, 'max_grad_norm': 0.9648681113993745, 'anneal_discount': 330, 'n_updates_per_iteration': 7}
Best Metrics: {'_runtime': 1381.245859275, '_step': 129, '_timestamp': 1736364925.640216, '_wandb': {'runtime': 1381}, 'actor_learning_rate': 0.005, 'average_actor_loss': -0.2133525514073367, 'average_critic_loss': 87.58824276961218, 'average_episode_rewards': 125.1359634399414, 'critic_learning_rate': 0.0075, 'iteration_compute': 7.647851373, 'simulated_iterations': 119, 'val_durs': 1201, 'val_rews': 124.9922409408134}
