In [1]:
import gymnasium as gym
from src.reinforce.reinforce_agent import ReinforceAgent
from src.reinforce.reinforce_trainer import ReinforceTrainer
from src.util.plotter import learning_rate_ma
import numpy as np
import matplotlib.pyplot as plt
import os


In [2]:
# ----------------------------------------------------------------
# REINFORCE training on Walker2d: configuration
# ----------------------------------------------------------------

# Length of the training run, counted in episodes.
EPOCHS_WALKER = 100_000

# Widths handed to the agent as its first and second hidden-layer sizes.
HIDDEN_LYR_1_WALKER = HIDDEN_LYR_2_WALKER = 64

# Value forwarded as the agent's `lr` argument.
LR_WALKER = 1e-4

# Discount factor applied to future steps.
GAMMA_WALKER = 0.99

In [3]:
def train_reinforce(epochs: int,
                    layer_1: int,
                    layer_2: int,
                    lr: float,
                    discount: float,
                    exp_name: str) -> list:
    """Build an environment, agent and trainer, run training, and return its result.

    The environment is always closed before this function exits, including
    when training raises or is interrupted (e.g. by ``KeyboardInterrupt``).

    Args:
        epochs: Forwarded to ``ReinforceTrainer`` as its third argument.
        layer_1: Forwarded to ``ReinforceAgent`` as its third argument.
        layer_2: Forwarded to ``ReinforceAgent`` as its fourth argument.
        lr: Forwarded to ``ReinforceAgent`` as its fifth argument.
        discount: Forwarded to ``ReinforceAgent`` as its sixth argument.
        exp_name: Environment id passed to ``gym.make``.

    Returns:
        Whatever ``trainer.train()`` returns (annotated as a list).
        NOTE(review): presumably one return value per episode - confirm
        against ``ReinforceTrainer.train``.

    Raises:
        Anything raised by ``gym.make``, the agent/trainer constructors or
        ``trainer.train()`` propagates unchanged.
    """
    sim_env = gym.make(exp_name)
    try:
        # Only the first dimension of each space's shape is used, so both
        # spaces must expose a non-empty ``shape``.
        obs_dim = sim_env.observation_space.shape[0]
        action_dim = sim_env.action_space.shape[0]

        reinforce_agent = ReinforceAgent(
            obs_dim, action_dim, layer_1, layer_2, lr, discount)

        trainer = ReinforceTrainer(sim_env, reinforce_agent, epochs)

        return trainer.train()
    finally:
        # Release the environment on every exit path; the original leaked
        # it when training failed or was interrupted.
        sim_env.close()


In [4]:
# Run REINFORCE on Walker2d with the configured hyperparameters and keep
# the value returned by the training helper for later plotting.
walker_returns = train_reinforce(
    epochs=EPOCHS_WALKER,
    layer_1=HIDDEN_LYR_1_WALKER,
    layer_2=HIDDEN_LYR_2_WALKER,
    lr=LR_WALKER,
    discount=GAMMA_WALKER,
    exp_name="Walker2d-v4",
)


  logger.deprecation(
2025-Jul-24 21:14:43,221:reinforce_trainer:train:INFO: 
                        
=== Episode 1 ===
                          Mean reward from last 100 returns: -2.294036153278073
                    
2025-Jul-24 21:14:43,221:reinforce_trainer:show_policy:INFO: Recording episode
  logger.deprecation(


MoviePy - Building file /Users/danralley/projects/walking_with_DRLs/recordings/reinforce-2025-07-24_21_14_44.gif with imageio.


2025-Jul-24 21:14:45,796:reinforce_trainer:train:INFO:       
                        
=== Episode 101 ===
                          Mean reward from last 100 returns: 6.656291233075343
                    


KeyboardInterrupt: 

In [None]:
# Plot the training returns against their index (0, 1, 2, ...).
# NOTE(review): learning_rate_ma presumably draws a moving-average curve -
# confirm in src.util.plotter.
y = walker_returns
x = np.arange(len(y))

learning_rate_ma(
    x,
    np.array(y),
    title=f"Reinforce Learning Curve, layers: {HIDDEN_LYR_1_WALKER}, {HIDDEN_LYR_2_WALKER}",
)
