In [1]:
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces

In [42]:
class FloatMatchEnv(gym.Env):
    metadata = {"render_modes": ["console"]}

    def __init__(self, render_mode="console"):
        super(FloatMatchEnv, self).__init__()
        self.render_mode = render_mode

        self.action_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
        self.nextFloat = np.random.ranf(1,).astype(np.float32)
        self.memory = pd.DataFrame(columns=["Step", "Previous Observation", "Action", "Reward"])
        self.memory.index.name = "Step"
        self.currentStep = 0
        self.maxSteps = 1000
        self.tolerance = 0.05
    def reset(self, seed=None, options=None):
        """
        Important: the observation must be a numpy array
        :return: (np.array)
        """
        super().reset(seed=seed, options=options)
        self.currentStep = 0
        self.action = 0
        self.nextFloat = np.array(np.random.ranf(1,)).astype(np.float32)
        self.memory = pd.DataFrame(columns=["Previous Observation", "Action", "Reward"])
        self.memory.index.name = "Step"

        observation = self.nextFloat

        return observation, {}  # empty info dict
        
    def step(self, action):
        self.action = action.item()
            # Calculate the absolute error between predicted and target values
        error = abs(self.action - self.nextFloat)
    
        # Check if the predicted value is within the tolerance range
        if error <= self.tolerance:
            reward = 1.0  # Maximum reward for accurate predictions
        else:
            reward = max(0.0, (1.0 - error)**2)
            reward = reward.item()
        self.memory = pd.DataFrame(columns=["Previous Observation", "Action", "Reward"])
        self.memory.loc[self.currentStep] = [self.nextFloat, self.action, reward]
        self.nextFloat = np.random.ranf(1,).astype(np.float32)
        observation = self.nextFloat    
        self.currentStep = self.currentStep + 1       
        terminated = False
        truncated = False
        if self.currentStep > self.maxSteps:
            terminated = True
            

        return (
            observation,
            reward,
            terminated,
            truncated,
            self.memory.to_dict(),
        )
         
    def render(self):
        # agent is represented as a cross, rest as a dot
        if self.render_mode == "console":
           #print(self.date)
            print(self.memory.iloc[-1:].to_string(index=True, header=True))

    def close(self):
        pass

In [33]:
from ray.tune.registry import register_env

register_env("FloatMatchEnv", FloatMatchEnv)

In [45]:
from ray.rllib.algorithms.ppo import PPOConfig


config = (  # 1. Configure the algorithm,
    PPOConfig()
    .environment(FloatMatchEnv)
    .rollouts(num_rollout_workers=2)
    .framework("torch")
    .training()
    .evaluation(evaluation_num_workers=1)
)
#pretty_print(config.to_dict())

algo = config.build()  # 2. build the algorithm,

for _ in range(4):
    result = algo.train()  # 3. train it,

#pretty_print(result)
algo.evaluate()  # 4. and evaluate it.

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2023-11-12 15:27:07,323	INFO trainable.py:164 -- Trainable.setup took 18.433 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


{'evaluation': {'sampler_results': {'episode_reward_max': 532.2578395393148,
   'episode_reward_min': 508.26897799695143,
   'episode_reward_mean': 519.0193392155858,
   'episode_len_mean': 1001.0,
   'episode_media': {},
   'episodes_this_iter': 10,
   'policy_reward_min': {},
   'policy_reward_max': {},
   'policy_reward_mean': {},
   'custom_metrics': {},
   'hist_stats': {'episode_reward': [532.2578395393148,
     514.1613608431358,
     522.1899891095527,
     513.2495196799509,
     524.7666362208038,
     519.5802247530846,
     508.26897799695143,
     514.006536446941,
     518.8330887061456,
     522.8792188599764],
    'episode_lengths': [1001,
     1001,
     1001,
     1001,
     1001,
     1001,
     1001,
     1001,
     1001,
     1001]},
   'sampler_perf': {'mean_raw_obs_processing_ms': 0.55759815935869,
    'mean_inference_ms': 1.5768755709966595,
    'mean_action_processing_ms': 0.20032281011578182,
    'mean_env_wait_ms': 1.4055596163860868,
    'mean_env_render_ms'

In [46]:
obs, _ = env.reset()
env.render()

n_steps = 10
for step in range(n_steps):
    action = algo.compute_single_action(obs)
    obs, reward, terminated, truncated, info = env.step(action)
    log = info
    done = terminated or truncated
    env.render()


Empty DataFrame
Columns: [Previous Observation, Action, Reward]
Index: []
  Previous Observation    Action  Reward
0         [0.80681854]  0.815053     1.0
  Previous Observation   Action   Reward
1        [0.013449535]  0.71724  0.08774
  Previous Observation    Action    Reward
2           [0.779399]  0.674487  0.801182
  Previous Observation    Action  Reward
3         [0.11691102]  0.128062     1.0
  Previous Observation   Action    Reward
4          [0.9139152]  0.98967  0.854229
  Previous Observation  Action    Reward
5         [0.28003678]     1.0  0.078421
  Previous Observation  Action    Reward
6         [0.91142297]     1.0  0.830692
  Previous Observation    Action    Reward
7         [0.35351092]  0.299932  0.895713
  Previous Observation    Action   Reward
8          [0.9563749]  0.448056  0.24175
  Previous Observation  Action    Reward
9          [0.9498008]     1.0  0.902122
