# In [2]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np


import time

class OneMoleculeEnv(gym.Env):
    """Gymnasium environment for holding a decaying molecule count near a target.

    On every step the agent injects ``action`` molecules, after which each
    molecule present decays independently with probability
    ``1 - exp(-dt / molecule_lifetime)``.  The surviving count is clipped to
    ``obs_cap`` and pushed onto a rolling window of the last
    ``history_length`` counts; that window is the observation.  The reward is
    the negative squared difference between the current count and
    ``target_value``.

    Args:
        initial_value: Molecule count at the start of every episode.
        molecule_lifetime: Lifetime constant of the exponential decay, in the
            same units as ``dt``.  Must be positive.
        dt: Duration of one step.  Must be non-negative.
        max_steps: Number of steps after which the episode is reported over.
        history_length: Number of most recent counts in the observation.
        target_value: Count the agent is rewarded for staying close to.
        obs_cap: Upper bound applied to the count after each step; also the
            size of the discrete action space (actions ``0 .. obs_cap - 1``).
        render_mode: ``'human'`` makes :meth:`render` print the state; any
            other value makes :meth:`render` a no-op.

    Raises:
        ValueError: If ``molecule_lifetime`` is not positive or ``dt`` is
            negative.
    """

    def __init__(self, initial_value=10, molecule_lifetime=1, dt=0.1, max_steps=100, history_length=5, target_value=10, obs_cap=100, render_mode=None):
        super().__init__()

        # Reject parameters that would make the decay probability undefined
        # (division by zero) or fall outside [0, 1].
        if molecule_lifetime <= 0:
            raise ValueError(f"molecule_lifetime must be positive, got {molecule_lifetime!r}")
        if dt < 0:
            raise ValueError(f"dt must be non-negative, got {dt!r}")

        self.initial_value = initial_value
        self.molecule_lifetime = molecule_lifetime
        self.dt = dt
        self.max_steps = max_steps
        self.history_length = history_length
        self.target_value = target_value
        self.obs_cap = obs_cap  # Cap for the observation space
        self.render_mode = render_mode

        self.current_value = initial_value
        self.current_step = 0

        # Per-step, per-molecule decay probability; refreshed on every step so
        # later changes to ``dt`` / ``molecule_lifetime`` are picked up.
        self.prob_death = self._decay_probability()

        # The number of molecules to send in ranges from 0 to obs_cap - 1.
        self.action_space = spaces.Discrete(obs_cap)
        self.observation_space = spaces.Box(
            low=0, high=obs_cap, shape=(history_length,), dtype=np.float32
        )

        # Rolling window of the most recent counts, oldest first.
        self.history = np.full(history_length, initial_value, dtype=np.float32)

    def _decay_probability(self):
        """Return the probability that one molecule decays within one step."""
        # exp(-dt / lifetime) is the chance a molecule *survives* the step, so
        # the decay chance is its complement.  -expm1(x) equals 1 - exp(x) and
        # stays accurate when dt is much smaller than the lifetime.
        return float(-np.expm1(-self.dt / self.molecule_lifetime))

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed for the environment's own random generator
                (``self.np_random``); ``None`` leaves the generator as it is.
            options: Accepted for Gymnasium API compatibility; unused.

        Returns:
            A tuple ``(observation, info)`` where ``observation`` is a fresh
            ``float32`` array of length ``history_length`` filled with
            ``initial_value`` and ``info`` is an empty dict.
        """
        # Seeds self.np_random without touching NumPy's global random state.
        super().reset(seed=seed)

        self.current_value = self.initial_value
        self.current_step = 0
        self.history = np.full(self.history_length, self.initial_value, dtype=np.float32)
        # Hand out a copy so later steps cannot mutate the caller's observation.
        return self.history.copy(), {}

    def step(self, action):
        """Inject ``action`` molecules, let the population decay, and score it.

        Args:
            action: Number of molecules to add before the decay is applied.

        Returns:
            A tuple ``(observation, reward, terminated, truncated, info)``:
            the rolling window of counts (a copy), the negative squared
            distance to ``target_value`` as a float, two flags that are both
            true once ``max_steps`` steps have been taken, and an empty dict.
        """
        # int() unwraps NumPy integer actions so the count stays a plain int.
        count = int(self.current_value) + int(action)

        # Each molecule decays independently, so the number lost in this step
        # is binomially distributed.  Draw a scalar from the environment's own
        # generator so that reset(seed=...) makes episodes reproducible.
        self.prob_death = self._decay_probability()
        decayed = int(self.np_random.binomial(count, self.prob_death))

        # Cap the current value to obs_cap
        self.current_value = min(count - decayed, self.obs_cap)

        # Shift the window left by one and append the newest count.
        self.history[:-1] = self.history[1:]
        self.history[-1] = self.current_value

        self.current_step += 1

        # The step budget is the only end condition and is reported on both
        # flags.
        # NOTE(review): Gymnasium's convention is that a pure time limit is a
        # truncation only; consider returning terminated=False once callers
        # are confirmed to check both flags.
        done = self.current_step >= self.max_steps

        reward = -float((self.current_value - self.target_value) ** 2)

        # Return a copy so stored observations are not overwritten in place.
        return self.history.copy(), reward, done, done, {}

    def render(self):
        """Print the step, current count and history when in 'human' mode."""
        if self.render_mode == 'human':
            print(f"Step: {self.current_step}, Value: {self.current_value}, History: {self.history}")

    def close(self):
        """Release resources; this environment holds none."""
        pass

# Expose the environment in Gymnasium's registry under a versioned id.
# The registry-level cap of 1,000,000 steps is separate from the
# environment's own ``max_steps`` argument passed to ``gym.make`` below.
gym.register(id='OneMoleculeEnv-v0', entry_point=OneMoleculeEnv, max_episode_steps=1_000_000)

# Build an instance through the registry, overriding selected constructor
# defaults by keyword.
env = gym.make(
    'OneMoleculeEnv-v0',
    initial_value=10,
    max_steps=100_000,
    history_length=5,
    target_value=15,
    obs_cap=100,
)


# Cell output: logger.warn(f"Overriding environment {new_spec.id} already in registry.")
