In [3]:
!pip install stable-baselines3[extra]

Collecting stable-baselines3[extra]
  Downloading stable_baselines3-2.7.0-py3-none-any.whl.metadata (4.8 kB)
Downloading stable_baselines3-2.7.0-py3-none-any.whl (187 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m187.2/187.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stable-baselines3
Successfully installed stable-baselines3-2.7.0


# 1. Import Dependencies

In [39]:
import gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete
import numpy as np
import random
import os
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy


# 2. Types of Spaces

In [16]:
Discrete(3)

Discrete(3)

In [17]:
Box(0,1,shape=(3,3)).sample()

array([[0.83965856, 0.6592287 , 0.14932995],
       [0.7289243 , 0.776102  , 0.9638949 ],
       [0.28569618, 0.82222426, 0.00875576]], dtype=float32)

In [18]:
Box(0,255,shape=(3,3), dtype=int).sample()

array([[  2, 174, 193],
       [ 30,  89, 147],
       [177,  66,  86]])

In [19]:
Tuple((Discrete(2), Box(0,100, shape=(1,)))).sample()

(1, array([25.368006], dtype=float32))

In [20]:
Dict({'height':Discrete(2), "speed":Box(0,100, shape=(1,))}).sample()

OrderedDict([('height', 0), ('speed', array([81.14266], dtype=float32))])

In [21]:
MultiBinary(4).sample()

array([1, 1, 0, 0], dtype=int8)

In [22]:
MultiDiscrete([5,2,2]).sample()

array([4, 0, 1])

# 3. Building an Environment

In [37]:
class CoffeeBrewingEnv(Env): # Inherit from gymnasium.Env
    def __init__(self):
        # Actions: 0=decrease temp, 1=maintain, 2=increase temp, 3=decrease grounds, 4=increase grounds
        self.action_space = Discrete(5)
        # Observation: coffee strength (0-100), water temperature (80-100°C)
        self.observation_space = Box(low=np.array([0.0, 80.0]), high=np.array([100.0, 100.0]), dtype=np.float32)
        # Initial state: coffee strength and temperature
        self.state = np.array([50.0 + random.uniform(-10, 10), 90.0 + random.uniform(-5, 5)], dtype=np.float32)
        # Brewing time (seconds)
        self.brew_length = 120

    def step(self, action):
        # Apply action
        strength, temp = self.state
        if action == 0:  # Decrease temp
            temp -= 1.0
        elif action == 2:  # Increase temp
            temp += 1.0
        elif action == 3:  # Decrease grounds
            strength -= 2.0
        elif action == 4:  # Increase grounds
            strength += 2.0
        # Action 1 (maintain) does nothing

        # Update strength based on temperature
        strength += (temp - 90.0) * 0.1  # Strength adjusts slightly based on temp deviation
        strength = np.clip(strength, 0, 100)
        temp = np.clip(temp, 80, 100)
        self.state = np.array([strength, temp], dtype=np.float32)

        # Reduce brewing time
        self.brew_length -= 1

        # Calculate reward: ideal strength is 60-70, ideal temp is 88-92°C
        strength_reward = 2.0 if 60 <= strength <= 70 else -1.0
        temp_reward = 1.0 if 88 <= temp <= 92 else -0.5
        reward = strength_reward + temp_reward

        # Check if brewing is done
        terminated = self.brew_length <= 0
        truncated = False

        # Add slight noise to simulate brewing variability
        self.state += np.array([random.uniform(-0.5, 0.5), random.uniform(-0.2, 0.2)], dtype=np.float32)
        self.state = np.clip(self.state, [0.0, 80.0], [100.0, 100.0]).astype(np.float32)

        info = {}
        return self.state, reward, terminated, truncated, info

    def render(self):
        print(f"Coffee Strength: {self.state[0]:.1f}, Water Temp: {self.state[1]:.1f}°C, Time Left: {self.brew_length}s")

    def reset(self, seed=None, options=None):
        super().reset(seed=seed) # Handle the seed
        # Reset to initial conditions
        self.state = np.array([50.0 + random.uniform(-10, 10), 90.0 + random.uniform(-5, 5)], dtype=np.float32)
        self.brew_length = 120
        return self.state, {}

In [40]:
env = CoffeeBrewingEnv()

  gym.logger.warn(
  gym.logger.warn(


In [41]:
env.observation_space.sample()

array([60.67132, 80.79715], dtype=float32)

In [42]:
env.reset()

(array([47.23456, 88.04394], dtype=float32), {})

In [43]:
check_env(env, warn=True)

# 4. Test Environment

In [46]:
episodes = 50
for episode in range(1, episodes + 1):
    state, info = env.reset() # reset now returns state and info
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, terminated, truncated, info = env.step(action) # unpack 5 values
        score += reward
        done = terminated # Use 'terminated' for the loop condition
    print(f'Episode: {episode} Score: {score}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Coffee Strength: 0.0, Water Temp: 82.9°C, Time Left: 38s
Coffee Strength: 0.4, Water Temp: 81.9°C, Time Left: 37s
Coffee Strength: 1.5, Water Temp: 81.8°C, Time Left: 36s
Coffee Strength: 2.3, Water Temp: 81.6°C, Time Left: 35s
Coffee Strength: 2.0, Water Temp: 82.6°C, Time Left: 34s
Coffee Strength: 1.2, Water Temp: 81.4°C, Time Left: 33s
Coffee Strength: 0.2, Water Temp: 82.3°C, Time Left: 32s
Coffee Strength: 1.0, Water Temp: 82.4°C, Time Left: 31s
Coffee Strength: 2.5, Water Temp: 82.5°C, Time Left: 30s
Coffee Strength: 2.1, Water Temp: 81.4°C, Time Left: 29s
Coffee Strength: 0.8, Water Temp: 81.3°C, Time Left: 28s
Coffee Strength: 0.3, Water Temp: 81.2°C, Time Left: 27s
Coffee Strength: 0.2, Water Temp: 80.1°C, Time Left: 26s
Coffee Strength: 1.1, Water Temp: 80.0°C, Time Left: 25s
Coffee Strength: 0.3, Water Temp: 80.0°C, Time Left: 24s
Coffee Strength: 0.0, Water Temp: 80.1°C, Time Left: 23s
Coffee Strength: 0.0, W

# 5. Train

In [None]:
log_path = os.path.join('Training', 'Logs')
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=400000)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training/Logs/PPO_1


  return datetime.utcnow().replace(tzinfo=utc)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 120      |
|    ep_rew_mean     | -59.1    |
| time/              |          |
|    fps             | 1131     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 120         |
|    ep_rew_mean          | -60.3       |
| time/                   |             |
|    fps                  | 680         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011084453 |
|    clip_fraction        | 0.0803      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.6        |
|    explained_variance   | 0.00135     |
|    learning_rate        | 0.

# 6. Save

In [None]:
model.save('PPO_Coffee')
evaluate_policy(model, env, n_eval_episodes=10, render=True)
env.close()