In [1]:
# import torch
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

In [2]:
import gymnasium as gym
import numpy as np
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.evaluation import evaluate_policy
import torch
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env
from environment3 import LifeStyleEnv
from sb3_contrib.common.wrappers import ActionMasker


# Train on the GPU when torch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


def mask_fn(env: gym.Env) -> np.ndarray:
    """Return the action mask reported by the innermost environment.

    Follows the chain of ``.env`` attributes starting at ``env`` until an
    object without that attribute is reached, then returns the result of
    that object's ``action_masks()`` method.
    """
    innermost = env
    try:
        # Peel one wrapper layer per iteration; the first object that has
        # no ``.env`` attribute ends the walk.
        while True:
            innermost = innermost.env
    except AttributeError:
        pass
    return innermost.action_masks()

def make_env(is_eval: bool = False):
    """Build a ``LifeStyleEnv`` wrapped in a ``Monitor``.

    Args:
        is_eval: When True, skip the ``check_env`` validation pass.
            Defaults to False, so existing ``make_env()`` calls still
            validate the environment.

    Returns:
        The ``Monitor``-wrapped environment.
    """
    base_env = LifeStyleEnv()
    if not is_eval:
        # Validate the raw environment *before* wrapping it, so the resets
        # and steps performed by the checker do not go through Monitor.
        check_env(base_env, warn=True)
    return Monitor(base_env)

# Sweep configuration: one fixed learning rate, several entropy coefficients.
lr = 0.003
ent_coefs = [0.0, 0.001, 0.01]
results = []  # one (ent_coef, mean_reward, std_reward) tuple per trained model

# Training environment, exposing action masks through mask_fn.
env = make_env()
env = ActionMasker(env, mask_fn)

# Separate environment instance used only for post-training evaluation.
eval_env = make_env()
eval_env = ActionMasker(eval_env, mask_fn)

for ent_coef in ent_coefs:
    print(f"\nTraining with ent_coef={ent_coef}")

    # A fresh model per coefficient; only ent_coef varies between runs.
    model = MaskablePPO(
        "MultiInputPolicy",
        env,
        learning_rate=lr,
        n_steps=2304,
        batch_size=512,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=ent_coef,
        vf_coef=0.5,
        max_grad_norm=0.5,
        verbose=1,
        device=device,
        policy_kwargs=dict(net_arch=[128, 128])
    )

    model.learn(total_timesteps=250000)

    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=20)
    results.append((ent_coef, mean_reward, std_reward))

# Start from -inf rather than 0: mean rewards can be negative, and a zero
# threshold would then never be beaten, leaving the default below reported
# as "best" regardless of the measured results.
highest = float("-inf")
best_entropy_coef = 0.01  # fallback, only kept if `results` is empty

print("\n=== Final Results ===")
for ent_coef, mean_reward, std_reward in results:
    print(f"Entropy: {ent_coef} | Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

    if mean_reward > highest:
        highest = mean_reward
        best_entropy_coef = ent_coef

print(f"The best entropy coefficient: {best_entropy_coef}")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using device: cuda

Training with ent_coef=0.0
Using cuda device
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 2.3e+03   |
|    ep_rew_mean     | -3.13e+03 |
| time/              |           |
|    fps             | 657       |
|    iterations      | 1         |
|    time_elapsed    | 3         |
|    total_timesteps | 2304      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 2.3e+03      |
|    ep_rew_mean          | -2.06e+03    |
| time/                   |              |
|    fps                  | 658          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4608         |
| train/                  |              |
|    approx_kl            | 0.0075111436 |
|    clip_fraction        | 0.118        |
|    clip_range           | 0.2   

In [1]:
import gymnasium as gym
import numpy as np
import torch
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.callbacks import MaskableEvalCallback
from sb3_contrib.common.wrappers import ActionMasker
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor

from environment3 import LifeStyleEnv

# Entropy coefficient passed to the final training run below.
best_entropy_coef = 0.001

# Train on the GPU when torch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

def mask_fn(env: gym.Env) -> np.ndarray:
    """Return the action mask reported by the innermost environment.

    Follows the chain of ``.env`` attributes starting at ``env`` until an
    object without that attribute is reached, then returns the result of
    that object's ``action_masks()`` method.
    """
    innermost = env
    try:
        # Peel one wrapper layer per iteration; the first object that has
        # no ``.env`` attribute ends the walk.
        while True:
            innermost = innermost.env
    except AttributeError:
        pass
    return innermost.action_masks()

def make_env(is_eval: bool = False):
    """Build a ``LifeStyleEnv`` wrapped in a ``Monitor``.

    Args:
        is_eval: When True, skip the ``check_env`` validation pass.

    Returns:
        The ``Monitor``-wrapped environment.
    """
    base_env = LifeStyleEnv()
    if not is_eval:
        # Validate the raw environment *before* wrapping it, so the resets
        # and steps performed by the checker do not go through Monitor.
        check_env(base_env, warn=True)
    return Monitor(base_env)

# Training environment, exposing action masks through mask_fn.
env = make_env()
env = ActionMasker(env, mask_fn)

# Separate evaluation environment (no env-checker pass).
eval_env = make_env(is_eval=True)
eval_env = ActionMasker(eval_env, mask_fn)

# Use the maskable variant of the evaluation callback: the plain SB3
# EvalCallback runs the policy without action masks, so periodic evaluation
# (and the "best model" checkpoint it selects) could be based on actions the
# environment marks as invalid.
eval_callback = MaskableEvalCallback(
    eval_env,
    best_model_save_path="./logs/best_model",
    log_path="./logs/results",
    eval_freq=5000,
    n_eval_episodes=10,
    deterministic=True,
    render=False
)

model = MaskablePPO(
    "MultiInputPolicy",
    env,
    learning_rate=0.003,
    n_steps=2304,
    batch_size=512,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    ent_coef=best_entropy_coef,
    vf_coef=0.5,
    max_grad_norm=0.5,
    verbose=1,
    device=device,
    policy_kwargs=dict(net_arch=[128, 128])
)

model.learn(
    total_timesteps=800000,
    callback=eval_callback
)

# Persist the final (last-iteration) model; the callback separately stores
# the best-evaluated checkpoint under ./logs/best_model.
model.save("../agent/ppo_lifestylecoach_best_entropy2")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using device: cuda
Using cuda device
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 2.3e+03   |
|    ep_rew_mean     | -4.27e+03 |
| time/              |           |
|    fps             | 682       |
|    iterations      | 1         |
|    time_elapsed    | 3         |
|    total_timesteps | 2304      |
----------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2.3e+03     |
|    ep_rew_mean          | -1.89e+03   |
| time/                   |             |
|    fps                  | 682         |
|    iterations           | 2           |
|    time_elapsed         | 6           |
|    total_timesteps      | 4608        |
| train/                  |             |
|    approx_kl            | 0.009501146 |
|    clip_fraction        | 0.145       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.7

In [2]:
from environment3 import LifeStyleEnv
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.monitor import Monitor
from sb3_contrib.common.maskable.utils import get_action_masks
import numpy as np
from sb3_contrib import MaskablePPO

def make_env(is_eval: bool = False):
    """Build a ``LifeStyleEnv`` wrapped in a ``Monitor``.

    Args:
        is_eval: When True, skip the ``check_env`` validation pass.

    Returns:
        The ``Monitor``-wrapped environment.
    """
    base_env = LifeStyleEnv()
    if not is_eval:
        # Validate the raw environment *before* wrapping it, so the resets
        # and steps performed by the checker do not go through Monitor.
        check_env(base_env, warn=True)
    return Monitor(base_env)

# Evaluation environment: Monitor-wrapped, without the env-checker pass.
eval_env = make_env(is_eval=True)


model = MaskablePPO.load("../agent/ppo_lifestylecoach_best_entropy2.zip")

# Horizontal rule reused above/below the table header and after the rollout.
separator = "-------------------------------------------------------------------------------------------------------------------------------------"

print("Starting Final Evaluation...")
print(separator)
print(f"| {'Day':<3} | {'Timeslot':<10} | {'Action':<10} | {'Event':<10} | {'BMI':<8} | {'Stress':<8} | {'Energy':<8} | {'Hunger':<8} | {'Cal. Intake':<12} | {'Cal. Burned':<12} | {'Reward':<8} |")
print(separator)

episode_rewards = []  # total reward of each evaluated episode
step_rewards = []     # reward of every individual step, across all episodes
for episode in range(1):
    obs, info = eval_env.reset()
    # Base environment: used for action masks and for reading raw state.
    unwrapped_env = eval_env.unwrapped

    done = False
    total_reward = 0
    while not done:
        action_masks = get_action_masks(unwrapped_env)
        action, _ = model.predict(obs, deterministic=True, action_masks=action_masks)
        obs, reward, terminated, truncated, info = eval_env.step(action)
        done = terminated or truncated
        total_reward += reward

        # The state is read after the step, so the slot the action was
        # applied to is taken as the previous one (clamped at 0).
        # NOTE(review): when current_timeslot is 0 after the step this
        # reports slot 0's event — confirm that is the intended row value.
        timeslot_applied = unwrapped_env.state['current_timeslot'] - 1
        timeslot_applied = max(timeslot_applied, 0)
        event_applied = unwrapped_env.daily_schedule[timeslot_applied]

        print(
            f"| {unwrapped_env.state['day_of_episode']:<3} | "
            f"{unwrapped_env.state['current_timeslot']:<10} | "
            f"{action:<10} | "
            f"{event_applied:<10} | "
            f"{unwrapped_env.state['current_bmi']:<8.2f} | "
            f"{unwrapped_env.state['current_stress_level']:<8.2f} | "
            f"{unwrapped_env.state['current_energy_level']:<8.2f} | "
            f"{unwrapped_env.state['current_hunger_level']:<8.2f} | "
            f"{unwrapped_env.state['daily_calories_intake']:<12.2f} | "
            f"{unwrapped_env.state['daily_calories_burned']:<12.2f} | "
            f"{reward:<8.2f} |"
        )

        step_rewards.append(reward)

    # Record the episode total (previously accumulated but never used), so
    # the statistics below really are per-episode figures.
    episode_rewards.append(total_reward)

print(separator)
print("Mean evaluation reward:", np.mean(episode_rewards))
print("Std deviation:", np.std(episode_rewards))
# Per-step statistics, reported separately from the per-episode ones.
print("Mean reward per step:", np.mean(step_rewards))
print("Std deviation per step:", np.std(step_rewards))

Starting Final Evaluation...
-------------------------------------------------------------------------------------------------------------------------------------
| Day | Timeslot   | Action     | Event      | BMI      | Stress   | Energy   | Hunger   | Cal. Intake  | Cal. Burned  | Reward   |
-------------------------------------------------------------------------------------------------------------------------------------
| 0   | 1          | 8          | sleep      | 24.22    | 46.00    | 54.00    | 48.00    | 0.00         | 66.15        | 0.42     |
| 0   | 2          | 8          | sleep      | 24.22    | 42.00    | 58.00    | 46.00    | 0.00         | 132.30       | 1.04     |
| 0   | 3          | 8          | sleep      | 24.22    | 38.00    | 62.00    | 44.00    | 0.00         | 198.45       | 1.63     |
| 0   | 4          | 8          | sleep      | 24.22    | 34.00    | 66.00    | 42.00    | 0.00         | 264.60       | 2.20     |
| 0   | 5          | 8          | sleep    