In [None]:
# import torch
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

In [1]:
import torch
from sb3_contrib import MaskablePPO
from sb3_contrib.common.maskable.evaluation import evaluate_policy as maskable_evaluate_policy
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from environment3 import LifeStyleEnv

# Train on the GPU when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

def make_env():
    """Return a zero-argument factory that builds one monitored LifeStyleEnv.

    Nothing is constructed when ``make_env`` itself is called; the environment
    is only created when the returned callable is invoked (here, by the
    vectorised-env wrapper that receives it).

    Returns:
        A callable taking no arguments and returning a ``Monitor``-wrapped
        ``LifeStyleEnv`` that has been passed through ``check_env``.
    """
    def _build_monitored_env():
        monitored = Monitor(LifeStyleEnv())
        # Run the SB3 environment checker on every freshly built env, with
        # warnings enabled, before handing it to the caller.
        check_env(monitored, warn=True)
        return monitored

    return _build_monitored_env

# Single-environment vectorised wrapper around the monitored LifeStyleEnv,
# passed through VecNormalize with observation and reward normalisation both
# switched off.
# NOTE(review): with norm_reward=False the clip_reward setting presumably has
# no effect - confirm whether normalisation was meant to be enabled here.
train_env = VecNormalize(
    DummyVecEnv([make_env()]),
    norm_reward=False,
    norm_obs=False,
    clip_reward=10.0,
)

# Entropy-coefficient sweep: train one MaskablePPO agent per candidate value
# (same learning rate and architecture for all) and compare evaluation returns.
lr = 0.003
ent_coefs = [0.0, 0.001, 0.005, 0.01]
# One (ent_coef, mean_reward, std_reward) tuple per completed run; re-created
# on every execution of this cell so re-runs do not accumulate stale entries.
results = []

for ent_coef in ent_coefs:
    print(f"\nTraining with ent_coef={ent_coef}")

    # A fresh model per coefficient so runs do not share learned weights.
    model = MaskablePPO(
        "MultiInputPolicy",
        train_env,
        learning_rate=lr,
        n_steps=2048,
        batch_size=512,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=ent_coef,
        vf_coef=0.5,
        max_grad_norm=0.5,
        verbose=1,
        device=device,
        policy_kwargs=dict(net_arch=[128, 128])
    )

    model.learn(total_timesteps=500000)

    # MaskablePPO must be scored with the sb3-contrib evaluator: the generic
    # stable_baselines3 evaluate_policy never forwards action masks to
    # model.predict(), so the agent could be scored on invalid actions.
    # The maskable helper fetches the masks from the env on every step and
    # raises if the env does not support masking.
    mean_reward, std_reward = maskable_evaluate_policy(
        model, train_env, n_eval_episodes=20
    )
    results.append((ent_coef, mean_reward, std_reward))

print("\n=== Final Results ===")
for ent_coef, mean_reward, std_reward in results:
    print(f"Entropy: {ent_coef} | Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


Using device: cuda

Training with ent_coef=0.0
Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 672      |
|    ep_rew_mean     | 407      |
| time/              |          |
|    fps             | 621      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 672        |
|    ep_rew_mean          | 418        |
| time/                   |            |
|    fps                  | 619        |
|    iterations           | 2          |
|    time_elapsed         | 6          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.01240546 |
|    clip_fraction        | 0.197      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.768     |
|    explained_varian

KeyboardInterrupt: 

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.env_checker import check_env
import numpy as np

# Training environment for the final run; validated once with the SB3 checker.
env = LifeStyleEnv()
check_env(env)

# Dedicated evaluation environment. It is wrapped in Monitor (as the training
# env in the sweep cell is) so that evaluation reads the episode rewards and
# lengths recorded by Monitor; SB3's evaluator warns when this wrapper is missing.
eval_env = Monitor(LifeStyleEnv())

# Every 5000 training steps, run 10 deterministic episodes on eval_env, log the
# results under ./logs/results and keep the best model under ./logs/best_model.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/best_model",
    log_path="./logs/results",
    eval_freq=5000,
    n_eval_episodes=10,
    deterministic=True,
    render=False
)

# Entropy coefficient used for the final, longer training run below.
best_entropy_coef = 0.01

# NOTE(review): the sweep cell trains MaskablePPO, whereas this final run uses
# plain PPO (no action masking) - confirm that the switch is intentional.
model = PPO(
    policy="MultiInputPolicy",
    env=env,
    ent_coef=best_entropy_coef,
    learning_rate=3e-4,
    gamma=0.99,
    n_steps=2048,
    n_epochs=10,
    batch_size=64,
    verbose=1,
)

# Train for one million steps, with the evaluation callback attached.
model.learn(total_timesteps=1_000_000, callback=eval_callback)

# Persist the final (last-iteration) policy.
model.save("../agent/ppo_lifestylecoach_best_entropy")

# Roll out 10 deterministic episodes with the trained model on eval_env and
# collect the undiscounted return of each one.
obs = eval_env.reset()[0]
episode_rewards = []
for _ in range(10):
    episode_return = 0
    while True:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = eval_env.step(action)
        episode_return += reward
        # An episode ends on either natural termination or truncation.
        if terminated or truncated:
            break
    episode_rewards.append(episode_return)
    # Start the next episode from a fresh initial observation.
    obs = eval_env.reset()[0]

print("Mean evaluation reward:", np.mean(episode_rewards))
print("Std deviation:", np.std(episode_rewards))
