In [1]:
# import torch
# print(torch.cuda.is_available())
# print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

In [None]:
from environment2 import LifeStyleCoachEnv
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
import torch

# Select the compute device: use CUDA when PyTorch reports it as available,
# otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

def make_env():
    """Return a zero-argument factory that builds one monitored environment.

    The factory form is what ``DummyVecEnv`` expects: a list of callables,
    each of which creates a fresh environment when invoked.

    Returns:
        A callable that creates a ``LifeStyleCoachEnv``, validates it with
        ``check_env`` and returns it wrapped in ``Monitor``.
    """
    def _init():
        raw_env = LifeStyleCoachEnv()
        # Validate the custom environment itself, *before* wrapping it, so the
        # reset/step calls made by the checker do not pass through the Monitor
        # wrapper and end up in its episode statistics.
        check_env(raw_env)
        return Monitor(raw_env)
    return _init

# Training environment: one LifeStyleCoachEnv inside a DummyVecEnv, wrapped
# in VecNormalize with observation and reward normalisation switched off.
# NOTE(review): with norm_reward=False the clip_reward bound presumably has no
# effect — confirm whether reward normalisation was meant to be enabled.
train_env = VecNormalize(
    DummyVecEnv([make_env()]),
    norm_obs=False,
    norm_reward=False,
    clip_reward=10.0,
)

# Entropy-coefficient sweep: train one PPO agent per value in `ent_coefs`,
# evaluate each one, and print a summary table at the end.

SEED = 0                    # shared by every run so that only ent_coef differs
TOTAL_TIMESTEPS = 500_000   # training budget per configuration
N_EVAL_EPISODES = 20        # episodes averaged for each reported score

# NOTE(review): 0.003 is ten times the SB3 PPO default (3e-4) — confirm this
# is intentional and not a typo.
lr = 0.003
ent_coefs = [0.0, 0.01, 0.05]
results = []  # one (ent_coef, mean_reward, std_reward) tuple per finished run

for ent_coef in ent_coefs:
    print(f"\nTraining with ent_coef={ent_coef}")

    model = PPO(
        "MultiInputPolicy",
        train_env,
        learning_rate=lr,
        n_steps=2048,
        batch_size=512,
        n_epochs=10,
        gamma=0.99,
        gae_lambda=0.95,
        clip_range=0.2,
        ent_coef=ent_coef,
        vf_coef=0.5,
        max_grad_norm=0.5,
        verbose=1,
        device=device,
        # Same seed for every configuration: without it, differences between
        # runs mix the effect of ent_coef with plain run-to-run randomness,
        # and re-running the notebook gives different numbers.
        seed=SEED,
        policy_kwargs=dict(net_arch=[128, 128])
    )

    model.learn(total_timesteps=TOTAL_TIMESTEPS)

    # NOTE(review): evaluation reuses the training env; consider a dedicated
    # evaluation env if normalisation statistics are ever enabled on it.
    mean_reward, std_reward = evaluate_policy(
        model, train_env, n_eval_episodes=N_EVAL_EPISODES
    )
    results.append((ent_coef, mean_reward, std_reward))

print("\n=== Final Results ===")
for ent_coef, mean_reward, std_reward in results:
    print(f"Entropy: {ent_coef} | Mean reward: {mean_reward:.2f} ± {std_reward:.2f}")
