# **Dependencies**

In [None]:
import os
import glob
import tqdm

import gymnasium as gym
from gymnasium.wrappers import RecordVideo
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import (
    CheckpointCallback,
    EvalCallback,
    StopTrainingOnRewardThreshold,
    CallbackList
)
from pathlib import Path

# **Load Environment**

In [None]:
environment_name = "BipedalWalker-v3"


def _make_walker_env():
    """Build one hardcore BipedalWalker instance with rendering disabled."""
    return gym.make(environment_name, hardcore=True, render_mode=None)


# Twelve worker processes collect rollouts in parallel during training.
train_env = SubprocVecEnv([_make_walker_env for _ in range(12)])

# A single, un-vectorised copy of the same task for evaluation and manual tests.
test_env = _make_walker_env()

# **Training**

In [None]:
# Output directories: final/best models, and periodic training checkpoints.
save_path, checkpoint_path = (
    Path(raw_dir)
    for raw_dir in (
        r"C:\Users\KIIT\OneDrive\Desktop\VS CODE\Reinforcement\Bipedal\saved_models",
        r"C:\Users\KIIT\OneDrive\Desktop\VS CODE\Reinforcement\Bipedal\checkpoints",
    )
)

In [None]:
# Stop training early once the evaluated policy reaches the "solved" score.
# BipedalWalker hands out roughly 300 points for reaching the far end of the
# track, so a threshold well above that can never trigger; 300 is the score
# the environment documentation uses as the solved criterion.
stop_callback = StopTrainingOnRewardThreshold(
    reward_threshold=300,
    verbose=1
)

# Periodically evaluate on test_env and keep the best model in save_path.
# stop_callback is only consulted when an evaluation sets a new best mean
# reward. eval_freq counts callback calls (one per vectorised env.step()),
# so with N parallel training envs an evaluation runs every 5000 * N
# timesteps (60,000 with the 12 workers in train_env).
eval_callback = EvalCallback(
    test_env,
    callback_on_new_best=stop_callback,
    eval_freq=5000,
    best_model_save_path=save_path,
    verbose=1
)

# Periodic snapshots of the model. save_freq is also counted in callback
# calls, not timesteps: 2048 calls * 12 workers = 24,576 timesteps, i.e. one
# snapshot per PPO rollout when n_steps is 2048.
# NOTE(review): the "lunar_ppo" prefix looks like a leftover from a
# LunarLander notebook; it is kept so existing checkpoint filenames stay
# valid. Rename it if nothing depends on the old names.
checkpoint_callback = CheckpointCallback(
    save_freq=2048,
    save_path=checkpoint_path,
    name_prefix="lunar_ppo",
    verbose=2
)

# Run both callbacks on every training step.
callback = CallbackList([checkpoint_callback, eval_callback])

In [None]:
# PPO agent with an MLP policy, trained on the vectorised train_env.
model = PPO(
    "MlpPolicy",
    train_env,
    verbose=1,

    # --- Rollout / optimisation schedule ---
    # n_steps is per environment: every policy update consumes
    # n_steps * n_envs transitions (2048 * 12 = 24,576 with the 12 workers).
    n_steps=2048,
    batch_size=128,     # minibatch size used for each gradient step
    n_epochs=4,         # passes over each rollout buffer per update

    # --- Standard PPO hyperparameters ---
    learning_rate=3e-4,
    gamma=0.99,         # discount factor
    gae_lambda=0.95,
    ent_coef=0.001,
    vf_coef=0.5,
    max_grad_norm=0.5,

    # Separate 256-256 networks for the actor (pi) and the critic (vf).
    # SB3 >= 1.8 expects the dict directly; the older list-wrapped form
    # ([dict(pi=..., vf=...)]) is deprecated and only kept alive by a warning.
    policy_kwargs=dict(net_arch=dict(pi=[256, 256], vf=[256, 256])),
)

In [None]:
# Train for up to 10M timesteps; the callbacks handle checkpoints,
# periodic evaluation and early stopping.
model.learn(
    total_timesteps=10_000_000,
    callback=callback,
    progress_bar=True,
)

In [None]:
# Persist the final hardcore-trained policy inside the saved-models directory.
model.save(save_path.joinpath("saved_model_hardcore.zip"))

# **Loading**

In [None]:
import os

# Sanity check that the saved-models directory is reachable before loading.
# The directory name must match the one used for saving ("saved_models");
# a misspelt name makes this check report "not found" unconditionally.
path = r"C:\Users\KIIT\OneDrive\Desktop\VS CODE\Reinforcement\Bipedal\saved_models"
if os.path.exists(path):
    print("✅ File exists")
else:
    print(" File not found")

In [None]:
# Load the policy saved as "saved_model_normal.zip" and attach it to the
# single evaluation environment (test_env).
model = PPO.load(
    r"C:\Users\KIIT\OneDrive\Desktop\VS CODE\Reinforcement\Bipedal\saved_models\saved_model_normal.zip",
    env=test_env,
)

In [None]:
# Load the hardcore policy and attach it to the vectorised training
# environment (train_env), e.g. to continue training from the saved weights.
model = PPO.load(
    r"C:\Users\KIIT\OneDrive\Desktop\VS CODE\Reinforcement\Bipedal\saved_models\saved_model_hardcore.zip",
    env=train_env,
)

# **Testing**

In [None]:
# Roll out the loaded policy for a handful of episodes and report each return.
episodes = 5

for episode in range(1, episodes + 1):
    obs, info = test_env.reset()
    score = 0

    # Step with the deterministic action until the episode ends, either by
    # termination or by truncation.
    while True:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(action)
        score += reward
        if terminated or truncated:
            break

    print(f"Episode {episode} - Score: {score}")

In [None]:
# Release both environments (the evaluation env first, then the worker pool).
for env in (test_env, train_env):
    env.close()