## 1. Setup and Imports

In [None]:
import sys
import os
from pathlib import Path
from datetime import datetime

# Put the parent of the working directory on sys.path so that the `src`
# package imported by later cells can be resolved.
project_root = Path.cwd().parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print(f"Project root: {project_root}")

In [None]:
import numpy as np
import torch
import imageio
from IPython.display import Video, display, HTML
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

from src.envs.pick_place import PickPlaceEnv

# Report the installed PyTorch build and which device kind is usable.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {'cuda' if cuda_ready else 'cpu'}")

## 2. Test Environment (Sanity Check)

In [None]:
# Smoke test: build a non-randomized environment, reset it, take one random
# step and render one frame.
env = PickPlaceEnv(
    render_mode="rgb_array",
    max_episode_steps=400,
    place_target=(0.35, 0.10),
    randomize_cube=False,
    randomize_target=False,
)

# try/finally guarantees the environment is closed even when one of the checks
# below raises, so a failing cell does not leave it open.
try:
    obs, info = env.reset()
    print(f"Observation shape: {obs.shape}")
    print(f"Action space: {env.action_space}")
    print(f"Cube position: {info['cube_pos']}")
    print(f"Target position: {info['place_target']}")

    # Take a random action
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    print(f"\nRandom action reward: {reward:.3f}")

    # Render a frame; other cells treat a None frame as possible, so guard here too.
    frame = env.render()
    if frame is None:
        print("Frame shape: None (render() returned no frame)")
    else:
        print(f"Frame shape: {frame.shape}")
finally:
    env.close()

## 3. Training Configuration

In [None]:
# Hyperparameters are grouped by concern and then merged into the single flat
# CONFIG mapping that the rest of the notebook reads.
_TRAINING_SETTINGS = {
    "total_timesteps": 1_000_000,  # lower for a quick test; raise to 2M for full training
    "eval_freq": 10_000,
    "save_freq": 50_000,
    "seed": 42,
}

_SAC_SETTINGS = {
    "learning_rate": 3e-4,
    "buffer_size": 100_000,
    "learning_starts": 1_000,
    "batch_size": 256,
    "tau": 0.005,
    "gamma": 0.99,
    "train_freq": 1,
    "gradient_steps": 1,
}

_ENV_SETTINGS = {
    "max_episode_steps": 400,
    "action_scale": 0.02,
    "lift_height": 0.08,
    "reward_version": "v21",  # v21 has improved grasp incentives (RECOMMENDED)
    "curriculum_stage": 3,
    "place_target": (0.35, 0.10),
}

# Key order matches the grouping above: training, SAC, environment.
CONFIG = {**_TRAINING_SETTINGS, **_SAC_SETTINGS, **_ENV_SETTINGS}

# Each run writes into its own timestamped folder under
# <project_root>/runs/pick_place_notebook.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
output_dir = project_root.joinpath("runs", "pick_place_notebook", timestamp)
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {output_dir}")

## 4. Create Training Environment

In [None]:
def make_env():
    """Create one `PickPlaceEnv` configured from `CONFIG`, wrapped in `Monitor`.

    Rendering is disabled and cube/target randomization is enabled.

    The `Monitor` wrapper records per-episode reward and length. Without it,
    Stable-Baselines3 has no episode statistics to log for an environment
    that is passed in already vectorized, and `EvalCallback` warns that the
    evaluation environment is not monitored.

    Returns:
        The monitored environment instance.
    """
    # Imported locally so this cell stays self-contained.
    from stable_baselines3.common.monitor import Monitor

    env = PickPlaceEnv(
        render_mode=None,
        max_episode_steps=CONFIG["max_episode_steps"],
        action_scale=CONFIG["action_scale"],
        lift_height=CONFIG["lift_height"],
        reward_version=CONFIG["reward_version"],
        curriculum_stage=CONFIG["curriculum_stage"],
        place_target=CONFIG["place_target"],
        randomize_cube=True,
        randomize_target=True,
    )
    return Monitor(env)

# Training environment: a single-env vector wrapper with both observations and
# rewards normalized.
train_env = VecNormalize(
    DummyVecEnv([make_env]),
    norm_obs=True,
    norm_reward=True,
)

# Evaluation environment: observations are normalized, rewards are left raw,
# and training=False so the wrapper is not in training mode.
eval_env = VecNormalize(
    DummyVecEnv([make_env]),
    norm_obs=True,
    norm_reward=False,
    training=False,
)

print("Environments created!")

## 5. Create SAC Model

In [None]:
# Prefer the GPU when PyTorch can see one.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Hyperparameters forwarded unchanged from CONFIG to the SAC constructor.
sac_kwargs = {
    name: CONFIG[name]
    for name in (
        "learning_rate",
        "buffer_size",
        "learning_starts",
        "batch_size",
        "tau",
        "gamma",
        "train_freq",
        "gradient_steps",
        "seed",
    )
}

model = SAC(
    "MlpPolicy",
    train_env,
    verbose=1,
    device=device,
    tensorboard_log=str(output_dir / "tensorboard"),
    **sac_kwargs,
)

print(f"Model created on device: {device}")
print(f"Policy: {model.policy}")

## 6. Setup Callbacks

In [None]:
# Checkpoint callback - save the model periodically under <output_dir>/checkpoints.
# save_vecnormalize=True also stores the VecNormalize statistics next to each
# checkpoint: the training environment normalizes observations, so a checkpoint
# cannot be evaluated or resumed faithfully without the matching statistics.
checkpoint_callback = CheckpointCallback(
    save_freq=CONFIG["save_freq"],
    save_path=str(output_dir / "checkpoints"),
    name_prefix="sac_pick_place",
    save_vecnormalize=True,
)

# Evaluation callback - evaluate every `eval_freq` steps with deterministic
# actions, log results under <output_dir>/eval_logs and keep the best-scoring
# model under <output_dir>/best_model.
eval_callback = EvalCallback(
    eval_env,
    best_model_save_path=str(output_dir / "best_model"),
    log_path=str(output_dir / "eval_logs"),
    eval_freq=CONFIG["eval_freq"],
    deterministic=True,
    render=False,
)

print("Callbacks configured!")

## 7. Train the Agent

⚠️ **Note**: Training will take some time depending on `total_timesteps`:
- 100K steps: ~5-10 minutes
- 500K steps: ~30-60 minutes  
- 2M steps: ~2-4 hours (GPU) / ~8-12 hours (CPU)

In [None]:
# Announce the run length and artifact locations before the long-running call.
total_steps = CONFIG["total_timesteps"]
print(f"Starting training for {total_steps:,} timesteps...")
print(f"Checkpoints will be saved to: {output_dir / 'checkpoints'}")
print(f"Best model will be saved to: {output_dir / 'best_model'}")
print()

training_callbacks = [checkpoint_callback, eval_callback]
model.learn(
    total_timesteps=total_steps,
    callback=training_callbacks,
    progress_bar=True,
)

# Persist the final policy together with the normalization statistics of the
# training environment; later cells look for vec_normalize.pkl in output_dir.
final_model_stem = output_dir / "final_model"
normalization_file = output_dir / "vec_normalize.pkl"
model.save(final_model_stem)
train_env.save(normalization_file)

print("\n✅ Training complete!")
print(f"Final model saved to: {output_dir / 'final_model.zip'}")

## 8. Load Best Model for Evaluation

In [None]:
# Pick the policy to evaluate: the best checkpoint written by the evaluation
# callback when it exists, otherwise the in-memory model from training.
best_model_path = output_dir.joinpath("best_model", "best_model.zip")
if not best_model_path.exists():
    eval_model = model
    print("Using final model for evaluation")
else:
    eval_model = SAC.load(best_model_path, device=device)
    print(f"Loaded best model from: {best_model_path}")

# Location of the saved normalization statistics; the evaluation helpers read
# this path and load the file themselves when it is present.
# NOTE(review): these statistics are saved at the end of training, so they may
# not match the moment the best model was saved - confirm this is acceptable.
vec_normalize_path = output_dir / "vec_normalize.pkl"
stats_available = vec_normalize_path.exists()
if stats_available:
    print(f"Normalization stats available at: {vec_normalize_path}")

## 9. Evaluate and Record Video

In [None]:
def evaluate_and_record(model, num_episodes=3):
    """Roll out `model` deterministically and collect rendered frames.

    The environment is built with the same `CONFIG` settings that `make_env`
    uses for training (action scale, lift height, reward version, curriculum
    stage, target), but with rendering enabled and cube/target randomization
    disabled. If the file at `vec_normalize_path` exists, its normalization
    statistics are loaded with training and reward normalization turned off.

    Args:
        model: Policy exposing `predict(obs, deterministic=True)`.
        num_episodes: Number of episodes to run.

    Returns:
        A tuple `(all_frames, results)`: the frames of all episodes
        concatenated in order, and one dict per episode with the keys
        `episode`, `reward`, `steps` and `success`.
    """
    # Mirror the training configuration so the policy is evaluated on the
    # task variant it was trained on.
    env = PickPlaceEnv(
        render_mode="rgb_array",
        max_episode_steps=CONFIG["max_episode_steps"],
        action_scale=CONFIG["action_scale"],
        lift_height=CONFIG["lift_height"],
        reward_version=CONFIG["reward_version"],
        curriculum_stage=CONFIG["curriculum_stage"],
        place_target=CONFIG["place_target"],
        randomize_cube=False,
        randomize_target=False,
    )

    all_frames = []
    results = []

    # try/finally so the environment is closed even if a rollout raises.
    try:
        vec_env = DummyVecEnv([lambda: env])
        if vec_normalize_path.exists():
            vec_env = VecNormalize.load(vec_normalize_path, vec_env)
            vec_env.training = False
            vec_env.norm_reward = False

        for ep in range(num_episodes):
            obs = vec_env.reset()
            total_reward = 0.0
            step = 0
            done = False

            # Capture the initial state of the episode.
            frame = env.render()
            if frame is not None:
                all_frames.append(frame)

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, dones, info = vec_env.step(action)
                # The vectorized env holds exactly one env; unpack its values.
                done = bool(dones[0])
                total_reward += float(reward[0])
                step += 1

                # A vectorized env resets automatically when an episode ends,
                # so rendering after the final step would show the start of
                # the next episode. Skip that frame.
                if not done:
                    frame = env.render()
                    if frame is not None:
                        all_frames.append(frame)

            # `info` comes from the final step of the episode.
            success = info[0].get("is_success", False)
            results.append({"episode": ep + 1, "reward": total_reward, "steps": step, "success": success})
            print(f"Episode {ep + 1}: Reward={total_reward:.2f}, Steps={step}, Success={success}")
    finally:
        env.close()

    return all_frames, results

# Roll out the selected policy for three episodes; the frames and per-episode
# results are consumed by the following cells.
print("Recording evaluation episodes...")
frames, results = evaluate_and_record(model=eval_model, num_episodes=3)

In [None]:
# Print a per-episode table followed by aggregate statistics.
divider = "=" * 50
print("\n" + divider)
print("EVALUATION SUMMARY")
print(divider)
for r in results:
    status = "✅" if r["success"] else "❌"
    print(f"Episode {r['episode']}: {status} Reward={r['reward']:.2f}, Steps={r['steps']}")

if results:
    success_rate = sum(1 for r in results if r["success"]) / len(results) * 100
    avg_reward = np.mean([r["reward"] for r in results])
    print(f"\nSuccess Rate: {success_rate:.1f}%")
    print(f"Average Reward: {avg_reward:.2f}")
else:
    # No episodes were evaluated: dividing by len(results) would raise
    # ZeroDivisionError and the mean of an empty list is NaN.
    success_rate = avg_reward = float("nan")
    print("\nNo evaluation episodes to summarize.")

## 10. Save and Display Video

In [None]:
# Write the concatenated evaluation frames to an MP4 at 30 fps, unless no
# frames were collected.
video_path = output_dir / "evaluation_video.mp4"
if not frames:
    print("No frames recorded!")
else:
    imageio.mimsave(str(video_path), frames, fps=30)
    print(f"Video saved to: {video_path}")
    print(f"Total frames: {len(frames)}")

In [None]:
# Embed the evaluation video in the notebook output when the file exists.
if not video_path.exists():
    print("Video file not found!")
else:
    display(Video(str(video_path), embed=True, width=640))

## 11. Test with Custom Target (Optional)

In [None]:
def test_custom_target(model, target_x=0.30, target_y=0.15):
    """Run one deterministic episode with a custom place target and save a video.

    Apart from the target, the environment uses the same `CONFIG` settings
    that `make_env` uses for training (episode length, action scale, lift
    height, reward version, curriculum stage), with rendering enabled and
    randomization disabled. If the file at `vec_normalize_path` exists, its
    normalization statistics are loaded with training and reward
    normalization turned off.

    Args:
        model: Policy exposing `predict(obs, deterministic=True)`.
        target_x: X coordinate of the place target.
        target_y: Y coordinate of the place target.

    Returns:
        Path of the video file inside `output_dir`. The file is only written
        when at least one frame was rendered.
    """
    env = PickPlaceEnv(
        render_mode="rgb_array",
        max_episode_steps=CONFIG["max_episode_steps"],
        action_scale=CONFIG["action_scale"],
        lift_height=CONFIG["lift_height"],
        reward_version=CONFIG["reward_version"],
        curriculum_stage=CONFIG["curriculum_stage"],
        place_target=(target_x, target_y),
        randomize_cube=False,
        randomize_target=False,
    )

    custom_video_path = output_dir / f"custom_target_{target_x}_{target_y}.mp4"
    frames = []

    # try/finally so the environment is closed even if the rollout raises.
    try:
        vec_env = DummyVecEnv([lambda: env])
        if vec_normalize_path.exists():
            vec_env = VecNormalize.load(vec_normalize_path, vec_env)
            vec_env.training = False
            vec_env.norm_reward = False

        obs = vec_env.reset()
        done = False

        print(f"Testing with target at ({target_x}, {target_y})...")

        # Capture the initial state of the episode.
        frame = env.render()
        if frame is not None:
            frames.append(frame)

        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, dones, info = vec_env.step(action)
            done = bool(dones[0])

            # A vectorized env resets automatically when an episode ends, so
            # rendering after the final step would show the start of a new
            # episode. Skip that frame.
            if not done:
                frame = env.render()
                if frame is not None:
                    frames.append(frame)

        success = info[0].get("is_success", False)
        print(f"Result: {'SUCCESS ✅' if success else 'FAILED ❌'}")
    finally:
        env.close()

    # Save video; skip writing when nothing was rendered.
    if frames:
        imageio.mimsave(str(custom_video_path), frames, fps=30)
        print(f"Video saved to: {custom_video_path}")
    else:
        print("No frames recorded!")

    return custom_video_path

# Try the evaluated policy on a place target other than the configured one.
custom_video = test_custom_target(model=eval_model, target_x=0.30, target_y=0.15)

In [None]:
# Embed the custom-target rollout in the notebook if the file was written.
custom_clip_available = custom_video.exists()
if custom_clip_available:
    display(Video(str(custom_video), embed=True, width=640))

## 12. Cleanup

In [None]:
# Release both vectorized environments, then list every file the run produced.
for vec in (train_env, eval_env):
    vec.close()

banner = "=" * 50
print("\n" + banner)
print("TRAINING COMPLETE!")
print(banner)
print(f"\nAll outputs saved to: {output_dir}")
print("\nFiles:")
artifacts = (entry for entry in output_dir.rglob("*") if entry.is_file())
for artifact in artifacts:
    print(f"  - {artifact.relative_to(output_dir)}")