# T4-OPT: Multi-Agent System

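This notebook demonstrates the T4-OPT multi-agent orchestration system: it initializes the agents, plans a training task, checks for a resumable checkpoint, and defines a helper that runs a full pipeline.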

## Agents:
- **PlannerAgent**: Breaks down tasks into steps
- **TrainingAgent**: Handles QLoRA training
- **OptimizeAgent**: Handles quantization
- **EvalAgent**: Runs evaluations
- **RecoveryAgent**: Handles failures and recovery


In [None]:
import sys
sys.path.append('/content/t4opt')

from agents.planner import PlannerAgent
from agents.trainer import TrainingAgent
from agents.optimizer import OptimizeAgent
from agents.evaluator import EvalAgent
from agents.recovery import RecoveryAgent
from utils.logger import Logger

# Create the single named logger for this notebook; the next cell hands it
# to every agent through set_logger().
logger = Logger(name="T4-OPT-Agents")


In [None]:
# Build the five agents, wiring the shared logger into each one as it is created
def _attach_logger(agent):
    """Give *agent* the shared notebook logger and hand the agent back."""
    agent.set_logger(logger)
    return agent


planner = _attach_logger(PlannerAgent())
trainer_agent = _attach_logger(TrainingAgent())
optimizer_agent = _attach_logger(OptimizeAgent())
eval_agent = _attach_logger(EvalAgent())
recovery_agent = _attach_logger(RecoveryAgent())

print("✅ All agents initialized")


In [None]:
# Example: have the planner turn a free-text request into steps, then print them
task = "Train a language model using QLoRA on the alpaca dataset"
plan_result = planner.execute(task=task)

# Alias the result payload once so the report lines below stay short.
plan_info = plan_result.result

print(f"Task Type: {plan_info['task_type']}")
print(f"Number of Steps: {plan_info['steps']}")
print(f"Estimated Time: {plan_info['estimated_time']['total_minutes']:.1f} minutes")
print("\nPlan:")
for step in plan_info['plan']:
    # One header line per step, followed by its indented description.
    print(f"  {step['step_id']}. {step['action']} ({step['agent']})")
    print(f"     {step['description']}")


In [None]:
# Example: settings for a short demo training run, passed to the training
# agent as its `context`
training_context = dict(
    model_name="microsoft/phi-2",
    dataset_name="alpaca",
    max_samples=500,
    output_dir="./checkpoints/agent-trained",
    num_epochs=1,  # a single epoch keeps the demo short
)

# The call below launches a real training run, so it is left disabled.
# Uncomment to execute it:
# training_result = trainer_agent.execute(
#     task="run_training",
#     context=training_context
# )
# print(f"Training Status: {training_result.status}")

print("Training agent ready (commented out to avoid long execution)")


In [None]:
# Example: run the recovery agent's "check_checkpoint" task against ./checkpoints
recovery_result = recovery_agent.execute(
    task="check_checkpoint",
    context={"checkpoint_dir": "./checkpoints"},
)

recovery_info = recovery_result.result
if not recovery_info['can_resume']:
    print("❌ No checkpoint found")
else:
    # Only read the checkpoint entry once the agent reports it can resume.
    checkpoint_name = recovery_info['latest_checkpoint']['name']
    print(f"✅ Checkpoint found: {checkpoint_name}")
    print("   Can resume training")


In [None]:
# Example: Full pipeline orchestration
def run_full_pipeline(task_description):
    """Plan a task, then run every planned step on the agent it names.

    Args:
        task_description: Free-text task handed to ``planner.execute``.

    Returns:
        ``{"error": "Planning failed", "plan": plan}`` when the plan's status
        is not ``"completed"``. Otherwise ``{"plan": plan, "results": [...]}``
        where ``results`` holds one agent result per dispatched step, in plan
        order.

    Notes:
        This is a simplified runner: step dependencies are not handled.
        Steps naming any agent other than TrainingAgent, OptimizeAgent or
        EvalAgent are skipped without a result entry. A step that does not
        complete does not stop the run; its output is just not merged into
        the context passed to later steps.
    """
    # Lookups are wrapped in lambdas so an agent global is only resolved
    # when a step actually needs it.
    agent_lookup = {
        "TrainingAgent": lambda: trainer_agent,
        "OptimizeAgent": lambda: optimizer_agent,
        "EvalAgent": lambda: eval_agent,
    }

    # 1. Plan
    plan = planner.execute(task=task_description)
    if plan.status != "completed":
        return {"error": "Planning failed", "plan": plan}

    # 2. Execute the steps in plan order, accumulating outputs as context
    step_results = []
    shared_context = {}

    for step in plan.result['plan']:
        agent_name, action = step['agent'], step['action']

        resolve_agent = agent_lookup.get(agent_name)
        if resolve_agent is None:
            # No executor for this agent name: skip the step.
            continue

        outcome = resolve_agent().execute(task=action, context=shared_context)
        step_results.append(outcome)

        # Only successful, non-empty outputs feed the following steps.
        if outcome.status == "completed" and outcome.result:
            shared_context.update(outcome.result)

    return {"plan": plan, "results": step_results}

# Example usage (commented to avoid execution)
# pipeline_result = run_full_pipeline("Train and evaluate a model")
# print("Pipeline execution complete")
