In [2]:
import os
import sys

# Put the parent of the current working directory on the import path,
# presumably so the project packages imported below can be found when the
# notebook is run from its own sub-directory (confirm against the repo layout).
# The path is normalised and added at most once, so re-running this cell does
# not pile up duplicate sys.path entries.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from schemas.evaluation import EvaluationRequest
from agents.evaluation_agent import run_evaluation_agent
from models.feature import FeatureSpec
from models.model import ModelEnum

import pandas as pd

In [3]:
# Feature specs passed to the evaluation request: one entry per column, each
# declared with origin "raw" and transformer "none".
features = [
    FeatureSpec(name=column, dtype=kind, origin="raw", transformer="none")
    for column, kind in (
        ("Age", "numeric"),
        ("Fare", "numeric"),
        ("Sex", "categorical"),
    )
]

# Plain-dict description of the model (name plus its two hyperparameters) that
# is later handed to the evaluation agent as `model_info`.
model_info = dict(model="RandomForest", max_depth=5, n_estimators=100)

In [4]:
# Dataset/task description attached to the request.
request_metadata = {
    "dataset_name": "Titanic",
    "problem_type": "classification",
    "target_column": "Survived",
}

# NOTE: these values repeat the ones stored in `model_info`; keep the two in
# sync when tuning.
request_hyperparameters = {"max_depth": 5, "n_estimators": 100}

# Bundle metadata, selected features, model choice and hyperparameters into
# the request object consumed by the evaluation agent.
req = EvaluationRequest(
    metadata=request_metadata,
    selected_features=features,
    model_name=ModelEnum.RANDOMFOREST,
    hyperparameters=request_hyperparameters,
)

In [5]:
# Scores of the run being evaluated.
current_metrics = dict(f1_score=0.72, accuracy=0.81)

# Scores of two earlier runs, used by the agent as a baseline
# (assumed to be listed oldest first — TODO confirm).
history = [
    {"f1_score": f1, "accuracy": acc}
    for f1, acc in ((0.68, 0.78), (0.70, 0.80))
]

# Free-text goal forwarded to the agent.
# NOTE(review): the goal asks to maximize recall, but neither `current_metrics`
# nor `history` contains a recall value — confirm whether recall should be
# supplied as well.
optimization_goal = "maximize recall, avoid overfitting"

In [6]:
# Collect everything the evaluation agent needs in one mapping, then invoke it.
# All values are passed by keyword, exactly as named here.
agent_kwargs = {
    "request": req,
    "current_metrics": current_metrics,
    "history": history,
    "model_info": model_info,
    "optimization_goal": optimization_goal,
}
decision = run_evaluation_agent(**agent_kwargs)

[32m2025-05-30 20:27:55.143[0m | [1mINFO    [0m | [36magents.evaluation_agent[0m:[36mrun_evaluation_agent[0m:[36m110[0m - [1mRunning evaluation agent for dataset 'Titanic'[0m
[32m2025-05-30 20:27:55.146[0m | [1mINFO    [0m | [36magents.evaluation_agent[0m:[36mrun_evaluation_agent[0m:[36m121[0m - [1mPrompt length: 442 characters[0m
[32m2025-05-30 20:27:57.315[0m | [1mINFO    [0m | [36magents.evaluation_agent[0m:[36mrun_evaluation_agent[0m:[36m125[0m - [1mLLM decision: continue[0m


In [7]:
# Report the three fields of the agent's decision, one "Label: value" line each.
for label, attribute in (
    ("Recommendation", "recommendation"),
    ("Reasoning", "reasoning"),
    ("Confidence", "confidence"),
):
    print(f"{label}:", getattr(decision, attribute))

Recommendation: continue
Reasoning: The current model has shown improvement in both f1_score and accuracy compared to the previous iterations. The metrics are stable and indicate a reasonable performance without signs of overfitting. It is recommended to continue training the model with the current configuration to further improve performance.
Confidence: 0.8
