In [16]:
import os
import sys
import uuid

from dotenv import load_dotenv
from langsmith import Client

# Make the parent of the current working directory importable by placing it
# at the front of sys.path (only if it is not listed there already).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Import agent
from backend.agent.agent import graph as talent_hunter_agent

ResponseHandlingException: [Errno 61] Connection refused

In [None]:
# Name of the LangSmith dataset used throughout this notebook.
# Other ways of creating and managing datasets are described at:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
DATASET_NAME = "Talent Hunter Agent Queries"

# Load environment variables first, then build the LangSmith client.
load_dotenv()
client = Client()



In [11]:
# Look up datasets called DATASET_NAME; reuse the first match, or create the
# dataset when the lookup comes back empty.
datasets = list(client.list_datasets(dataset_name=DATASET_NAME))

if datasets:
    dataset = datasets[0]
else:
    dataset = client.create_dataset(
        dataset_name=DATASET_NAME,
        description="Contains queries to the Talent Hunter Agent, both from the job seeker and the company.",
    )

In [None]:
# 20 example queries: 10 written from a job seeker's point of view and
# 10 written from a company's point of view.
JOB_SEEKER_QUERIES = [
    "I’m looking for companies hiring junior backend developers.",
    "Can you recommend companies that match my resume?",
    "Here's my CV. Can you find me relevant job listings?",
    "I want to register myself as a job seeker.",
    "Hi, my name is Sarah. I'm a software engineer. Can you help?",
    "Do you have companies hiring for data analyst roles?",
    "Can you help me submit my resume?",
    "I’m looking for part-time work. Can you search for me?",
    "Can I upload my resume to get job matches?",
    "I’m a recent graduate in marketing. Any leads?",
]

COMPANY_QUERIES = [
    "We need to find candidates for a UI/UX designer role.",
    "Can you register our company?",
    "We’re hiring again. How do we get matched with applicants?",
    "This is NextGen Labs. We're looking for a project manager.",
    "Can I add our company to your platform?",
    "We’re searching for Python developers with 2 years of experience.",
    "Our company needs interns. Can we register here?",
    "Can you show me candidates who specialize in cybersecurity?",
    "I'd like to post a job for an HR assistant role.",
    "Do you have mobile developers in your database?",
]

# LangSmith examples carry their payload under "inputs" (a dict) and optional
# descriptive fields under "metadata". The query text is stored as
# inputs["input"], which is the key the target function reads; the query's
# origin is kept as metadata because it is not something the agent receives.
examples = [
    {"inputs": {"input": query}, "metadata": {"source": source}}
    for source, queries in (
        ("Job Seeker", JOB_SEEKER_QUERIES),
        ("Company", COMPANY_QUERIES),
    )
    for query in queries
]

# Insert the examples into the dataset.
# NOTE(review): re-running this cell presumably appends another copy of every
# example to the dataset -- confirm, and run it only once per dataset.
client.create_examples(dataset_name=DATASET_NAME, examples=examples)


{'example_ids': ['a0175335-0076-4f88-a1b7-c41ceaceade0',
  'ce8d8cb8-d5b8-4857-afcb-00bbccd05038',
  'e2d0d175-5d41-416b-97f3-3f48edbe8d93',
  '3905b640-f139-45fe-b69a-478c6723e63b',
  '02098fe1-0f1a-4d16-a7ec-4aa8a04437dc',
  '1f62fce5-7955-4344-adbe-180d0cae814a',
  'c910c225-c33c-40c5-a8d6-3f6a351a421a',
  '6a6f6431-8254-48a0-bf8d-7074cc21d881',
  '461c3568-de40-44f8-b77a-71ae65f853b4',
  '20c6b01d-46f4-4c0f-817d-cec02572c9e9',
  '1c4a14aa-91c2-4147-8e79-ddea8e388df7',
  '14336766-4247-464e-b2fd-c4501ac00865',
  '1d68a39e-adff-442b-b230-4e1a471ca406',
  '18394c66-db6c-46ed-800c-5aaf91658afe',
  'c41c9054-ffeb-416e-9296-708518ea583b',
  '763ef424-5ed5-492c-84ea-386c60444cc3',
  'ead1607b-8454-42b7-b199-b0fdc18c4eb0',
  '46f05967-1616-435d-b0e8-ed9a3cbc6b20',
  '927b3ad2-ba7d-49be-83a3-ce844b046143',
  'e9dd033c-d5a4-48d9-b864-7c96447cd22c'],
 'count': 20}

In [None]:
def invoke_agent(inputs: dict) -> dict:
    """Run the Talent Hunter agent on one example and return its final reply.

    Args:
        inputs: Example inputs; the user query is read from ``inputs["input"]``.

    Returns:
        A dict ``{"output": text}`` where ``text`` is the ``content`` of the
        last message in the agent's response.

    Raises:
        KeyError: If ``inputs`` has no ``"input"`` key.
    """
    config = {
        "configurable": {
            # Use a fresh thread id for every call. A constant (or None) id
            # would make all examples share one thread, so if the graph
            # persists state per thread, earlier queries would leak into
            # later ones.
            "thread_id": str(uuid.uuid4()),
            "user_id": "user_1",
        },
        # Upper bound on graph steps for a single invocation.
        "recursion_limit": 25,
    }

    # Send the query to the agent as a single human message.
    response = talent_hunter_agent.invoke(
        {"messages": [("human", inputs["input"])]},
        config,
    )

    # Only the text of the final message is reported as the output.
    return {"output": response["messages"][-1].content}

In [None]:
from openevals.llm import create_llm_as_judge
from langchain.chat_models import init_chat_model

# OpenAI gpt-4o chat model at temperature 0, built with verbose=True.
# NOTE(review): the evaluator functions visible in this notebook do not
# reference this object -- confirm whether it is still needed.
eval_llm = init_chat_model(
    "gpt-4o",
    temperature=0,
    verbose=True,
    model_provider="openai",
)

def intent_identification_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Judge whether the agent identified the user's intent correctly.

    Args:
        inputs: Inputs of the evaluated example.
        outputs: Outputs produced by the agent for that example.
        reference_outputs: Unused; kept so the evaluator signature stays
            unchanged for the caller.

    Returns:
        The result of the LLM judge, reported under the feedback key
        ``intent_identification_accuracy``.
    """
    evaluator = create_llm_as_judge(
        # The {inputs}/{outputs} placeholders are filled in by the judge so
        # that the model actually sees the conversation it is grading.
        prompt=(
            "Evaluate the correctness of the agent's ability to identify the user's intent "
            "(insert candidate, insert company, search candidates, search companies).\n\n"
            "<input>\n{inputs}\n</input>\n\n"
            "<output>\n{outputs}\n</output>"
        ),
        # "gpt-4o" (letter o); the previous "gpt-40" is not a valid model id.
        model="openai:gpt-4o",
        feedback_key="intent_identification_accuracy",
    )
    return evaluator(inputs=inputs, outputs=outputs)

def proactiveness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Judge how proactively the agent asks for missing information.

    Args:
        inputs: Inputs of the evaluated example.
        outputs: Outputs produced by the agent for that example.
        reference_outputs: Unused; kept so the evaluator signature stays
            unchanged for the caller.

    Returns:
        The result of the LLM judge, reported under the feedback key
        ``proactiveness``.
    """
    evaluator = create_llm_as_judge(
        # The {inputs}/{outputs} placeholders are filled in by the judge so
        # that the model actually sees the conversation it is grading.
        prompt=(
            "Evaluate how proactive the agent is in asking for missing information.\n\n"
            "<input>\n{inputs}\n</input>\n\n"
            "<output>\n{outputs}\n</output>"
        ),
        # "gpt-4o" (letter o); the previous "gpt-40" is not a valid model id.
        model="openai:gpt-4o",
        feedback_key="proactiveness",
    )
    return evaluator(inputs=inputs, outputs=outputs)

def clarity_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Judge the clarity of the agent's response.

    Args:
        inputs: Inputs of the evaluated example.
        outputs: Outputs produced by the agent for that example.
        reference_outputs: Unused; kept so the evaluator signature stays
            unchanged for the caller.

    Returns:
        The result of the LLM judge, reported under the feedback key
        ``clarity``.
    """
    evaluator = create_llm_as_judge(
        # The {inputs}/{outputs} placeholders are filled in by the judge so
        # that the model actually sees the conversation it is grading.
        prompt=(
            "Evaluate the clarity of the agent's response.\n\n"
            "<input>\n{inputs}\n</input>\n\n"
            "<output>\n{outputs}\n</output>"
        ),
        # "gpt-4o" (letter o); the previous "gpt-40" is not a valid model id.
        model="openai:gpt-4o",
        feedback_key="clarity",
    )
    return evaluator(inputs=inputs, outputs=outputs)

In [None]:
# Evaluate invoke_agent against the DATASET_NAME dataset with the three
# LLM-judge evaluators, running at most two evaluations concurrently.
experiment_results = client.evaluate(
    invoke_agent,
    data=DATASET_NAME,
    evaluators=[intent_identification_evaluator, proactiveness_evaluator, clarity_evaluator],
    max_concurrency=2,
    experiment_prefix="talent-hunter-agent-evaluation",
)