In [1]:
import pandas as pd
from typing import List, Optional

In [2]:
# Read the employee skills table from the CSV that sits next to the notebook.
emp_df = pd.read_csv(filepath_or_buffer='employee_skills_dataset.csv')
# Show the first five rows (the head() default) as this cell's output.
emp_df.head(n=5)

Unnamed: 0,ID,Name,Role,Seniority,Status,Primary_Stack,Skills,Years_Experience,Last_Project_Domain,Last_Project_Stack,Availability_Score,Bio
0,101,Alice Johnson,Full Stack Developer,Senior,Active,Python/React,"Python (5), Django (4), React (4), AWS (3), Po...",7,FinTech,"Python, Django, React, AWS",90,Alice has led the development of a high-freque...
1,102,Bob Smith,Data Scientist,Staff,Active,Python/Pandas,"Python (4), Pandas (4), Scikit-learn (3), SQL ...",4,Healthcare,"Python, Flask, MongoDB",85,Bob recently built predictive models for patie...
2,103,Charlie Brown,Front-End Developer,Junior,Active,React/TypeScript,"React (3), TypeScript (2), HTML/CSS (4), Redux...",2,E-commerce,"React, Node.js",75,Charlie contributed significantly to optimizin...
3,104,Diana Prince,Backend Developer,Senior,Active,Java/Spring,"Java (5), Spring Boot (4), Kafka (3), Microser...",10,Enterprise Banking,"Java, Spring, Oracle",95,"As an expert in Enterprise Banking, Diana arch..."
4,105,Ethan Hunt,DevOps Engineer,Expert,Active,AWS/Kubernetes,"Kubernetes (5), Terraform (5), AWS (5), CI/CD ...",8,Cloud Infrastructure,"Kubernetes, Terraform, AWS",100,"Ethan specializes in building fully automated,..."


In [6]:
# Print column names, non-null counts and dtypes to sanity-check the load.
emp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   15 non-null     int64 
 1   Name                 15 non-null     object
 2   Role                 15 non-null     object
 3   Seniority            15 non-null     object
 4   Status               15 non-null     object
 5   Primary_Stack        15 non-null     object
 6   Skills               15 non-null     object
 7   Years_Experience     15 non-null     int64 
 8   Last_Project_Domain  15 non-null     object
 9   Last_Project_Stack   15 non-null     object
 10  Availability_Score   15 non-null     int64 
 11  Bio                  15 non-null     object
dtypes: int64(3), object(9)
memory usage: 1.5+ KB


In [5]:
# Force the Skills column to str so the `.str` accessor used by the skill
# filter always applies.
# NOTE(review): astype(str) turns a missing value into the literal text 'nan';
# that is harmless only while the column has no nulls — confirm for new data.
emp_df['Skills'] = emp_df['Skills'].astype(str)

In [51]:
# query employee data tool
def query_employee_data(
    required_skill: str,
    min_seniority: Optional[str] = None,
    min_years_experience: Optional[int] = 0,
    required_domain: Optional[str] = None,
    status_filter: Optional[List[str]] = None,
) -> str:

    """
    Queries the employee database (the module-level `emp_df` DataFrame) to find
    candidates matching the specified criteria. This is the worker behind the
    agent-facing tool.

    Args:
        required_skill: Skill the employee must list in `Skills`. Matched as a
            case-insensitive *literal* substring, so names such as 'C++' or
            '.NET' are safe. A falsy value disables the skill filter.
        min_seniority: Minimum seniority level ('Junior', 'Mid'/'Staff',
            'Senior', 'Expert'), compared case-insensitively. An unrecognised
            level is ignored rather than silently emptying the result.
        min_years_experience: Minimum years of experience. None is treated as 0.
        required_domain: Literal, case-insensitive substring that must appear in
            `Last_Project_Domain` (e.g., 'FinTech', 'Healthcare').
        status_filter: Allowed statuses (e.g., ['Active', 'Bench']). A single
            string is accepted as a one-element list. Defaults to ['Active'].

    Returns:
        A JSON string (orient='records') with the relevant columns of the first
        10 matching employees in DataFrame order, or a plain-text message when
        nothing matches.
    """

    # 1. Status filter: default to active employees; tolerate a bare string,
    #    which Series.isin would otherwise reject.
    if status_filter is None:
        status_filter = ['Active']
    elif isinstance(status_filter, str):
        status_filter = [status_filter]

    filtered_df = emp_df[emp_df['Status'].isin(status_filter)].copy()

    # 2. Mandatory skill filter. regex=False keeps characters like '+', '.',
    #    '(' from being interpreted as a regular expression.
    if required_skill:
        skill_mask = filtered_df['Skills'].str.contains(
            required_skill, case=False, na=False, regex=False
        )
        filtered_df = filtered_df[skill_mask]

    # 3. Minimum years of experience. The parameter is Optional, so None must
    #    not reach the comparison.
    min_years = 0 if min_years_experience is None else min_years_experience
    filtered_df = filtered_df[filtered_df['Years_Experience'] >= min_years]

    # 4. Domain filter (literal substring, same reasoning as the skill filter).
    if required_domain:
        domain_mask = filtered_df['Last_Project_Domain'].str.contains(
            required_domain, case=False, na=False, regex=False
        )
        filtered_df = filtered_df[domain_mask]

    # 5. Seniority filter. Ranking: Expert > Senior > Mid/Staff > Junior.
    #    'Mid' is accepted as an alias for the 'Staff' slot because it is the
    #    example level advertised to callers — presumably equivalent; confirm.
    if min_seniority:
        seniority_rank = {'junior': 1, 'mid': 2, 'staff': 2, 'senior': 3, 'expert': 4}
        min_rank = seniority_rank.get(str(min_seniority).strip().lower())

        # Only filter when the requested floor is a known level.
        if min_rank is not None:
            employee_rank = (
                filtered_df['Seniority'].astype(str).str.strip().str.lower().map(seniority_rank)
            )
            # Employees with an unmapped level compare as NaN and are excluded.
            filtered_df = filtered_df[employee_rank >= min_rank]

    # 6. Output columns. The two stack columns are included because the
    #    objective scorer reads 'Primary_Stack' and 'Last_Project_Stack'.
    result_cols = [
        'ID', 'Name', 'Role', 'Seniority', 'Primary_Stack', 'Skills',
        'Years_Experience', 'Last_Project_Domain', 'Last_Project_Stack',
        'Availability_Score', 'Bio',
    ]

    if filtered_df.empty:
        return "No employees found matching all mandatory criteria."

    # Cap at 10 rows and serialise for the agent.
    return filtered_df[result_cols].head(10).to_json(orient='records')

In [52]:
# Registering the Tool to Langchain
from langchain.tools import tool

# @tool decorator makes the function callable by the LLM Agent
@tool
def employee_data_tool(
    required_skill: str,
    min_seniority: Optional[str] = None,
    min_years_experience: Optional[int] = 0,
    required_domain: Optional[str] = None,
    status_filter: Optional[List[str]] = None,
) -> str:
    """
    TOOL: Queries the employee database (DataFrame) for candidates.
    Use this tool FIRST to find a pool of employees based on mandatory filters
    like skill, experience, seniority, and domain. The required_skill parameter is mandatory.
    Returns a JSON string of up to 10 matching employees.
    """
    # Thin adapter: forward every argument unchanged to the worker and hand its
    # JSON (or "no match" message) straight back to the agent.
    candidates_json = query_employee_data(
        required_skill=required_skill,
        min_seniority=min_seniority,
        min_years_experience=min_years_experience,
        required_domain=required_domain,
        status_filter=status_filter,
    )
    return candidates_json

# The 'employee_data_tool' variable is now the component you pass to your LangChain Agent.

## Scoring Mechanism:
Weighted Scoring

In [10]:
import json
from typing import Dict, Any, List, Tuple

# Helper function
def parse_skills(skills_str: str) -> Dict[str, int]:
    """Converts the 'Python (5), React (4)' string into a dictionary.

    Args:
        skills_str: Comma-separated 'Name (level)' entries.

    Returns:
        Mapping of lower-cased, stripped skill name to its integer level.
        Entries that are malformed (no parentheses, non-integer level, empty
        name) are skipped individually instead of discarding the whole result.
        Non-string input yields an empty dict.
    """
    skills_dict: Dict[str, int] = {}
    if not isinstance(skills_str, str):
        return skills_dict

    for item in skills_str.split(','):
        # Split on the LAST '(' so names that themselves contain parentheses
        # (e.g. 'C (ANSI) (4)') still parse.
        name, paren, score_part = item.rpartition('(')
        if not paren or ')' not in item:
            continue
        try:
            score = int(score_part.replace(')', '').strip())
        except ValueError:
            # Level is not an integer; ignore just this entry.
            continue
        key = name.strip().lower()
        if key:
            skills_dict[key] = score
    return skills_dict

def calculate_fit_score(
    candidate_json: str,
    required_skills: List[Tuple[str, int]], # E.g., [('python', 4), ('react', 3)]
    min_years: int,
    target_domain: str,
) -> Dict[str, Any]:
    """
    Calculates the Objective Fit Score (0-100) for a single employee candidate
    based on quantifiable, structured requirements.

    Components: skill match (40), years of experience (20), domain (15),
    availability (15), stack match (10).

    Args:
        candidate_json: One candidate record as a JSON object string. An
            already-decoded dict is accepted as well.
        required_skills: (skill name, minimum proficiency) pairs; names are
            compared case-insensitively. None or empty means no skill
            requirements (skill and stack components score 0).
        min_years: Accepted for interface compatibility. It does not affect the
            score: experience is scaled against a fixed 10-year ceiling.
        target_domain: Domain to look for (case-insensitive substring) in the
            candidate's last project domain. Empty/None awards no domain points.

    Returns:
        A dict with 'ID', 'Name', 'Objective_Fit_Score' and a per-component
        'Breakdown', or {"error": ...} when the candidate cannot be decoded
        into a JSON object.
    """
    if isinstance(candidate_json, dict):
        candidate = candidate_json
    else:
        try:
            candidate = json.loads(candidate_json)
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers non-str input.
            return {"error": "Invalid candidate JSON format."}
    if not isinstance(candidate, dict):
        return {"error": "Invalid candidate JSON format."}

    required_skills = list(required_skills or [])

    total_score = 0
    score_breakdown = {}

    # ------------------
    # A. Skill Match (Max 40 points)
    # ------------------
    skill_score = 0
    # `or ''` guards against an explicit JSON null in the record.
    candidate_skills = parse_skills(candidate.get('Skills') or '')

    if required_skills:
        # Each required skill carries an equal share of the 40 points.
        skill_weight = 40 / len(required_skills)
        for skill, min_prof in required_skills:
            candidate_prof = candidate_skills.get(str(skill).strip().lower(), 0)

            if candidate_prof >= min_prof:
                # Proficiency is on a 1-5 scale; clamp so bad data cannot
                # push the component above its 40-point cap.
                skill_score += (min(candidate_prof, 5) / 5) * skill_weight

    total_score += skill_score
    score_breakdown['Skill_Match'] = round(skill_score, 2)

    # ------------------
    # B. Years of Experience (Max 20 points)
    # ------------------
    years = candidate.get('Years_Experience') or 0
    # Scales linearly, reaching the maximum at 10 years; never negative.
    exp_score = max(0, min(20, (years / 10) * 20))
    total_score += exp_score
    score_breakdown['Experience'] = round(exp_score, 2)

    # ------------------
    # C. Domain Experience (Max 15 points)
    # ------------------
    domain_score = 0
    last_domain = (candidate.get('Last_Project_Domain') or '').lower()
    wanted_domain = (target_domain or '').strip().lower()
    # An empty string is a substring of everything, so require a real value.
    if wanted_domain and wanted_domain in last_domain:
        domain_score = 15

    total_score += domain_score
    score_breakdown['Domain_Experience'] = domain_score

    # ------------------
    # D. Availability (Max 15 points)
    # ------------------
    availability = candidate.get('Availability_Score') or 0
    avail_score = max(0, min(15, (availability / 100) * 15))
    total_score += avail_score
    score_breakdown['Availability'] = round(avail_score, 2)

    # ------------------
    # E. Stack Match (Max 10 points)
    # ------------------
    stack_score = 0
    primary_stack = (candidate.get('Primary_Stack') or '').lower()
    last_stack = (candidate.get('Last_Project_Stack') or '').lower()
    # Keywords must be lower-cased too: the stack text is lower-cased above,
    # so a keyword like 'Python' could otherwise never match.
    required_stack_keywords = [str(s[0]).strip().lower() for s in required_skills]

    if any(
        keyword and (keyword in primary_stack or keyword in last_stack)
        for keyword in required_stack_keywords
    ):
        stack_score = 10

    total_score += stack_score
    score_breakdown['Stack_Match'] = stack_score

    # ------------------
    # Final Result
    # ------------------
    return {
        "ID": candidate.get('ID'),
        "Name": candidate.get('Name'),
        # This is the objective, deterministic score
        "Objective_Fit_Score": round(total_score, 2),
        "Breakdown": score_breakdown
    }

In [12]:
# Register it to Langchain

from langchain.tools import tool

@tool
def objective_fit_scoring_tool(
    candidate_json: str,
    required_skills_list: List[Tuple[str, int]],
    min_years_experience: int,
    target_project_domain: str,
) -> str:
    """
    TOOL: Calculates the OBJECTIVE Fit Score (0-100) for a single candidate.
    This score is based on quantifiable data: skill proficiency, years of experience, and availability.
    The LLM Agent must call this for all filtered candidates before final ranking.
    Returns a JSON string with the final Objective_Fit_Score.
    """
    # Delegate to the worker, mapping the tool's argument names onto the
    # worker's, then serialise the resulting dict for the agent.
    return json.dumps(
        calculate_fit_score(
            candidate_json=candidate_json,
            required_skills=required_skills_list,
            min_years=min_years_experience,
            target_domain=target_project_domain,
        )
    )

# Note: We renamed the tool to clearly indicate it calculates the 'Objective' score.

## Semantic Scoring Tool using Cosine Similarity

In [13]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from typing import List, Dict, Any, Tuple
import json

# --- 1. Load Data (Using the same structure as before) ---
employee_df = pd.read_csv('employee_skills_dataset.csv')

# --- 2. Initialize Vector Store and Embedder ---

# Sentence-transformer model used for both the bios and, later, the queries
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize ChromaDB (in-memory for this example, or on disk for persistence)
client = chromadb.Client()
# Create (or reuse, when the cell is re-run in the same kernel) the collection
# that stores the embeddings and metadata
bio_collection = client.get_or_create_collection(
    name="employee_bios",
    embedding_function=None # We will handle embedding externally
)

# --- 3. Embed and Load Data into ChromaDB ---

# Missing bios become empty strings so the encoder always receives text
documents = employee_df['Bio'].fillna('').astype(str).tolist()
# Chroma ids must be strings; the employee ID doubles as the document id
ids = [str(i) for i in employee_df['ID'].tolist()]
# Convert all other columns to metadata for retrieval
metadatas = employee_df.drop(columns=['Bio']).to_dict('records')

# Generate embeddings
embeddings = embedding_model.encode(documents).tolist()

# Load into ChromaDB. `upsert` (instead of `add`) keeps this cell idempotent:
# because the collection is obtained with get_or_create_collection, re-running
# the cell would otherwise try to add ids that are already present.
bio_collection.upsert(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

  from .autonotebook import tqdm as notebook_tqdm


### The semantic match tool

In [16]:
def calculate_semantic_score(
    project_description: str,
    candidate_id: str,
) -> float:
    """
    Calculates the semantic match score (Cosine Similarity) between the
    Project Manager's detailed request and a single employee's bio.

    The candidate's stored bio embedding is fetched from `bio_collection` by id
    and compared with an embedding of `project_description` produced by
    `embedding_model`.

    Args:
        project_description: The detailed description of the type of person needed.
        candidate_id: The ID of the employee to check (used as the string
            document id in the collection).

    Returns:
        A plain float between 0.0 and 1.0 representing the semantic match.
        0.0 is returned when the description is empty or the candidate has no
        stored embedding.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    if not project_description:
        return 0.0

    # 1. Fetch the candidate's stored embedding. Embeddings are only returned
    #    when explicitly requested via `include`; without it the 'embeddings'
    #    field is empty and every candidate would score 0.0.
    candidate_result = bio_collection.get(
        ids=[str(candidate_id)],
        include=["embeddings"],
    )
    stored_embeddings = candidate_result['embeddings'] if candidate_result else None
    # Use explicit None/length checks: the value may be an array, whose
    # truthiness is ambiguous.
    if stored_embeddings is None or len(stored_embeddings) == 0:
        return 0.0

    candidate_embedding = stored_embeddings[0]

    # 2. Embed the project description with the same model used for the bios
    query_embedding = embedding_model.encode(project_description).tolist()

    # 3. Cosine similarity (returns a 1x1 array, take the single value)
    similarity = cosine_similarity([query_embedding], [candidate_embedding])[0][0]

    # Cosine similarity lies in [-1, 1]; map it linearly onto [0, 1], clamp
    # away floating-point overshoot, and return a built-in float so the value
    # serialises cleanly with json.dumps.
    return float(min(1.0, max(0.0, (similarity + 1) / 2)))

def semantic_match_tool_worker(
    project_description: str,
    candidate_id: int,
) -> float:
    """Adapter for the LLM tool: stringifies the employee ID and delegates scoring."""
    candidate_key = str(candidate_id)
    return calculate_semantic_score(project_description, candidate_key)

In [28]:
# Registering the tool into Langchain

from langchain.tools import tool

@tool
def semantic_match_tool(
    project_description: str,
    candidate_id: int,
) -> str:
    """
    TOOL: Calculates the SEMANTIC Match Score (0.0 - 1.0) for a single candidate.
    This score measures how well the candidate's Bio text semantically aligns with the
    project's required description using Cosine Similarity on vector embeddings.

    Args:
        project_description: The detailed, descriptive project needs from the manager.
        candidate_id: The ID of the employee (integer) to check.

    Returns:
        A JSON string containing the 'Semantic_Score' for the candidate.
    """

    # Compute the raw similarity through the worker
    semantic_score = semantic_match_tool_worker(project_description, candidate_id)

    # Package the ID with the score (4 decimal places) as JSON for the Agent
    payload = {
        "ID": candidate_id,
        "Semantic_Score": round(semantic_score, 4)
    }
    return json.dumps(payload)

## The Agent Orchestration

In [54]:
# System instructions for the agent.
# - The filter tool must be the FIRST action, not the only one: the process
#   below requires the two scoring tools afterwards.
# - Unspecified filters fall back to the tool defaults so the agent does not
#   stall asking the user for them.
# - The filter tool returns at most 10 candidates.
# - Keep this text free of curly braces so it is safe inside a prompt template.
SYSTEM_PROMPT = """
You are the **TalentMatch AI Agent**, an expert Project Member Allocator.
YOUR SOLE PURPOSE IS TO FIND AND RECOMMEND THE TOP 5 CANDIDATES.
You MUST use the provided tools to answer the request.
When the user asks for a candidate based on a skill (e.g., 'Selenium', 'Python', 'React'), your first action MUST be to call the 'employee_data_tool'.
If the user does not specify years of experience, seniority, or domain, call the tool with its defaults instead of asking for them.
DO NOT apologize or state you cannot help. Use the tools to complete the task.

**Process:**
1.  **Parse Request:** Analyze the user's input and extract structured requirements (required_skill, min_years, target_domain) and the detailed, descriptive part (for semantic search).
2.  **Filter (Tool 1):** Use the `employee_data_tool` once to filter the large database down to a pool of up to 10 viable candidates based on mandatory criteria.
3.  **Score (Tool 2 & 3):** Iterate through the filtered candidates, using two tools for a comprehensive score:
    * Call `objective_fit_scoring_tool` to get the quantifiable score (0-100).
    * Call `semantic_match_tool` to get the narrative match score (0.0-1.0).
4.  **Rank & Synthesize:** Combine the two scores (70% Objective, 30% Semantic), rank the employees, and generate a concise, persuasive natural language summary for the top 5.

**Tools available to you:**
- `employee_data_tool`: For initial filtering.
- `objective_fit_scoring_tool`: For deterministic, weighted skill/experience scoring (returns 0-100).
- `semantic_match_tool`: For semantic alignment scoring (returns 0.0-1.0).

Always use the tools first before generating the final response.
"""

In [21]:
# Langgraph class

from typing import TypedDict, List, Tuple, Optional, Dict, Any

# Define the data structure for a single candidate that moves through the pipeline
class ScoredCandidate(TypedDict):
    """One candidate after both scoring tools have run and the scores are blended."""

    ID: int                  # Employee ID from the dataset
    Name: str                # Employee display name
    Bio: str                 # Free-text bio, kept for the final summary prompt
    Objective_Score: float   # Objective fit score on a 0-100 scale
    Semantic_Score: float    # Semantic match score on a 0.0-1.0 scale
    Final_Score: float       # Weighted blend: 70% objective + 30% semantic (scaled to 100)

# Define the overall graph state
class AgentState(TypedDict):
    """The state of the agent's workflow.

    A single dict passed between the workflow nodes; each node returns only
    the keys it updates.
    """

    # 1. Input Data: the manager's raw request text
    user_request: str

    # 2. Parsed Criteria (Agent's interpretation of user_request)
    required_skill: Optional[str]      # Skill used for the mandatory filter
    min_years: int                     # Minimum years of experience
    target_domain: Optional[str]       # Domain used by the objective scorer
    project_description: Optional[str] # Detailed text for semantic tool

    # 3. Intermediate Data
    # Raw string returned by employee_data_tool: a JSON list of records, or a
    # plain-text message when no employee matched.
    filtered_candidates: str

    # 4. Final Results
    scored_candidates: List[ScoredCandidate]
    final_recommendations: str # The natural language summary

    # Tool invocation log: (tool name, stringified input) pairs
    tool_calls: List[Tuple[str, str]]

### 1. The LLM analyzes the user input and calls the first tool.

In [22]:
# (Simplified Node definition for clarity - LangGraph requires defining an App with Nodes/Edges)

def parse_and_filter(state: AgentState):
    """Workflow node: run the employee filter tool with the parsed criteria.

    Reads 'required_skill' and 'min_years' from the state and returns the
    state keys it updates: 'filtered_candidates' (the tool's raw string
    output) and 'tool_calls' (a one-entry log of the invocation).
    """
    # 1. LLM Call to Parse
    # Agent analyzes state['user_request'] and extracts:
    #   required_skill, min_years, target_domain, project_description

    # 2. LLM decides to call the first tool
    tool_input = {
        # The tool declares required_skill as str, so a missing/None value is
        # passed as "" (which the worker treats as "no skill filter").
        "required_skill": state.get('required_skill') or "",
        "min_years_experience": state.get('min_years') or 0,
        # ... other filter parameters
    }

    # 3. Execute the employee_data_tool with the parsed input.
    # A LangChain tool is not a plain function: it is run with `.invoke`
    # and a single dict of arguments, not with keyword arguments.
    filtered_json = employee_data_tool.invoke(tool_input)

    # Update state
    return {
        "filtered_candidates": filtered_json,
        "tool_calls": [("employee_data_tool", str(tool_input))]
    }

### Iterative Scoring
This is where the agent iterates over the filtered candidates (at most 10) and calls the two scoring tools for each one.

In [23]:
def score_candidates(state: AgentState):
    """Workflow node: score every filtered candidate and blend the two scores.

    For each record in state['filtered_candidates'] the objective and semantic
    tools are invoked, and the results are combined as 70% objective (0-100)
    plus 30% semantic (0.0-1.0 scaled to 100).

    Returns:
        {"scored_candidates": [...]} with one ScoredCandidate-shaped dict per
        record; an empty list when the filter step produced no JSON records.
    """
    # The filter tool returns either a JSON list of records or a plain-text
    # "no employees found" message; treat anything that is not a list as empty.
    try:
        candidates = json.loads(state['filtered_candidates'])
    except (TypeError, ValueError):
        candidates = []
    if not isinstance(candidates, list):
        candidates = []

    # Both scoring tools declare these as str, so None must not be passed.
    target_domain = state.get('target_domain') or ""
    project_description = state.get('project_description') or state.get('user_request') or ""
    min_years = state.get('min_years') or 0

    scored_list = []

    for candidate_dict in candidates:
        # json.loads above already decoded each record to a dict; tolerate
        # string elements too, then re-serialise for the scoring tool.
        if isinstance(candidate_dict, str):
            candidate_dict = json.loads(candidate_dict)
        candidate_json = json.dumps(candidate_dict)

        # --- Call Objective Scoring Tool ---
        obj_input = {
            "candidate_json": candidate_json,
            # TODO: hard-coded example; should be parsed from user_request
            "required_skills_list": [('python', 4), ('react', 3)],
            "min_years_experience": min_years,
            "target_project_domain": target_domain,
        }
        # LangChain tools are run with `.invoke(dict)`, not keyword arguments.
        obj_result = objective_fit_scoring_tool.invoke(obj_input)
        obj_score_data = json.loads(obj_result)

        # --- Call Semantic Scoring Tool ---
        sem_input = {
            "project_description": project_description,
            "candidate_id": candidate_dict['ID'],
        }
        sem_result = semantic_match_tool.invoke(sem_input)
        sem_score_data = json.loads(sem_result)

        # --- Combine Scores (70% Objective, 30% Semantic) ---
        obj_score = obj_score_data.get('Objective_Fit_Score', 0)
        sem_score_unit = sem_score_data.get('Semantic_Score', 0)  # 0.0-1.0
        sem_score = sem_score_unit * 100  # Scale semantic score to 100

        final_score = (obj_score * 0.70) + (sem_score * 0.30)

        # Build the final candidate object
        scored_list.append({
            "ID": candidate_dict['ID'],
            "Name": candidate_dict['Name'],
            "Bio": candidate_dict['Bio'],
            "Objective_Score": obj_score,
            "Semantic_Score": sem_score_unit, # Store 0.0-1.0
            "Final_Score": round(final_score, 2)
        })

    # Update state
    return {"scored_candidates": scored_list}

### Final Summary Generation

In [24]:
def generate_summary(state: AgentState):
    """Workflow node: pick the five best-scoring candidates and build the report.

    The synthesis prompt is assembled but not yet sent to a model; the returned
    report is a fixed placeholder string.
    """
    def by_final_score(candidate):
        return candidate['Final_Score']

    # 1. Rank a copy of the candidates (highest Final_Score first), keep five
    ordered = list(state['scored_candidates'])
    ordered.sort(key=by_final_score, reverse=True)
    final_ranking = ordered[:5]

    # 2. Prompt that asks the LLM to act as the final reporter
    synthesis_prompt = f"""
    The final ranking of candidates is: {json.dumps(final_ranking)}.
    The original project description was: "{state['user_request']}".

    Generate a professional recommendation report:
    1. List the Top 5 candidates by Final_Score.
    2. For each candidate, provide a summary (2-3 sentences) based on their Bio and score breakdown.
    3. Clearly state their Final_Score and briefly mention why they are a strong fit (e.g., "High Objective score due to Python(5) proficiency, plus a high Semantic score from extensive FinTech experience").
    """

    # Placeholder for the text the LLM (Mistral 7B) would generate from the prompt
    final_report = "..."

    return {"final_recommendations": final_report}

## Implementing the LLM

In [44]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file (recommended)
# Pull settings from a local .env file into the process environment (recommended)
load_dotenv()

# Model identifier requested through OpenRouter
MODEL_NAME = "mistralai/mistral-7b-instruct"

# API key for OpenRouter (or Mistral/OpenAI if using a different setup);
# OPENROUTER_API_KEY must be defined in the .env file, otherwise this is None
api_key = os.environ.get('OPENROUTER_API_KEY')

In [29]:
# Assuming the tool definitions from previous steps are accessible:
# employee_data_tool
# objective_fit_scoring_tool
# semantic_match_tool

# List of all tools the agent can use
# Filter tool first, then the objective and semantic scoring tools
tools = [employee_data_tool, objective_fit_scoring_tool, semantic_match_tool]

print(f"Agent initialized with {len(tools)} tools.")

Agent initialized with 3 tools.


In [55]:
from langchain_openai import ChatOpenAI
from langchain import hub
from langchain.agents import AgentExecutor, create_tool_calling_agent

# --- Initialize the LLM ---
# Using the standard OpenAI SDK wrapper for OpenRouter compatibility.
# The 'base_url' directs calls to OpenRouter.
# Adjust to ChatMistralAI if you prefer the Mistral specific SDK.
llm = ChatOpenAI(
    model=MODEL_NAME,
    temperature= 0.7,
    api_key= api_key,
    base_url="https://openrouter.ai/api/v1",
    timeout= 60.0
)

# --- Define the Prompt ---
# Build the prompt locally so SYSTEM_PROMPT really becomes the system message.
# The hub template "hwchase17/openai-tools-agent" ships its own fixed system
# message and has no `system_prompt` variable, so `.partial(system_prompt=...)`
# on it silently left our instructions out of every request.
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

prompt = ChatPromptTemplate.from_messages([
    # Escape braces so the instruction text is never parsed as template variables
    ("system", SYSTEM_PROMPT.replace("{", "{{").replace("}", "}}")),
    MessagesPlaceholder(variable_name="chat_history", optional=True),
    ("human", "{input}"),
    # Required by tool-calling agents: holds prior tool calls and their results
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])

# --- Create the Agent ---
agent = create_tool_calling_agent(llm, tools, prompt)

# --- Create the Executor ---
# The AgentExecutor handles the loop: LLM decides tool -> Tool runs -> LLM sees result -> repeat
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

print("AgentExecutor is ready. Time to test the end-to-end workflow!")



AgentExecutor is ready. Time to test the end-to-end workflow!


In [56]:
# The User's Request (The full text the manager would provide)
test_request = """
I need a staff who is expert in Selenium
"""

print(f"\n--- Running Test for Request: '{test_request}' ---")

try:
    # Send the request through the full agent loop
    result = agent_executor.invoke({"input": test_request})

    # Show the final answer produced by the LLM
    print("\n\n--- FINAL AGENT RECOMMENDATION ---")
    print(result['output'])

except Exception as exc:
    # Best-effort demo cell: report the failure instead of stopping the notebook
    print("\n--- ERROR DURING EXECUTION ---")
    print(f"An error occurred: {exc}")
    print("Check your API key, tool definitions, and data setup (CSV/ChromaDB).")


--- Running Test for Request: '
I need a staff who is expert in Selenium
' ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mSure, I will follow the below process to find the best fit.
1. First, I will search the database for candidates with Selenium skills, considering seniority, experience, and domain if provided. If no candidates are found, I will suggest checking the filters or expanding the search criteria.
2. Next, I will calculate the objective fit score for each candidate based on their Selenium proficiency, experience, and availability.
3. Finally, I will calculate the semantic match score by comparing the candidate's bio with the project description to ensure a cultural and project fit. After evaluating all candidates, I will recommend the best matches based on the combined objective fit and semantic match scores.

Please provide the minimum years of experience, seniority level, and domain for the search.[0m

[1m> Finished chain.[0m


--- FINAL AGENT RECOMM