In [1]:
import os
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (python-dotenv) into the process environment
# so the API keys read in the next cell are available.
load_dotenv()

True

In [3]:
# Service credentials are taken from the environment; each is None when unset.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Shared Gemini chat model instance; later cells call `llm.invoke(...)` on it.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Smoke test: send a trivial prompt and display the text of the reply.
llm.invoke("Hello!").content

'Hello there! How can I help you today?'

In [6]:
from typing import TypedDict, Dict

# A single State object is passed through every node of the graph; each node
# reads what it needs from it and writes its results back into it.
class State(TypedDict, total=False):
    """
    Represents the state of our graph.

    Declared with ``total=False`` because the notebook builds the state
    incrementally (for example ``State(md_text="")``), so not every key is
    present at every stage.

    Attributes:
        md_text: contains HR Reimbursement Policy
        employee_invoice_data: contains employee invoice data, keyed by
            employee name; each value is that employee's processed invoice
            text (multiple invoices are concatenated into one string)
    """

    md_text: str
    employee_invoice_data: Dict[str, str]

### Extracting HR Reimbursement Policy

In [7]:
import pymupdf4llm
import fitz  # PyMuPDF
from PIL import Image
import io
import base64
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage

def extract_hr_policy_from_pdf(state: State, pdf_path: str) -> State:
    """
    Extract HR reimbursement policy from PDF.

    Direct text extraction (pymupdf4llm) is attempted first. When it yields no
    text, every page is rendered to a PNG image and transcribed by the
    module-level Gemini model (``llm``); the per-page results are joined with
    blank lines.

    Args:
        state: LangGraph state to update
        pdf_path: Path to the PDF file

    Returns:
        The same state object with ``md_text`` set to the extracted policy, or
        to an empty string if any step failed (the error is printed).
    """
    try:
        # Cheap path first: plain text extraction straight from the PDF.
        md_text = pymupdf4llm.to_markdown(pdf_path)

        # If no text was extracted, fall back to page images + the vision model.
        if not md_text or not md_text.strip():
            all_extracted_text = []
            pdf_document = fitz.open(pdf_path)
            try:
                for page in pdf_document:
                    # Render the page as PNG at 2x zoom for legibility.
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    img_data = pix.tobytes("png")

                    # The image is passed as a base64 data URL, as described in the
                    # langchain-google-genai documentation:
                    # https://python.langchain.com/docs/integrations/chat/google_generative_ai/
                    img_base64 = base64.b64encode(img_data).decode()

                    message = HumanMessage(
                        content=[
                            {
                                "type": "text",
                                "text": "Extract the HR Reimbursement policy from this image. Return the text in markdown format."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{img_base64}"
                                }
                            }
                        ]
                    )

                    response = llm.invoke([message])
                    all_extracted_text.append(response.content)
            finally:
                # Release the file handle even when rendering or the model call fails.
                pdf_document.close()

            # Combine all page texts
            md_text = "\n\n".join(all_extracted_text)

        state["md_text"] = md_text
    except Exception as e:
        # Best effort: report the problem and leave an empty policy in the state.
        print(f"Error processing PDF: {e}")
        state["md_text"] = ""

    return state

In [8]:
# Start from a state that only carries an empty policy text, then fill it from the policy PDF.
current_state = State(md_text="")

# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
extract_hr_policy_from_pdf(current_state, "C:/Users/Abhishek/Downloads/task-1-dataset/HR_Policy.pdf")

{'md_text': "# **Company Name: IAI Solution** **Policy Title: Employee Reimbursement Policy** **Version: 1.0**\n\n**1. Purpose**\n\nThe purpose of this policy is to outline the guidelines and procedures for the reimbursement of\nexpenses incurred by employees while performing work-related duties. This policy ensures\ntransparency and consistency in the reimbursement process.\n\n\n**2. Scope**\n\nThis policy applies to all employees of IAI Solution who incur expenses in the course of their\nwork duties.\n\n\n**3. Reimbursement Categories**\n\nThe following categories of expenses are eligible for reimbursement under this policy:\n\n\n   - **Food and Beverages**\n\n   - **Travel Expenses**\n\n   - **Accommodations**\n\n**4. General Guidelines**\n\n\n   - All reimbursements must be supported by original receipts and submitted within **30 days**\nof the expense incurred.\n\n   - Employees must complete the reimbursement request form and submit it along with the\nrequired documentation to th

### Extracting employee invoice data
Input: a ZIP file containing one or more employee invoice PDFs.
Employee Name: extracted from each invoice so that the invoice analysis can be linked to a specific employee.


In [9]:
def process_invoices(state, zip_path):
    """Unpack the ZIP, run every PDF through extraction, and collect the results per employee.

    Results are stored in ``state["employee_invoice_data"]`` keyed by employee
    name; several invoices for the same employee are joined with a ``---``
    separator. On any failure the message is printed and stored under the
    ``"Error"`` key. The (mutated) state is returned.
    """
    # Make sure the container exists before anything is written to it.
    state.setdefault("employee_invoice_data", {})

    try:
        with tempfile.TemporaryDirectory() as workdir:
            # Step 1: unpack the archive and collect every PDF inside it
            pdf_paths = extract_zip_and_find_pdfs(zip_path, workdir)
            print(f"Found {len(pdf_paths)} PDF files")

            # Step 2: extract each invoice; empty results are skipped
            for one_pdf in pdf_paths:
                parsed = extract_invoice_data(one_pdf, state)
                if not parsed:
                    continue

                who = get_employee_name(parsed)

                # Step 3: append to an existing entry or create a new one
                if who in state["employee_invoice_data"]:
                    state["employee_invoice_data"][who] += "\n\n---\n\n" + parsed
                else:
                    state["employee_invoice_data"][who] = parsed

            print(f"Processed {len(state['employee_invoice_data'])} employees")

    except Exception as e:
        print(f"Error: {e}")
        state["employee_invoice_data"]["Error"] = str(e)

    return state

In [10]:
def extract_zip_and_find_pdfs(zip_path, extract_to):
    """Unpack ``zip_path`` into ``extract_to`` and return the paths of all PDFs found.

    Archives found inside the extracted tree are unpacked into a sibling
    ``nested_<name>`` directory and searched recursively.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(extract_to)

    found = []
    for folder, _subdirs, filenames in os.walk(extract_to):
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            lowered = filename.lower()

            if lowered.endswith('.pdf'):
                found.append(full_path)
                continue

            if not lowered.endswith('.zip'):
                continue

            # Nested archive: unpack next to it and collect its PDFs too.
            target_dir = os.path.join(folder, f"nested_{filename[:-4]}")
            os.makedirs(target_dir, exist_ok=True)
            found += extract_zip_and_find_pdfs(full_path, target_dir)

    return found

In [11]:
def extract_invoice_data(pdf_path, state):
    """Return the processed invoice text for one PDF, or "" when processing fails.

    PDFs with extractable text are structured by the LLM; PDFs without any
    text are handed to the vision-based extractor instead.
    """
    try:
        raw_markdown = pymupdf4llm.to_markdown(pdf_path)
        has_text = bool(raw_markdown) and raw_markdown.strip() != ""

        if has_text:
            # Text layer present: let the LLM structure it.
            return process_with_llm(raw_markdown, state)

        # No text layer: fall back to the vision model.
        return extract_with_vision(pdf_path, state)

    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
        return ""

In [12]:
def extract_with_vision(pdf_path, state):
    """Use vision model to extract data from PDF images.

    Each page is rendered to a PNG (2x zoom), transcribed by Gemini, and the
    transcription is then run through the standard extraction prompt (which
    includes the HR policy from ``state``). The per-page results are joined
    with blank lines and returned as a single string.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

    # The extraction prompt only depends on the state, so build it once
    # instead of once per page.
    full_prompt = get_extraction_prompt(state)

    all_text = []
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img_data = pix.tobytes("png")
            img_base64 = base64.b64encode(img_data).decode()

            # First pass: plain transcription of the page image.
            message = HumanMessage(content=[
                {"type": "text", "text": "Extract all text and details from this invoice image:"},
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_base64}"}}
            ])

            response = llm.invoke([message])
            extracted_text = response.content

            # Second pass: structure the transcription and predict the status.
            final_message = HumanMessage(content=f"{full_prompt}\n\nExtracted text:\n\n{extracted_text}")
            final_response = llm.invoke([final_message])

            all_text.append(final_response.content)
    finally:
        # Close the document even when rendering or a model call raises.
        doc.close()

    return "\n\n".join(all_text)

In [13]:
def process_with_llm(text, state):
    """Send already-extracted invoice text through the LLM with the extraction prompt.

    Returns the content of the model's reply.
    """
    model = ChatGoogleGenerativeAI(model="gemini-2.5-flash")
    instructions = get_extraction_prompt(state)
    request = HumanMessage(content=f"{instructions}\n\nExtracted text:\n\n{text}")
    return model.invoke([request]).content

def get_extraction_prompt(State):
    """Build the invoice-extraction prompt, embedding the HR policy text.

    The policy is read from the ``md_text`` entry of the mapping passed in
    (empty string when the entry is missing).
    """
    policy_text = State.get("md_text", "")
    return f"""Extract invoice information and identify the EMPLOYEE NAME.

EMPLOYEE NAME RULES:
- For MEAL invoices: Look for "Customer Name"
- For TRAVEL invoices: Look for "Passenger Details" 
- For CAB invoices: Look for "Customer Name"
- If no customer/passenger name found: use "No information about employee"

REIMBURSEMENT STATUS ANALYSIS:
Based on the HR reimbursement policy below, analyze the invoice and determine status:

**HR REIMBURSEMENT POLICY:**
{policy_text}

**Reimbursement Status Categories:**
- **Fully Reimbursed:** The entire invoice amount is reimbursable according to the HR policy
- **Partially Reimbursed:** Only a portion of the invoice amount is reimbursable according to the HR policy
- **Declined:** The invoice is not reimbursable according to the HR policy

FORMAT:
**EMPLOYEE NAME:** [exact name or "No information about employee"]

**REIMBURSEMENT STATUS:** [**Fully Reimbursed** OR **Partially Reimbursed** OR **Declined**]

**INVOICE DETAILS:**
- Invoice Type: [Meal/Travel/Cab/Accomodation/Other]
- Invoice Number: [if available]
- Date: [date]
- Total Amount: [amount with currency]
- Description: [brief description]
- Reason: What is the reason for this reimbursement?

Return clean markdown format."""

In [14]:
def get_employee_name(invoice_text):
    """Extract employee name from processed invoice text.

    The ``**EMPLOYEE NAME:**`` line produced by the extraction prompt is used
    when it carries a real name. Otherwise a few label patterns
    ("Customer Name", "Passenger"/"Passenger Details", "Name") are tried.

    Args:
        invoice_text: Processed invoice text (markdown).

    Returns:
        The employee name, or "No information about employee" when no name can
        be found or the input cannot be parsed (the error is printed).
    """
    default_name = "No information about employee"
    try:
        for line in invoice_text.split('\n'):
            if '**EMPLOYEE NAME:**' in line:
                # Drop the label and any markdown emphasis around the value.
                name = line.split(':', 1)[1].replace('*', '').strip()

                if name and name != default_name:
                    return name

        # Fallback: search for customer patterns. The captured name may contain
        # spaces and tabs but not newlines, so it cannot run on into the next
        # line of the invoice.
        name_group = r'([A-Za-z][A-Za-z \t]*)'
        patterns = [
            r'Customer Name[:\s]+' + name_group,
            r'Passenger(?:\s+Details)?[:\s]+' + name_group,
            r'Name[:\s]+' + name_group,
        ]

        for pattern in patterns:
            matches = re.findall(pattern, invoice_text, re.IGNORECASE)
            if matches:
                name = matches[0].strip()
                if len(name) > 1:
                    return name

        return default_name

    except Exception as e:
        print(f"Error parsing employee name: {e}")
        return default_name

In [15]:
def get_employee_invoices(state, employee_name):
    """Get all invoices for a specific employee.

    An exact key match is preferred. Otherwise a case-insensitive substring
    match (in either direction) against the stored names is tried. A blank
    query never fuzzy-matches, because an empty string is a substring of
    every name and would return an arbitrary employee's invoices.

    Args:
        state: Mapping that may contain ``employee_invoice_data``.
        employee_name: Name to look up.

    Returns:
        The stored invoice text, "No invoice data available" when the state
        has no invoice data, or "No invoices found for <name>".
    """
    if "employee_invoice_data" not in state:
        return "No invoice data available"

    records = state["employee_invoice_data"]

    # Exact match
    if employee_name in records:
        return records[employee_name]

    # Fuzzy match (skipped for blank queries and blank stored names)
    query = (employee_name or "").strip().lower()
    if query:
        for emp_name, invoices in records.items():
            candidate = emp_name.strip().lower()
            if candidate and (query in candidate or candidate in query):
                return invoices

    return f"No invoices found for {employee_name}"

In [16]:
def get_summary(state):
    """Get summary of all employees and their invoices with category and description.

    For every entry in ``state["employee_invoice_data"]`` the summary holds the
    number of ``**INVOICE DETAILS:**`` sections, the normalised invoice
    category, the reimbursement status and an LLM-generated description.

    Returns:
        Dict keyed by employee name, or ``{}`` when the state has no invoice data.
    """
    if "employee_invoice_data" not in state:
        return {}

    summary = {}
    for employee_name, invoice_data in state["employee_invoice_data"].items():
        invoice_count = invoice_data.count("**INVOICE DETAILS:**")

        # The amount total that used to be computed here was never stored or
        # returned, and its float() conversion could raise on a stray "," match,
        # so it has been removed.
        category, description = get_invoice_category_and_description(invoice_data)
        status = get_reimbursement_status(invoice_data)

        summary[employee_name] = {
            'invoice_count': invoice_count,
            'invoice_mode': category,
            'Reimbursement_Status': status,
            'description': description
        }

    return summary

In [17]:
def get_invoice_category_and_description(invoice_data):
    """Extract invoice category and generate detailed description.

    The category is read from the ``Invoice Type`` field of the processed
    invoice text and normalised to one of ``meal``, ``travel``, ``cab``,
    ``accomodation`` (spelling kept because the description generator matches
    on it) or ``other``. The description is produced by the LLM.

    Returns:
        ``(category, description)``; ``("other", "Unable to generate description")``
        when anything fails (the error is printed).
    """
    try:
        # Allow markdown emphasis between the label and the value
        # (e.g. "**Invoice Type:** Meal").
        category_match = re.search(r'Invoice Type[:\s*]+([A-Za-z/]+)', invoice_data, re.IGNORECASE)
        category = category_match.group(1).lower() if category_match else "other"

        # Normalize category
        if 'meal' in category or 'food' in category:
            category = 'meal'
        elif 'travel' in category or 'ticket' in category or 'flight' in category or 'train' in category:
            category = 'travel'
        elif 'cab' in category or 'taxi' in category or 'uber' in category or 'ola' in category:
            category = 'cab'
        elif any(word in category for word in ('accom', 'hotel', 'house', 'pg', 'hostel')):
            # 'accom' covers both "Accomodation" (the spelling offered by the
            # extraction prompt) and "Accommodation".
            category = 'accomodation'
        else:
            category = 'other'

        # Generate description using LLM
        description = generate_description_with_llm(invoice_data, category)

        return category, description

    except Exception as e:
        print(f"Error extracting category: {e}")
        return "other", "Unable to generate description"

In [18]:
def generate_description_with_llm(invoice_data, category):
    """Use LLM to generate category-specific description.

    A prompt is selected by ``category`` ('travel', 'meal', 'cab',
    'accomodation', anything else → generic), the invoice text is appended,
    and the model's reply is returned with surrounding whitespace and one pair
    of enclosing double quotes removed.

    Returns:
        The description, or a placeholder string containing the error message
        when the model call fails (the error is also printed).
    """
    try:
        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

        # The example dates below use DD/MM/YYYY with a four-digit year so that
        # they agree with the format the same prompt asks for; the notebook's
        # date extraction only recognises four-digit years.
        if category == 'travel':
            prompt = """Based on the following invoice data, provide a SHORT travel description (max 2 lines):
            
Include: Mode of travel, total cost, from which location to where, Date (should stricly match DD/MM/YYYY format), reason of given reimbursement
Format: "Flight from Delhi to Mumbai, total cost ₹5,000, Date is 12/02/2022, reason of partially reimbursement is that for traveling cost as per HR Policy we can reimburse only ₹2000 as per 5.2 Travel Expenses " or "Train journey from Chennai to Bangalore, total cost ₹800, Date is 12/03/2023, since it is within limit as mentioned in HR Reimbursement Policy hence it is fully reimburse as per 5.2 Travel Expenses ."

Invoice data:
"""
        elif category == 'meal':
            prompt = """Based on the following invoice data, provide a SHORT meal description (max 2 lines):
            
Include: Cuisine/food name, total cost, restaurant name, Date (should stricly match DD/MM/YYYY format), reason of given reimbursement
Format: "North Indian cuisine at Punjabi Dhaba, total cost ₹450, Date is 04/02/2025, within HR Policy Budget as per 5.1 Food and Beverages." or "Pizza and beverages at Domino's, total cost ₹600, Date is 23/05/2024, it's not with HR Reimbursement policy as given budget by HR is ₹500 but your total cost is ₹600 hence it is partially reimburse as per 5.1 Food and Beverages ."

Include: If Cuisine/food include any wine/wodka/cigrate
Format: "Decline!!! as wine doesn't comes under reimbursement Policy as per 5.1 Food and Beverages."

Invoice data:
"""
        elif category == 'cab':
            prompt = """Based on the following invoice data, provide a SHORT cab description (max 2 lines):
            
Include: Total cost, pickup and drop location if available, Date (should stricly match DD/MM/YYYY format), reason of given reimbursement
Format: "Cab ride from Airport to Hotel, total cost ₹350, Date of travel is 23/02/2021, it's more than HR Reimbursement Policy as per 5.2 Travel Expenses hence partially reimburse" or "Uber ride within city, total cost ₹120, Date of travel is 03/01/2002, its within the limit as per 5.2 Travel Expenses hence fully reimburse."

Invoice data:
"""
        elif category == 'accomodation':
            # This prompt previously asked for a "cab description" and ended in a
            # stray empty triple-quoted string; both were copy/paste leftovers.
            prompt = """Based on the following invoice data, provide a SHORT accommodation description (max 2 lines):
            
Include: Total cost, hotel name if available, Date (should stricly match DD/MM/YYYY format), reason of given reimbursement
Format: "You stayed in hotel for 2 days, total cost ₹350, Date of travel is 23/02/2021, it's more than HR Reimbursement Policy as per 5.3 Accommodation hence partially reimburse" or "You stayed in PG, total cost ₹120, Date of travel is 03/01/2002, its within the limit as per 5.3 Accommodation hence fully reimburse."

Invoice data:
"""
        else:
            prompt = """Based on the following invoice data, provide a SHORT description (max 2 lines):
            
Include: Service type, total cost,Date (should stricly match DD/MM/YYYY format), brief details
Format: "Service description with cost"

Invoice data:
"""

        message = HumanMessage(content=prompt + invoice_data)
        response = llm.invoke([message])

        # Clean up the response
        description = response.content.strip()
        # Remove one pair of enclosing quotes; the length check keeps a lone '"' intact.
        if len(description) >= 2 and description.startswith('"') and description.endswith('"'):
            description = description[1:-1]

        return description

    except Exception as e:
        print(f"Error generating description: {e}")
        return f"Invoice total with basic details (Error: {str(e)})"

In [19]:
def get_reimbursement_status(invoice_text):
    """Extract reimbursement status from processed invoice text.

    The ``**REIMBURSEMENT STATUS:**`` line produced by the extraction prompt
    is used when present and non-empty. Otherwise the text is searched for a
    "Status" or "Reimbursement" label followed by a value on the same line.
    Markdown asterisks are removed from the value in both cases.

    Returns:
        The status text, or "**Pending Review**" when none is found or the
        input cannot be parsed (the error is printed).
    """
    default_status = "**Pending Review**"
    try:
        for line in invoice_text.split('\n'):
            if '**REIMBURSEMENT STATUS:**' in line:
                # Drop the label and any markdown emphasis around the value.
                status = line.split(':', 1)[1].replace('*', '').strip()

                if status:
                    return status

        # Fallback: search for status patterns. The captured value may contain
        # spaces and tabs but not newlines, so it stays on the label's line.
        patterns = [
            r'Status[:\s]+([A-Za-z* \t]+)',
            r'Reimbursement[:\s]+([A-Za-z* \t]+)',
        ]

        for pattern in patterns:
            for candidate in re.findall(pattern, invoice_text, re.IGNORECASE):
                status = candidate.replace('*', '').strip()
                if len(status) > 1:
                    return status

        return default_status  # Default status

    except Exception as e:
        print(f"Error parsing reimbursement status: {e}")
        return default_status

In [20]:
import zipfile
import os
import tempfile
import pymupdf4llm
import fitz
import base64
from langchain_core.messages import HumanMessage
import re

In [21]:
# Modified example_usage function to accept state as parameter
def example_usage(
    state,
    policy_pdf_path="C:/Users/Abhishek/Downloads/task-1-dataset/HR_Policy.pdf",
    zip_path="C:/Users/Abhishek/Downloads/dataset.zip",
):
    """Run the whole pipeline: policy extraction, invoice processing, summary.

    The state passed in is used and updated (it is no longer replaced by a
    fresh dict). Invoice text is appended to whatever
    ``employee_invoice_data`` already holds, so pass an empty state to start
    from scratch.

    Args:
        state: State mapping to fill; ``None`` is treated as an empty dict.
        policy_pdf_path: Path to the HR policy PDF.
        zip_path: Path to the ZIP archive with the invoice PDFs.

    Returns:
        The state, with the per-employee summary stored under
        ``"extract_invoice_data"``.
    """
    if state is None:
        state = {}

    # Step 1: Initialize missing keys only (don't create new state)
    state.setdefault("md_text", "")
    state.setdefault("employee_invoice_data", {})

    state = extract_hr_policy_from_pdf(state, policy_pdf_path)

    # Debug: Check if HR policy was extracted
    print("=== HR POLICY EXTRACTION DEBUG ===")
    print(f"HR Policy extracted: {len(state.get('md_text', ''))} characters")
    if state.get('md_text'):
        print(f"HR Policy preview: {state['md_text'][:200]}...")
    else:
        print("WARNING: No HR policy extracted!")
    print("="*50)

    # Step 2: Process ZIP file
    state = process_invoices(state, zip_path)

    # Step 3: Get results with category and description
    summary = get_summary(state)

    # Step 4: Store summary in state["extract_invoice_data"]
    state["extract_invoice_data"] = summary

    print("Summary with categories and descriptions:")
    for employee, details in summary.items():
        print(f"\n{employee}:")
        print(f"  Invoice Count: {details['invoice_count']}")
        print(f"  Invoice Mode: {details['invoice_mode']}")
        print(f"  Description: {details['description']}")
        print(f"  Reimbursement Status: {details['Reimbursement_Status']}")

    return state

# Alternative: If you want to check what's stored
def check_state_contents(state):
    """Print an overview of the state: its keys, the summary, and the raw employee names."""
    print("=== STATE CONTENTS ===")
    print(f"Available keys: {list(state.keys())}")

    # Summary produced by the pipeline, one entry per employee
    if "extract_invoice_data" in state:
        summary = state["extract_invoice_data"]
        print("\n=== extract_invoice_data ===")
        print(f"Type: {type(summary)}")
        print(f"Number of employees: {len(summary)}")

        fields = (
            ("Invoice Count", "invoice_count"),
            ("Invoice Mode", "invoice_mode"),
            ("Description", "description"),
            ("Reimbursement Status", "Reimbursement_Status"),
        )
        for who, info in summary.items():
            print(f"\n{who}:")
            for label, key in fields:
                print(f"  {label}: {info[key]}")

    # Raw per-employee invoice text: only the names are listed
    if "employee_invoice_data" in state:
        raw = state["employee_invoice_data"]
        print("\n=== employee_invoice_data (raw) ===")
        print(f"Number of employees: {len(raw)}")
        for who in raw.keys():
            print(f"- {who}")

# Main execution
if __name__ == "__main__":
    state = {}

    # Process invoices and get results
    state = example_usage(state)

    # Optional: Check what's stored
    print("\n" + "="*50)
    check_state_contents(state)

    # Optional: Access specific employee data
    print("\n" + "="*50)
    # Take the first employee if there is one; an empty summary (no invoices
    # processed) used to raise IndexError here.
    employee_name = next(iter(state.get("extract_invoice_data") or {}), None)
    if employee_name is not None:
        print(f"Sample employee data for '{employee_name}':")
        print(state["extract_invoice_data"][employee_name])
    else:
        print("No employee summaries available")

=== HR POLICY EXTRACTION DEBUG ===
HR Policy extracted: 2416 characters
HR Policy preview: # **Company Name: IAI Solution** **Policy Title: Employee Reimbursement Policy** **Version: 1.0**

**1. Purpose**

The purpose of this policy is to outline the guidelines and procedures for the reimbu...
Found 6 PDF files
Processed 6 employees
Summary with categories and descriptions:

Rekha:
  Invoice Count: 1
  Invoice Mode: cab
  Description: Daily office cab ride, total cost ₹167, Date of travel is 17/09/2024, it's more than HR Reimbursement Policy as per 5.2 Travel Expenses hence partially reimburse.
  Reimbursement Status: Partially Reimbursed

Seema:
  Invoice Count: 1
  Invoice Mode: cab
  Description: Daily office cab ride, total cost ₹141, Date of travel is 19/09/2024, it's within the company's allowance limit of ₹150 hence fully reimburse.
  Reimbursement Status: Fully Reimbursed

Hardik:
  Invoice Count: 1
  Invoice Mode: meal
  Description: Indian cuisine at Restaurant name not speci

### Let's store each employee's name and details in Pinecone

In [22]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from typing import Dict, List, Optional, TypedDict

In [23]:
# Name of the Pinecone index used for the employee records (created, queried and
# deleted under this name by the functions below).
INDEX_NAME = "employee-database"

We need the invoice date in the metadata, so the helper below extracts it from the generated description.

In [35]:
def extract_date_from_description(description: str) -> Optional[str]:
    """Return the first D/M/YYYY date in ``description`` as zero-padded DD/MM/YYYY.

    Returns None for an empty description or when no such date is present.
    """
    if not description:
        return None

    # One or two digits for day and month, exactly four for the year.
    found = re.search(r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b', description)
    if found is None:
        return None

    dd, mm, yyyy = found.group(1, 2, 3)
    return "/".join((dd.zfill(2), mm.zfill(2), yyyy))

Our main logic here is to push each employee, together with all of their relevant details, as a single chunk, with the employee name and date as metadata; later on we will perform hybrid search.

In [50]:
def process_employees_to_pinecone(employee_invoice_data: Dict[str, Dict[str, str]]):
    """
    Rebuild the Pinecone index from the per-employee summary.

    Any existing index named ``INDEX_NAME`` is deleted and a fresh serverless
    index (384 dimensions, cosine) is created. One document per employee is
    then embedded and stored, with the employee name, document type, text and
    (when one can be extracted from the description) the date as metadata.

    Args:
        employee_invoice_data: Summary keyed by employee name; each value is
            read with ``.get`` for ``invoice_count``, ``invoice_mode``,
            ``Reimbursement_Status`` and ``description``.

    Returns:
        The list of documents that were built (and, if non-empty, uploaded).
    """
    # Imported unconditionally: when this import lived inside the
    # "index already exists" branch, a first run (no index yet) failed with
    # UnboundLocalError at the later time.sleep() calls.
    import time

    # Initialize embeddings and Pinecone
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Start from a clean index: drop the old one if it exists
    existing_indexes = [index.name for index in pc.list_indexes()]

    if INDEX_NAME in existing_indexes:
        print(f"🗑️  Deleting existing index: {INDEX_NAME}")
        pc.delete_index(INDEX_NAME)

        # Wait for deletion to complete (important!)
        print("⏳ Waiting for index deletion to complete...")
        while INDEX_NAME in [index.name for index in pc.list_indexes()]:
            time.sleep(1)
        print("✅ Index deletion completed")

    print(f"🆕 Creating fresh index: {INDEX_NAME}")
    pc.create_index(
        name=INDEX_NAME,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

    # Wait for index to be ready. The index description exposes a readiness
    # flag; the stats object that was tested before is always truthy, so that
    # loop never actually waited.
    print("⏳ Waiting for index to be ready...")
    while not pc.describe_index(INDEX_NAME).status["ready"]:
        time.sleep(1)
    print("✅ Index is ready")

    # Initialize vector store
    vector_store = PineconeVectorStore(
        index_name=INDEX_NAME,
        embedding=embeddings
    )

    # Build one document per employee
    all_chunks = []

    for employee_name, employee_data in employee_invoice_data.items():
        # Create text content
        text = f"""
        Employee Name: {employee_name}
        Invoice Count: {employee_data.get('invoice_count', 0)}
        Invoice Mode: {employee_data.get('invoice_mode', 'N/A')}
        Reimbursement Status: {employee_data.get('Reimbursement_Status', 'N/A')}
        Description: {employee_data.get('description', 'N/A')}
        """
        content = text.strip()

        metadata = {
            "employee_name": employee_name,
            "document_type": "employee_record",
            "text": content
        }

        # Pinecone does not accept null metadata values, so the date key is
        # only added when a date was actually found.
        extracted_date = extract_date_from_description(employee_data.get('description', ''))
        if extracted_date is not None:
            metadata["date"] = extracted_date

        all_chunks.append(Document(page_content=content, metadata=metadata))

    # Add all documents to Pinecone (nothing to upload for an empty summary)
    if all_chunks:
        vector_store.add_documents(all_chunks)
        time.sleep(1)
    print(f"✅ Successfully added {len(all_chunks)} employee records to Pinecone")
    return all_chunks

In [None]:
def search_by_employee_name(employee_name: str, top_k: int = 10):
    """
    Fetch the stored records of one employee via a metadata filter.

    Args:
        employee_name: Value the ``employee_name`` metadata field must equal
        top_k: Maximum number of results to return

    Returns:
        Whatever ``similarity_search_with_score`` returns for the filtered
        query (consumed elsewhere in this notebook as ``(document, score)`` pairs)
    """
    # Same embedding model as the one used when the records were indexed.
    embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embedder)

    # The query text is generic on purpose: the metadata filter does the selecting.
    return store.similarity_search_with_score(
        query="employee record",
        k=top_k,
        filter={"employee_name": employee_name},
    )

def print_search_results(results):
    """
    Pretty-print ``(document, score)`` search results, numbered from 1.

    Prints a "no results" notice when ``results`` is empty.
    """
    if not results:
        print("❌ No results found")
        return

    print(f"✅ Found {len(results)} results:")
    print("=" * 80)

    for position, (record, similarity) in enumerate(results, start=1):
        meta = record.metadata
        print(f"\n📄 RESULT {position}:")
        print(f"   Employee Name: {meta.get('employee_name', 'N/A')}")
        print(f"   Date: {meta.get('date', 'N/A')}")
        print(f"   Document Type: {meta.get('document_type', 'N/A')}")
        print(f"   Similarity Score: {similarity:.4f}")
        print("   Content:")
        print(f"   {record.page_content}")
        print("-" * 60)

def verify_pinecone_data():
    """
    Print the index statistics, then run a filtered search for a few sample names.

    Any exception is caught and reported on stdout.
    """
    try:
        index = Pinecone(api_key=PINECONE_API_KEY).Index(INDEX_NAME)

        # Index-level numbers first
        stats = index.describe_index_stats()
        print("📊 Index Stats:")
        print(f"   Total Vectors: {stats.total_vector_count}")
        print(f"   Dimension: {stats.dimension}")
        print(f"   Index Fullness: {stats.index_fullness}")

        # Then a lookup for each sample employee
        for employee in ("Sarah", "John", "Rekha", "Mike"):
            print(f"\n🔍 Searching for: {employee}")
            print_search_results(search_by_employee_name(employee))

    except Exception as e:
        print(f"❌ Error: {e}")

def search_all_employees():
    """
    Run an unfiltered search (up to 20 hits), print the hits and the distinct employee names.

    Any exception is caught and reported on stdout.
    """
    try:
        embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        store = PineconeVectorStore(index_name=INDEX_NAME, embedding=embedder)

        # No metadata filter here, so hits can come from any employee.
        hits = store.similarity_search_with_score(query="employee", k=20)

        print("🔍 ALL DOCUMENTS IN INDEX:")
        print_search_results(hits)

        # Distinct, non-empty employee names among the hits
        names_seen = [record.metadata.get('employee_name') for record, _score in hits]
        employee_names = {name for name in names_seen if name}

        print(f"\n👥 Unique Employee Names Found: {list(employee_names)}")

    except Exception as e:
        print(f"❌ Error searching all employees: {e}")

🚀 Starting Pinecone Data Verification...

STEP 1: Index Stats & Employee Search
📊 Index Stats:
   Total Vectors: 3
   Dimension: 384
   Index Fullness: 0.0

🔍 Searching for: Sarah
✅ Found 1 results:

📄 RESULT 1:
   Employee Name: Sarah
   Date: 20/09/2024
   Document Type: employee_record
   Similarity Score: 0.2105
   Content:
   Employee Name: Sarah
        Invoice Count: 1
        Invoice Mode: travel
        Reimbursement Status: Pending
        Description: Flight booking for business trip, total cost ₹8500, Date of travel is 20/09/2024, awaiting approval.
------------------------------------------------------------

🔍 Searching for: John
✅ Found 1 results:

📄 RESULT 1:
   Employee Name: John
   Date: 18/09/2024
   Document Type: employee_record
   Similarity Score: 0.2822
   Content:
   Employee Name: John
        Invoice Count: 2
        Invoice Mode: food
        Reimbursement Status: Fully Reimbursed
        Description: Team lunch expense, total cost ₹450, Date of travel is 1

In [54]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Initialize the Gemini chat model used by answer_query_for_employee below
# (this rebinds the module-level `llm` created earlier in the notebook).
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

def answer_query_for_employee(employee_name: str, query: str, top_k: int = 5):
    """
    Answer a question about one employee from that employee's Pinecone records.

    Args:
        employee_name: Employee whose records are retrieved
        query: The user's question (e.g. reimbursement status)
        top_k: How many records to retrieve at most

    Returns:
        The model's answer, or a "no data found" message when nothing is stored
        for the employee
    """
    # Retrieve the employee's records; bail out early when there are none.
    hits = search_by_employee_name(employee_name, top_k=top_k)
    if not hits:
        return f"❌ No data found for employee: {employee_name}"

    # All retrieved record texts become the context, separated by blank lines.
    context = "\n\n".join(record.page_content for record, _score in hits)

    hr_prompt = PromptTemplate(
        template="""
You are an HR assistant AI. Use the following employee information to answer the user's question.

Employee Data:
------------------
{context}

User Question:
------------------
{question}

Give a concise, helpful, and context-grounded answer.
""",
        input_variables=["context", "question"]
    )

    # Run prompt + module-level model as a chain and hand back its output.
    qa_chain = LLMChain(llm=llm, prompt=hr_prompt)
    return qa_chain.run({
        "context": context,
        "question": query
    })


In [56]:
if __name__ == "__main__":
    # Example: ask a free-form question about one employee's stored record.
    emp_name = "John"
    user_query = "What are his expenses?"

    answer = answer_query_for_employee(emp_name, user_query)
    print(f"\n🤖 Answer:\n{answer}")


🤖 Answer:
John has 2 food-related expenses, totaling ₹450 for a team lunch on 18/09/2024. These expenses have been fully reimbursed.


In [57]:
if __name__ == "__main__":
    # Example: ask why an employee's claim was only partially reimbursed.
    emp_name = "Rekha"
    user_query = "Why I had been partially reimbersued?"

    answer = answer_query_for_employee(emp_name, user_query)
    print(f"\n🤖 Answer:\n{answer}")


🤖 Answer:
Your cab ride cost ₹167, which is more than the amount allowed by the company's reimbursement policy. Therefore, you were partially reimbursed.
