In [4]:
pip install langchain langgraph langchain-community -q -U google-generativeai

Note: you may need to restart the kernel to use updated packages.


In [26]:
from typing import TypedDict, List, Optional, Annotated
from langgraph.graph import StateGraph, END
from langchain_community.document_loaders import PyPDFLoader
import google.generativeai as genai
import os
import json

In [28]:
from langchain_community.document_loaders import PyPDFLoader

def extract_pdf_text(pdf_path):
  
    # Initialize the loader with the provided PDF path
    loader = PyPDFLoader(pdf_path)

    # Load and split the PDF into individual pages
    documents = loader.load_and_split()

    # Combine all page contents into a single string
    full_text = ""
    for doc in documents:
        full_text += doc.page_content + "\n"  # Add a newline for separation between pages

    return full_text

# Example usage
pdf_file_path = "/kaggle/input/biodata/Biodata_sample2.pdf"
extracted_text = extract_pdf_text(pdf_file_path)
print(extracted_text)

BIO DATA 
 
 
Name    : Jessica Parker 
 
     
Father’s Name   : William Parker     
Mobile    : +1-555-123-4567     
Email id   : jessica.parker@example.com    
    
Date of Birth   : 12th February 1988     
Marital Status   : Married     
Religion   : Christianity      
  
Languages Known  :  English, French, German     
Educational Background : 
Qualification Name of the University Year of Passing Total % Marks 
10th Hillside High School 2004 92% 
Intermediate Central City College 2006 89% 
Under Graduation University of California 2010 86% 
Post Graduation Stanford University 2012 88% 
      
Work Experience  :    
Organization Designation Job Period Responsibilities 
     
     
     
     
   
Address: 123  
Place: Los Angeles 
Date: 22nd August 2024    
Signature. 
 
 
Affix Passport 
Size Photo



In [29]:
# Define state schema
class EmploymentEntry(TypedDict):
    first_name: str
    last_name: str
    employer: str
    designation: str
    service_duration: str
    total_earnings: Optional[str]

class AgentState(TypedDict):
    employment_entries: List[EmploymentEntry]
    missing_earnings_entries: List[EmploymentEntry]

In [30]:
# Initialize clients
genai.configure(api_key="AIzaSyBLGkrsWfh04iCyuJOR03Wd8cCqqesOcxY")
gemini = genai.GenerativeModel("gemini-1.5-flash")


def process_master_document(state: AgentState) -> AgentState:
    """Step 1: Process master document and extract entities"""
    content = extract_pdf_text("/kaggle/input/payslip/Payslip_tech_consul.pdf")
    
    prompt = f"""Extract employment details from this document. Look for:
    - First name
    - Last name
    - Employer/Company Name (may be multiple)
    - Designation
    - Service Duration
    - Total Earnings (if available)
    
    Return JSON format with schema: {{
        "employments": [
            {{
                "first_name": string|None,
                "last_name": string|None,
                "employer": string|None,
                "designation": string|None,
                "service_duration": string|None,
                "total_earnings": string|None
            }}
        ]
    }}
    
    Document content: {content}"""
    
    response = gemini.generate_content(prompt)
    import ast
    response= ast.literal_eval(response.text.replace("`" ,"").replace("json","").replace("\n","").replace("\t","").replace("null","None"))
    for employment in response['employments']:
        if employment['total_earnings'] == "7,500.00":
            employment['total_earnings'] = None
            #to test the supporting document flow 
        

    
    # Step 3: Format the output as needed
    entries = response.get("employments", [])
    
    missing_earnings = [entry for entry in entries if not entry.get("total_earnings")]
    
    return {
        "employment_entries": entries,
        "missing_earnings_entries": missing_earnings
    }

1) Fix the pdfreader
2) Fix gemini call
3) Format gemini output
4) Perplexity search how to loop in a folder
5) 

In [41]:
def process_supporting_docs(state: AgentState) -> AgentState:
    """Step 2a: Search supporting documents for missing earnings"""
    updated_entries = state["employment_entries"].copy()
    directory = '/kaggle/input/supporting-doc'
    
    for entry in state["missing_earnings_entries"]:
        # Search supporting documents
        print(entry)   
        
        for file in os.listdir(directory): 
            if file.endswith('.pdf') and os.path.isfile(os.path.join(directory, file)):
                file_path=os.path.abspath(os.path.join(directory, file))
                print(file_path)
                supporting_info = ""
                content = extract_pdf_text(file_path)
                supporting_info += f"content:\n{content}"
        
                prompt = f"""Given these employment details:
                Employer: {entry['employer']}
                Designation: {entry['designation']}
                Service Duration: {entry['service_duration']}
                
                And these supporting documents:
                {supporting_info}
                
                Calculate or extract the total earnings. Return ONLY the numerical value.
                If total earning is not found return None."""
                
                earnings = gemini.generate_content(prompt).text
                if earnings!="None":
                    print(earnings)
                    entry["total_earnings"] = earnings
                    break 
            
    # Update state
    return {
        "employment_entries": updated_entries,
        "missing_earnings_entries": [
            e for e in updated_entries if not e.get("total_earnings")
        ]
    }

In [42]:
# Specify the directory you want to loop through
directory = '/kaggle/input/supporting-doc'
for file in os.listdir(directory): 
    if file.endswith('.pdf') and os.path.isfile(os.path.join(directory, file)):
        file_path=os.path.abspath(os.path.join(directory, file))
        print(file_path)

/kaggle/input/supporting-doc/Payslip_tech_consul.pdf


In [43]:
def estimate_earnings(state: AgentState) -> AgentState:
    """Step 2b: Estimate missing earnings from internet data"""
    updated_entries = state["employment_entries"].copy()
    
    for entry in state["missing_earnings_entries"]:
        prompt = f"""Estimate annual compensation for:
        Position: {entry['designation']}
        Company: {entry['employer']}
        Service Duration: {entry['service_duration']}
        
        Return approximate total earnings for the service period as a single number."""
        
        estimation = gemini.generate(prompt)
        entry["total_earnings"] = f"Estimated {estimation}"
    
    return {"employment_entries": updated_entries}

In [44]:
def should_continue(state: AgentState) -> str:
    """Conditional edge decision"""
    if len(state["missing_earnings_entries"]) > 0:
        return "process_supporting_docs"
    return "end"

In [45]:
# Build the workflow
workflow = StateGraph(AgentState)

In [46]:
# Add nodes
workflow.add_node("process_master", process_master_document)
workflow.add_node("process_supporting_docs", process_supporting_docs)
workflow.add_node("estimate_earnings", estimate_earnings)

<langgraph.graph.state.StateGraph at 0x79687fec9ea0>

In [47]:
# Set entry point
workflow.set_entry_point("process_master")

<langgraph.graph.state.StateGraph at 0x79687fec9ea0>

In [48]:
# Add conditional edges
workflow.add_conditional_edges(
    "process_master",
    lambda state: "process_supporting_docs" if state["missing_earnings_entries"] else END
)

workflow.add_conditional_edges(
    "process_supporting_docs",
    lambda state: "estimate_earnings" if state["missing_earnings_entries"] else END
)

workflow.add_edge("estimate_earnings", END)

<langgraph.graph.state.StateGraph at 0x79687fec9ea0>

In [49]:
# Compile the graph
app = workflow.compile()

In [50]:
# Execute the workflow
final_state = app.invoke({"employment_entries": [], "missing_earnings_entries": []})
print(final_state["employment_entries"])

{'first_name': 'John', 'last_name': 'Doe', 'employer': 'Accenture Pvt. Ltd.', 'designation': 'Technical Consultant', 'service_duration': None, 'total_earnings': None}
/kaggle/input/supporting-doc/Payslip_tech_consul.pdf
7500

[{'first_name': 'John', 'last_name': 'Doe', 'employer': 'Accenture Pvt. Ltd.', 'designation': 'Technical Consultant', 'service_duration': None, 'total_earnings': '7500\n'}]


In [51]:
s= """
```json
{
  "employments": [
    {
      "first_name": "John",
      "last_name": "Doe",
      "employer": "Accenture Pvt. Ltd.",
      "designation": "Technical Consultant",
      "service_duration": None,
      "total_earnings": "7,500.00"
    }
  ]
}
```
"""

In [162]:
print(s.replace("`" ,"").replace("json","").replace("\n","").replace("\t",""))

{  "employments": [    {      "first_name": "John",      "last_name": "Doe",      "employer": "Accenture Pvt. Ltd.",      "designation": "Technical Consultant",      "service_duration": None,      "total_earnings": "7,500.00"    }  ]}


In [163]:
import ast
x= ast.literal_eval(s.replace("`" ,"").replace("json","").replace("\n","").replace("\t",""))

In [164]:
x

{'employments': [{'first_name': 'John',
   'last_name': 'Doe',
   'employer': 'Accenture Pvt. Ltd.',
   'designation': 'Technical Consultant',
   'service_duration': None,
   'total_earnings': '7,500.00'}]}

In [215]:
for employment in x['employments']:
    if employment['total_earnings'] == "7,500.00":
        employment['total_earnings'] = None

In [216]:
x

{'employments': [{'first_name': 'John',
   'last_name': 'Doe',
   'employer': 'Accenture Pvt. Ltd.',
   'designation': 'Technical Consultant',
   'service_duration': None,
   'total_earnings': None}]}