In [1]:
from dotenv import load_dotenv
from rich import print

In [2]:
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Hugging Face API token that the
# endpoint cells below rely on — confirm against the project's .env file.
load_dotenv()

True

In [14]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Remote text-generation endpoint for the Zephyr 7B model.
# Sampling is switched off and a mild repetition penalty is applied;
# each completion is capped at 512 new tokens.
llm = HuggingFaceEndpoint(
    task="text-generation",
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    do_sample=False,
    repetition_penalty=1.03,
    max_new_tokens=512,
)

# Chat-style wrapper around the raw endpoint so it can be driven with message objects.
chat_model = ChatHuggingFace(llm=llm)

In [4]:
from langchain_core.messages import (
    HumanMessage,
    SystemMessage,
)

# Build the conversation: one system instruction followed by the user's question.
system_turn = SystemMessage(content="You're a helpful assistant")
user_turn = HumanMessage(content="Wnat is capital of india?")
messages = [system_turn, user_turn]

# Send the conversation to the chat model and keep the reply for the next cell.
ai_msg = chat_model.invoke(messages)

In [5]:
# Show the text of the model's reply. Note that `print` here is rich's print
# (imported in the first cell), not the builtin.
print(ai_msg.content)

The current capital city of India is New Delhi. It became the capital after India gained independence from British rule in 1947. New Delhi is a territory and union capital, administered as a national capital territory, and serves as the seat of the federal government and legislature of India.

Note: Before India's independence, the British colonial capital was Calcutta (now known as Kolkata in India), and the Legislative Council of British India functioned in Kolkata from 1911 until 1937. And Dhaka (now the capital of Bangladesh) was the capital of Pakistan (a country that came into existence right after the partition of India) from 1947 to 1958, when its capital moved to Islamabad.


### testing document


In [12]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.schema import HumanMessage
import json

# Initialize chat model
# Sampling is disabled (do_sample=False) and each completion is capped at 512 new tokens.
llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
)
chat_model = ChatHuggingFace(llm=llm)

# Load and split PDF
loader = PyPDFLoader("./data/invoice_Aaron Bergman_36259.pdf")
documents = loader.load()

# 1000-character chunks with a 100-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# Define the custom extraction prompt
# Doubled braces ({{ }}) are literal braces for str.format; {text} is the only placeholder.
prompt_template = """
You are an expert in field extraction from invoice bills.
Extract the following fields from the given text and return them in the output format shown below:

Fields to extract:
- Invoice Date
- Bill To
- Ship To

Text:
{text}

Output Format (as JSON):
{{
  "entities": [
    {{
      "entity": "Invoice Date",
      "text": "20-dec-2025"
    }},
    {{
      "entity": "Bill To",
      "text": "James Mormount"
    }},
    {{
      "entity": "Ship To",
      "text": "Winterfell"
    }}
  ]
}}

ONLY return a valid JSON object that matches the schema above.
"""

# Process chunks
for i, chunk in enumerate(chunks[:2]):  # limit for demo
    prompt = prompt_template.format(text=chunk.page_content)
    # Use .invoke() (as the earlier chat cell does); calling the model object
    # directly is the deprecated entry point.
    response = chat_model.invoke([HumanMessage(content=prompt)])

    print(f"\n--- Extracted Info from Chunk {i+1} ---")
    print(response.content)

    # Try the reply verbatim first. If that fails, fall back to the outermost
    # {...} span so a reply wrapped in prose or code fences still parses.
    reply = response.content
    first, last = reply.find("{"), reply.rfind("}")
    candidates = [reply]
    if 0 <= first < last:
        candidates.append(reply[first:last + 1])

    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        print("✅ Parsed as Python dict:", data)
        break
    else:
        # No candidate parsed; `data` keeps whatever an earlier chunk produced.
        print("⚠️ Response not in strict JSON format. Review model output.")


In [13]:
# Re-display the raw reply for the last chunk handled by the loop in the previous cell.
print(response.content)

In [22]:
# Inspect the entity list from `data`, the most recent reply that parsed as JSON
# in the extraction loop.
data['entities']

[{'entity': 'Invoice Date', 'text': 'Mar 06 2012'},
 {'entity': 'Bill To', 'text': 'Aaron Bergman'},
 {'entity': 'Ship To', 'text': '98103, Seattle, Washington, United States'}]

In [23]:
# Print each extracted entity record on its own line.
for entity in data['entities']:
    print(entity)

### prompt generation

In [24]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
from langchain.schema import HumanMessage
import json

# Initialize chat model
# Sampling is disabled (do_sample=False) and each completion is capped at 512 new tokens.
llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
)
chat_model = ChatHuggingFace(llm=llm)

# Load and split PDF
loader = PyPDFLoader("./data/invoice_Aaron Bergman_36259.pdf")
documents = loader.load()

# 1000-character chunks with a 100-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# Define the custom extraction prompt
# Doubled braces ({{ }}) are literal braces for str.format; {text} is the only placeholder.
# The example below must itself be valid JSON (no trailing comma after the last
# array element), because the model is told to match it exactly.
prompt_template = """
You are an expert in writting prompt for custom NER extraction for invoice bills.
write a generic prompt for fields names provided below which you will find in document:

Field name to write prompt for:
- Invoice Date
- Bill To
- Ship To

Text:
{text}

Output Format (as JSON):
{{
  "entities": [
    {{
      "entity": "Invoice Date",
      "text": "Invoice date is date when invoie is generated typically mentioned at top of document "
    }}
  ]
}}

ONLY return a valid JSON object that matches the schema above.
"""

# Process chunks
for i, chunk in enumerate(chunks[:2]):  # limit for demo
    prompt = prompt_template.format(text=chunk.page_content)
    # Use .invoke() (as the earlier chat cell does); calling the model object
    # directly is the deprecated entry point.
    response = chat_model.invoke([HumanMessage(content=prompt)])

    print(f"\n--- Extracted Info from Chunk {i+1} ---")
    print(response.content)

    # Try the reply verbatim first. If that fails, fall back to the outermost
    # {...} span so a reply wrapped in prose or code fences still parses.
    reply = response.content
    first, last = reply.find("{"), reply.rfind("}")
    candidates = [reply]
    if 0 <= first < last:
        candidates.append(reply[first:last + 1])

    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        print("✅ Parsed as Python dict:", data)
        break
    else:
        # No candidate parsed; `data` keeps whatever an earlier run produced.
        print("⚠️ Response not in strict JSON format. Review model output.")


In [None]:
# TODO:
# 1. Create a Jinja template and move the prompt there.
# 2. Load that Jinja template into a variable.
# 3. Run the extraction.