In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
import google.generativeai as genai
from langchain.schema.output_parser import StrOutputParser

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Retrieve the Google API key from environment variables
google_api_key = os.getenv("GOOGLE_API_KEY")

# Fail fast when the key is unusable. os.getenv returns None for an unset
# variable, but an exported-yet-empty (or whitespace-only) variable comes back
# as a string, so a plain `is None` test would let it through.
if google_api_key is None or not google_api_key.strip():
    raise ValueError("GOOGLE_API_KEY environment variable is not set or is empty")

In [3]:
# Load the invoice PDF into `docs`, the list every later cell works from:
# the inspection cell indexes docs[0] and the vector store is built from it.
# NOTE(review): the saved output of the inspection cell reports the source as
# 'invoice (1).pdf', which does not match this path - that output looks stale;
# re-run the notebook to confirm.
file_path = "sample/sample-invoice.pdf" # Replace with the path to your own invoice PDF
loader = PyPDFLoader(file_path)
docs = loader.load()

In [4]:
# Quick sanity check of what the loader produced: how many documents there
# are, a short text preview of the first one, and its metadata.
print(f"Number of documents: {len(docs)}")
first_doc = docs[0]
preview = first_doc.page_content[:100]
print(f"First 100 characters of first document: {preview}")
print(f"Metadata of first document: {first_doc.metadata}")

Number of documents: 1
First 100 characters of first document:  Tax Invoice/Bill of Supply/Cash Memo
(Original for Recipient)
*ASSPL-Amazon Seller Services Pvt. Lt
Metadata of first document: {'source': 'invoice (1).pdf', 'page': 0}


In [5]:
# Set up the embeddings and vectorstore.
# The loaded documents are embedded with the "models/embedding-001" model and
# indexed in a Chroma vector store.
# NOTE(review): `google_api_key` from the earlier cell is not passed here;
# presumably the client reads GOOGLE_API_KEY from the environment - confirm.
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# NOTE(review): no collection name or persist directory is given, so this
# presumably builds an in-memory index on every run - confirm.
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
# llm_ans() uses this retriever as the source of the prompt's "context" value.
retriever = vectorstore.as_retriever()

I0000 00:00:1723119948.464706  170233 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache
I0000 00:00:1723119948.474559  170233 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [6]:
# Glossary of invoice terminology: each key is a field that can appear on an
# invoice and each value is its plain-English definition. The "seller" and
# "item_details" entries are nested glossaries for their sub-fields.
knowledge_base = dict(
    invoice_number="A unique identifier assigned to each invoice for tracking and reference purposes.",
    order_number="A unique identifier assigned to the customer's order.",
    order_date="The date when the order was placed by the customer.",
    invoice_date="The date when the invoice was generated.",
    seller=dict(
        name="The name of the business selling the product or service.",
        pan_number="The seller's Permanent Account Number (PAN) for tax purposes.",
        gst_number="The seller's Goods and Services Tax (GST) number.",
        address="The physical address of the seller.",
    ),
    billing_address="The address provided by the customer for billing purposes.",
    shipping_address="The address where the goods are to be delivered.",
    place_of_supply="The state or union territory where the goods are supplied.",
    item_details=dict(
        description="The description of the item purchased.",
        code_number="The code number (HSN) associated with the item.",
        hsn_number="HSN stands for Harmonized System of Nomenclature. It is a globally standardized system of names and numbers used to classify traded products.",
        unit_price="The price per unit of the item.",
        quantity="The number of units purchased.",
        total_price="The total cost of the item before taxes.",
        tax_rate="The percentage of tax applied.",
        tax_amount="The amount of tax charged on the item.",
        final_amount="The final amount payable for the item including tax.",
    ),
    shipping_charges="The cost of shipping the item, after any discounts.",
    reverse_charge="Indicates whether tax is payable under reverse charge.",
    amount_in_words="The total amount due, expressed in words.",
)

In [7]:
# Prompt template text used for generating responses. llm_ans() passes it to
# ChatPromptTemplate.from_template, so {context} and {question} are template
# placeholders rather than literal text: llm_ans() supplies {context} from the
# retriever and {question} from the user's input.
# NOTE(review): any other literal curly braces added to this text would
# presumably be read as further placeholders - confirm before adding examples
# that contain braces.
system_prompt = """
Context: {context}

You are a document analysis assistant specializing in invoices. Your primary task is to extract accurate and relevant information directly from the provided invoice.

Focus solely on the invoice content to answer the user's question. Do not reference or utilize any external knowledge or information beyond what is explicitly stated within the invoice. 

**Provide answers in a clear and concise format, using bullet points or numbered lists where appropriate.**

If the user's question requires calculations, comparisons, or summaries based on invoice data, provide a quantitative response.

If the user's question is vague or requests general information about invoices, provide definitions, explanations, or examples related to invoice components.

User Question: {question}
"""


In [8]:
# Function to generate answers using LLM
def llm_ans(chat_input, history, knowledge_base):
    """Answer one user question about the indexed invoice with Gemini.

    The prompt is ``system_prompt`` extended with two extra placeholders, so it
    is rendered from four inputs: the retrieved invoice text (``context``), the
    user's question, the glossary held in ``knowledge_base`` and the
    conversation ``history``.

    Args:
        chat_input: The user's question; it is also the retrieval query.
        history: Earlier conversation turns, either an iterable of strings (as
            built by ``main``) or one pre-formatted string. May be empty/None.
        knowledge_base: Mapping of invoice terms to definitions; rendered into
            the prompt with ``str()``.

    Returns:
        The model's reply as a plain string.
    """
    # History and glossary are template *variables*, never spliced into the
    # template text itself: questions and model answers routinely contain
    # "{" / "}" (JSON, code), which the template parser would otherwise treat
    # as placeholders and fail on. Substituted values are not re-parsed.
    template = (
        system_prompt
        + "\nGlossary of invoice terms (use only for general or definitional questions):\n"
        + "{knowledge_base}\n"
        + "\nConversation so far (oldest first):\n"
        + "{history}\n"
    )
    prompt = ChatPromptTemplate.from_template(template)

    if isinstance(history, str):
        history_text = history
    else:
        history_text = "\n".join(str(turn) for turn in (history or []))
    history_text = history_text or "(no previous turns)"
    glossary_text = str(knowledge_base)

    def format_docs(documents):
        # Give the model the page text only, not the repr of Document objects.
        return "\n\n".join(doc.page_content for doc in documents)

    llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", temperature=0)
    chain = (
        {
            # The question doubles as the retrieval query.
            "context": retriever | format_docs,
            "knowledge_base": lambda _: glossary_text,
            "history": lambda _: history_text,
            "question": lambda x: x,
        }
        | prompt
        | llm
        | StrOutputParser()
    )
    return chain.invoke(chat_input)


In [9]:
def get_user_question():
    """Prompt the user on standard input and return the reply unmodified."""
    prompt_text = "Please enter your question about the invoice: "
    reply = input(prompt_text)
    return reply

def process_question(question, history, knowledge_base):
    """Ask the LLM for an answer, converting any failure into a message.

    Delegates to ``llm_ans`` with the same three arguments. An exception
    raised while answering is not propagated; a human-readable error string
    carrying the exception text is returned in its place.
    """
    try:
        return llm_ans(question, history, knowledge_base)
    except Exception as err:
        failure_note = "An error occurred while processing your question: " + str(err)
    return failure_note

In [None]:
def main():
    """Run the interactive question/answer loop for the invoice assistant.

    Reads questions with ``get_user_question`` until the user enters one of
    the exit commands, answers each with ``process_question`` and prints the
    reply. Every question and reply is appended to a session-local history
    list that is handed to ``process_question`` on each turn.

    Input is stripped before use, so exit commands are recognised regardless
    of case or surrounding whitespace, and blank input is ignored instead of
    being sent to the model. A closed input stream or an interrupt at the
    prompt ends the session cleanly.
    """
    print("Welcome to the Invoice Assistant. You can ask questions about invoice structure and terms.")
    print("Type 'quit', 'exit', or 'bye' to end the session.\n")

    exit_commands = {"quit", "exit", "bye"}
    farewell = "Thank you for using the Invoice Assistant. Goodbye!"
    chat_history = []

    while True:
        try:
            question = get_user_question().strip()
        except (EOFError, KeyboardInterrupt):
            # No more input can arrive (stdin closed / prompt interrupted).
            print(farewell)
            break

        if not question:
            # Nothing to ask; prompt again without spending an LLM call.
            continue
        if question.lower() in exit_commands:
            print(farewell)
            break

        chat_history.append(f"User: {question}")
        response = process_question(question, chat_history, knowledge_base)
        chat_history.append(f"Assistant: {response}")
        print(response)

# Entry point: start the interactive session only when this code runs as the
# top-level program, not when it is imported as a module.
# NOTE(review): presumably this condition also holds when the cell is executed
# inside a notebook kernel, so running the cell starts the session - confirm.
if __name__ == "__main__":
    main()