In [33]:
from datetime import datetime
from typing import List, Optional

from dotenv import load_dotenv
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
load_dotenv()

class Product(BaseModel):
    """Information about a Product."""
    # Both fields default to None, so they are annotated Optional: a bare
    # `str`/`float` annotation would contradict the default and reject an
    # explicit null if the extracted data is ever validated against this model.
    # Field descriptions are kept verbatim because they are part of the schema.
    Name: Optional[str] = Field(None, description="Product Name of a product")
    Price: Optional[float] = Field(None, description="Total amount of the product")
    
class Extraction_with_products(BaseModel):
    """ Extract the entities from the invoice document"""
    # NOTE(review): the class docstring is presumably forwarded as the function
    # description by convert_pydantic_to_openai_function, so it is left verbatim.
    #
    # Every field defaults to None and the extraction prompt asks for `null`
    # when a value cannot be found, so each annotation is Optional; a bare
    # `str`/`float`/`datetime` annotation would contradict that default.
    transaction_id: Optional[str] = Field(None, description="Payment Transaction ID of the invoice document file")
    amount: Optional[float] = Field(None, description="Total Invoice value of the invoice document file")
    datetime_field: Optional[datetime] = Field(None, description="The date and time (24-hour format) when the payment was made.")
    mode_of_payment: Optional[str] = Field(None, description="Tells about the mode of payment the user proceeeded")
    # Line items of the invoice; None when no product list could be extracted.
    Products: Optional[List[Product]] = Field(None, description="Tells about the details of list of products")

class Extraction(BaseModel):
    """ Extract the entities from the invoice document"""
    # NOTE(review): the class docstring is presumably forwarded as the function
    # description by convert_pydantic_to_openai_function, so it is left verbatim.
    #
    # Every field defaults to None and the extraction prompt asks for `null`
    # when a value cannot be found, so each annotation is Optional; a bare
    # `str`/`float`/`datetime` annotation would contradict that default.
    transaction_id: Optional[str] = Field(None, description="Payment Transaction ID of the invoice document file")
    amount: Optional[float] = Field(None, description="Total Invoice value of the invoice document file")
    datetime_field: Optional[datetime] = Field(None, description="The date and time (24-hour format) when the payment was made.")
    mode_of_payment: Optional[str] = Field(None, description="Tells about the mode of payment the user proceeeded")
    
     
def extract_features(document):
    """Extract the payment-level fields of an invoice through an OpenAI function call.

    Args:
        document: Invoice content. Either a plain string, a single object
            exposing ``page_content`` (such as a loader ``Document``), or a
            list/tuple of those.

    Returns:
        The parsed function-call arguments: a dict keyed by the ``Extraction``
        field names (``transaction_id``, ``amount``, ``datetime_field``,
        ``mode_of_payment``).
    """
    # Loader output is a list of Document objects. Formatting that list
    # straight into the prompt would insert its repr (metadata and escaped
    # newlines) instead of the invoice text, so reduce it to plain text first.
    # A plain string passes through unchanged.
    parts = document if isinstance(document, (list, tuple)) else [document]
    invoice_text = "\n\n".join(
        str(getattr(part, "page_content", part)) for part in parts
    )

    llm = ChatOpenAI(temperature=0)
    functions = [convert_pydantic_to_openai_function(Extraction)]
    prompt_text = """You are tasked with extracting specific fields from an invoice document. The fields you need to extract are:

    1. Transaction ID
    2. Amount
    3. Date and Time (combined)
    4. Mode of Payment

    Please extract the following fields from the given invoice document. If any field cannot be extracted, set its value to `null`. 

    Invoice Document :{Document}
    """
    prompt = ChatPromptTemplate.from_template(prompt_text)
    chain = (
        prompt
        # Force the extraction function to be called: if the model replied in
        # free text instead, JsonOutputFunctionsParser would have nothing to parse.
        | llm.bind(functions=functions, function_call={"name": functions[0]["name"]})
        | JsonOutputFunctionsParser()
    )
    return chain.invoke({"Document": invoice_text})
    
def extract_features_with_products(document):
    """Extract payment-level fields plus the product list of an invoice.

    Args:
        document: Invoice content. Either a plain string, a single object
            exposing ``page_content`` (such as a loader ``Document``), or a
            list/tuple of those.

    Returns:
        The parsed function-call arguments: a dict keyed by the
        ``Extraction_with_products`` field names, where ``Products`` is a list
        of ``{"Name": ..., "Price": ...}`` entries.
    """
    # Loader output is a list of Document objects. Formatting that list
    # straight into the prompt would insert its repr (metadata and escaped
    # newlines) instead of the invoice text, so reduce it to plain text first.
    # A plain string passes through unchanged.
    parts = document if isinstance(document, (list, tuple)) else [document]
    invoice_text = "\n\n".join(
        str(getattr(part, "page_content", part)) for part in parts
    )

    llm = ChatOpenAI(temperature=0)
    functions = [convert_pydantic_to_openai_function(Extraction_with_products)]
    # Item 5 previously read "5. 5. ... Name and Amount"; the duplicated number
    # is removed and the wording now matches the schema field name `Price`.
    prompt_text = """You are tasked with extracting specific fields from an invoice document. The fields you need to extract are:

1. Transaction ID
2. Amount
3. Date and Time (combined)
4. Mode of Payment
5. List of Products (each product has a Name and Price)

Please extract the following fields from the given invoice document. If any field cannot be extracted, set its value to `null`. 

Invoice Document :{Document}
"""
    prompt = ChatPromptTemplate.from_template(prompt_text)
    chain = (
        prompt
        # Force the extraction function to be called: if the model replied in
        # free text instead, JsonOutputFunctionsParser would have nothing to parse.
        | llm.bind(functions=functions, function_call={"name": functions[0]["name"]})
        | JsonOutputFunctionsParser()
    )
    return chain.invoke({"Document": invoice_text})

In [34]:

from langchain_community.document_loaders import PyPDFLoader
from datetime import datetime
# Load the sample flight-ticket PDF. `pages` is the input handed to both
# extraction helpers in the cells below; its displayed value is a list of
# Document objects carrying `source`/`page` metadata and `page_content` text.
loader = PyPDFLoader("data/indigo flight ticket .pdf")
pages = loader.load_and_split()



In [35]:
# Inspect the loaded Document objects (text and metadata) before extraction.
pages

[Document(metadata={'source': 'data/indigo flight ticket .pdf', 'page': 0}, page_content='Your IndiGo Itinerary - WMRKGV\nFrom: IndiGo\nreser vations@cust omer .goindigo.in\nTo: slsnar ayanan@outlook.com\nslsnar ayanan@outlook.com\nSent:  Friday, Februar y 17, 15:11\nPNR/Booking Ref.: WMRK GV\nStatus Date of Booking* Payment Status\nCONFIRMED 17Feb23 09:41:25 (UT C) Approved\n*Booking Date reﬂects in UTC (Universal Time Coordinated), all other timings mentioned are as per Local Time.\nIndiGo P assenger - 1/1 Check-in now Flight Status\nMr. Lakshmi nar ayanan Sar avana perumal\nIndiGo Flight(s)\nDate From (T erminal) Depar tsFlight Number\n(Aircraft type)Check-in/Bag\ndrop closesTo (Terminal) Arriv es Via\n15 Mar 23 Hyderabad 12:056E 6781 \xa0\n(A321)11:05 Madur ai (T1) 13:40'),
 Document(metadata={'source': 'data/indigo flight ticket .pdf', 'page': 1}, page_content='HYD\n IXM\xa0\n\xa0\nTips for a hassle-fr ee tr avel experience\nFree mandat ory web\ncheck-in\nCheck-in online for fr ee

In [36]:
# Payment-level fields only (transaction id, amount, timestamp, payment mode).
extract_features(pages)

{'transaction_id': 'WMRKGV',
 'amount': 4035,
 'datetime_field': '2023-02-17T09:41:25',
 'mode_of_payment': None}

In [37]:
# Same payment-level fields, plus the itemised `Products` list.
extract_features_with_products(pages)

{'transaction_id': 'WMRKGV',
 'amount': 4035,
 'datetime_field': '2023-02-17T09:41:25',
 'mode_of_payment': None,
 'Products': [{'Name': 'Airfare Charges', 'Price': 3079},
  {'Name': 'Aviation Security Fee', 'Price': 236},
  {'Name': 'GST for Telangana', 'Price': 154},
  {'Name': 'User Development Fee', 'Price': 566}]}

# Loading an image document

In [39]:
# Install a dependency for the image-loading cell below.
# NOTE(review): that cell still fails with TesseractNotFoundError, so the
# Tesseract binary presumably has to be installed separately and put on PATH — confirm.
%pip install --upgrade --quiet  pdfminer

Note: you may need to restart the kernel to use updated packages.


In [1]:
from langchain_community.document_loaders.image import UnstructuredImageLoader
# Load an image file through UnstructuredImageLoader; `data` is read by the next cell.
# NOTE(review): this rebinds `loader`, which previously held the PDF loader.
# NOTE(review): in the recorded run this cell raised TesseractNotFoundError
# (tesseract not installed or not on PATH), so `data` was never assigned.
loader = UnstructuredImageLoader("data/WhatsApp Image 2024-07-14 at 16.59.42_568f7130.jpg")
data = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

In [None]:
# Show the first element loaded from the image; requires the loading cell above to have succeeded.
data[0]