In [10]:
pip install pydantic

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install langchain

Note: you may need to restart the kernel to use updated packages.


In [12]:
### Imports ###
from pydantic import BaseModel, Field
from typing import List
from enum import Enum
from langchain.output_parsers import PydanticOutputParser

In [13]:
# SubItem: one entry of the includedItems field on the Item object.
# includedItems is a list of SubItems and is allowed to be empty.
# Example: a 'cheeseburger' Item might have includedItems: ['bun', 'patty', 'cheese'].
# (Kept as comments rather than a class docstring: pydantic copies a model's
# docstring into the generated JSON schema as its "description".)
class SubItem(BaseModel):
    description: str  # no default declared, so a value must always be supplied

# PaymentType: the payment methods accepted by the paymentType field of ReceiptInfo.
# ReceiptInfo falls back to 'cash' when no payment method is given.
class PaymentType(Enum):
    CREDIT = 'credit'  # paid by credit card
    DEBIT = 'debit'    # paid by debit card
    CASH = 'cash'      # paid in cash

# Item: a single purchased good or service found in the receipt text.
# Every field has a default, so an Item can be built from whatever subset of
# values could be read off the receipt.
# (Kept as comments rather than a class docstring: pydantic copies a model's
# docstring into the generated JSON schema as its "description".)
class Item(BaseModel):
    description: str = Field(default="<UNKNOWN>", description="name")
    predictedDescription: str = Field(
        default="<UNKNOWN>",
        description="the human-legible name",
    )
    # default_factory gives each Item its own fresh empty list
    includedItems: List[SubItem] = Field(default_factory=list)
    quantity: int = Field(default=0, description="number of items")
    unitPrice: float = Field(default=0.00, description="cost per unit")
    totalPrice: float = Field(
        default=0.00,
        description="total cost of unit(s) purchased",
    )
    discountAmount: float = Field(default=0.00, description="discount for item")

# ReceiptInfo: all of the information contained in one receipt text file.
# Raw receipt text files are parsed into this structure (as JSON) for later analysis.
# merchant, address, city, state and ITEMS declare no default and must be supplied;
# every other field falls back to the placeholder default shown below.
# (Kept as comments rather than a class docstring: pydantic copies a model's
# docstring into the generated JSON schema as its "description".)
class ReceiptInfo(BaseModel):
    merchant: str = Field(description="name of merchant")
    address: str = Field(description="address")
    city: str = Field(description="city")
    state: str = Field(description="state")
    phoneNumber: str = Field(default="<UNKNOWN>", description="phone number")
    # NOTE(review): this default is not zero-padded like the dates used in the
    # example receipts ("01/01/1999") — confirm the intended date format.
    receiptDate: str = Field(default="1/01/1991", description="purchase date")
    receiptTime: str = Field(default="00:00:00", description="time purchased")
    totalItems: int = Field(default=0, description="number of items")
    diningOptions: str = Field(
        default="None",
        description="here or to-go items for consumable items",
    )
    # The default is the enum member itself. pydantic does not validate default
    # values unless asked to, so a plain "cash" string would be stored as a str
    # instead of PaymentType.CASH whenever paymentType is omitted.
    paymentType: PaymentType = Field(default=PaymentType.CASH, description="payment method")
    creditCardType: str = Field(default="<UNKNOWN>", description="credit card type")
    totalDiscount: float = Field(default=0.00, description="total discount")
    tax: float = Field(default=0.00, description="tax amount")
    total: float = Field(default=0.00, description="total amount paid")
    ITEMS: List[Item]  # the purchased items; required
    
# Fields declared without a default value are REQUIRED: merchant, address, city, state and ITEMS in ReceiptInfo (and description in SubItem). Every other field falls back to its default when omitted.

In [14]:
# Build the Pydantic output parser for the ReceiptInfo model; it is what turns
# raw receipt text into JSON objects. Then print the format instructions
# (guidance text plus the model's JSON schema) that the parser generates.
receiptParser = PydanticOutputParser(
    pydantic_object=ReceiptInfo,
)
print(receiptParser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"Item": {"properties": {"description": {"default": "<UNKNOWN>", "description": "name", "title": "Description", "type": "string"}, "predictedDescription": {"default": "<UNKNOWN>", "description": "the human-legible name", "title": "Predicteddescription", "type": "string"}, "includedItems": {"items": {"$ref": "#/$defs/SubItem"}, "title": "Includeditems", "type": "array"}, "quantity": {"default": 0, "description": "number of items", "title": "Quantity", "type": "integer"}, "unitPrice": {"default": 0.0, "description": "cost per unit", "t

In [15]:
# EXAMPLE 1
# Two hand-built items that share quantity and prices; only the description differs.
item_1, item_2 = (
    Item(description=label, quantity=1, unitPrice=5.00, totalPrice=5.00, discountAmount=0.00)
    for label in ("PEDANTIC PEAS", "CRAZY CARROTS")
)

# NOTE(review): the sample amounts do not reconcile (items 10.00 + tax 1.00
# - discount 0.01 vs. total 10.00) — confirm whether that is intentional.
receiptInfo = ReceiptInfo(
    merchant="Walmart",
    address="123 Manoa Rd",
    city="Honolulu",
    state="HI",
    phoneNumber="1234567890",
    receiptDate="01/01/1999",
    receiptTime="12:00am",
    totalItems=2,
    paymentType="credit",
    creditCardType="visa",
    tax=1.00,
    total=10.00,
    totalDiscount=0.01,
    ITEMS=[item_1, item_2],
)
print(receiptInfo)

merchant='Walmart' address='123 Manoa Rd' city='Honolulu' state='HI' phoneNumber='1234567890' receiptDate='01/01/1999' receiptTime='12:00am' totalItems=2 diningOptions='None' paymentType=<PaymentType.CREDIT: 'credit'> creditCardType='visa' totalDiscount=0.01 tax=1.0 total=10.0 ITEMS=[Item(description='PEDANTIC PEAS', predictedDescription='<UNKNOWN>', includedItems=[], quantity=1, unitPrice=5.0, totalPrice=5.0, discountAmount=0.0), Item(description='CRAZY CARROTS', predictedDescription='<UNKNOWN>', includedItems=[], quantity=1, unitPrice=5.0, totalPrice=5.0, discountAmount=0.0)]


In [16]:
# EXAMPLE 2
# Five single-quantity items: for each one the unit price equals the total price,
# so every item is described by a (description, price) pair.
item_1, item_2, item_3, item_4, item_5 = (
    Item(description=label, quantity=1, unitPrice=price, totalPrice=price, discountAmount=0.00)
    for label, price in (
        ("SY RAMEN HOT MULTI", 8.99),
        ("IND MI GORENG NOOD", 4.99),
        ("HT BEEF DUMPLING D", 19.99),
        ("NS SHRIMP HOT SNCK", 1.99),
        ("ME AZUKI ICE CREAM", 6.49),
    )
)

# state and phoneNumber are passed as the "<UNKNOWN>" placeholder in this example.
receiptInfo = ReceiptInfo(
    merchant="H MART",
    address="458 Keawe st",
    city="Honolulu",
    state="<UNKNOWN>",
    phoneNumber="<UNKNOWN>",
    receiptDate="07/15/23",
    receiptTime="08:16pm",
    totalItems=5,
    paymentType="credit",
    creditCardType="Discover",
    tax=2.00,
    total=44.45,
    totalDiscount=0.00,
    ITEMS=[item_1, item_2, item_3, item_4, item_5],
)
print(receiptInfo)

merchant='H MART' address='458 Keawe st' city='Honolulu' state='<UNKNOWN>' phoneNumber='<UNKNOWN>' receiptDate='07/15/23' receiptTime='08:16pm' totalItems=5 diningOptions='None' paymentType=<PaymentType.CREDIT: 'credit'> creditCardType='Discover' totalDiscount=0.0 tax=2.0 total=44.45 ITEMS=[Item(description='SY RAMEN HOT MULTI', predictedDescription='<UNKNOWN>', includedItems=[], quantity=1, unitPrice=8.99, totalPrice=8.99, discountAmount=0.0), Item(description='IND MI GORENG NOOD', predictedDescription='<UNKNOWN>', includedItems=[], quantity=1, unitPrice=4.99, totalPrice=4.99, discountAmount=0.0), Item(description='HT BEEF DUMPLING D', predictedDescription='<UNKNOWN>', includedItems=[], quantity=1, unitPrice=19.99, totalPrice=19.99, discountAmount=0.0), Item(description='NS SHRIMP HOT SNCK', predictedDescription='<UNKNOWN>', includedItems=[], quantity=1, unitPrice=1.99, totalPrice=1.99, discountAmount=0.0), Item(description='ME AZUKI ICE CREAM', predictedDescription='<UNKNOWN>', inclu