In [3]:
receipt_text = """Marley's Shop
123 Long Rd
Kailua, HI 67530
(808) 555-1234
CASHIER: JOHN
REGISTER #: 6
04/12/2023
Transaction ID: 5769009
PRICE   QTY  TOTAL
APPLES (1 lb)
2.99 2 5.98  1001
-1.00  999
Choco Dream Cookies
7.59 1 7.59   1001
SUBTOTAL
13.57
SALES TAX 8.5%
1.15
TOTAL
-14.72
VISA CARD            14.72
CARD#: ************1234
REFERENCE#: 6789
THANK YOU FOR SHOPPING WITH US!
"""

In [4]:
entities  = """
{
  "ReceiptInfo": {
    "merchant": "Marley's Shop",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "(808) 555-1234",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59
      }
    ]
  }
}
"""

In [5]:
type(entities)

str

In [8]:
type(json.loads(entities))

dict

In [11]:
import json

def find_span(text, entity_text):
    start = text.find(entity_text)
    if start == -1:
        return None
    end = start + len(entity_text)
    return (start, end), entity_text

def convert_to_prodigy_spans(receipt_text, entities):
    text_vals = []
    entities = json.loads(entities)
    prodigy_data = []
    receipt_info = entities["ReceiptInfo"]

    for label, entity_text in [
        ("MERCHANT", receipt_info["merchant"]),
        ("ADDRESS", receipt_info["address"]),
        ("CITY", receipt_info["city"]),
        ("STATE", receipt_info["state"]),
        ("PHONE", receipt_info["phoneNumber"]),
        ("TAX", str(receipt_info["tax"])),
        ("TOTAL", str(receipt_info["total"])),
        ("DATE", receipt_info["receiptDate"])
    ]:
        span, text = find_span(receipt_text, entity_text)
        text_vals.append(text)
        if span:
            start, end = span
            prodigy_data.append({"start": start, "end": end, "label": label})

    # Process item-level entities
    for item in receipt_info["ITEMS"]:
        for label, entity_text in [
            ("ITEM_DESC", item["description"]),
            ("QTY", str(item["quantity"])),
            ("UNIT_PRICE", str(item["unitPrice"])),
            ("TOTAL_PRICE", str(item["totalPrice"])),
            ("DISCOUNT", str(item.get("discountAmount", "")))  # Discount might not always be present
        ]:
            if entity_text:  # Check if the entity text is not empty
                span = find_span(receipt_text, entity_text)
                if span:
                    start, end = span
                    prodigy_data.append({"start": start, "end": end, "label": label})

    return prodigy_data, text_vals

# Example usage
prodigy_spans_true, text_vals = convert_to_prodigy_spans(receipt_text, entities)
print(json.dumps(prodigy_spans_true, indent=2))
print(text_vals)

[
  {
    "start": 0,
    "end": 13,
    "label": "MERCHANT"
  },
  {
    "start": 14,
    "end": 25,
    "label": "ADDRESS"
  },
  {
    "start": 26,
    "end": 32,
    "label": "CITY"
  },
  {
    "start": 34,
    "end": 36,
    "label": "STATE"
  },
  {
    "start": 43,
    "end": 57,
    "label": "PHONE"
  },
  {
    "start": 252,
    "end": 256,
    "label": "TAX"
  },
  {
    "start": 264,
    "end": 269,
    "label": "TOTAL"
  },
  {
    "start": 86,
    "end": 96,
    "label": "DATE"
  },
  {
    "start": [
      140,
      153
    ],
    "end": "APPLES (1 lb)",
    "label": "ITEM_DESC"
  },
  {
    "start": [
      15,
      16
    ],
    "end": "2",
    "label": "QTY"
  },
  {
    "start": [
      154,
      158
    ],
    "end": "2.99",
    "label": "UNIT_PRICE"
  },
  {
    "start": [
      161,
      165
    ],
    "end": "5.98",
    "label": "TOTAL_PRICE"
  },
  {
    "start": [
      173,
      176
    ],
    "end": "1.0",
    "label": "DISCOUNT"
  },
  {
    "start": [


In [12]:
s = prodigy_spans_true[2]['start']
e = prodigy_spans_true[2]['end']
receipt_text[s:e]

'Kailua'

In [13]:
text_vals[4]

'(808) 555-1234'

In [14]:
ner_predicted = """{
  "ReceiptInfo": {
    "merchant": "Marley's",
    "address": "123 Long Rd",
    "city": "Kailua",
    "state": "HI",
    "phoneNumber": "555-1234",
    "tax": 1.15,
    "total": 14.72,
    "receiptDate": "04/12/2023",
    "ITEMS": [
      {
        "description": "APPLES (1 lb)",
        "quantity": 2,
        "unitPrice": 2.99,
        "totalPrice": 5.98,
        "discountAmount": 1.00
      },
      {
        "description": "Choco Dream Cookies",
        "quantity": 1,
        "unitPrice": 7.59,
        "totalPrice": 7.59
      }
    ]
  }
}"""

prodigy_spans_predicted, text_vals_predicted = convert_to_prodigy_spans(receipt_text, ner_predicted)
print(json.dumps(prodigy_spans_predicted, indent=2))
print(text_vals_predicted)

[
  {
    "start": 0,
    "end": 8,
    "label": "MERCHANT"
  },
  {
    "start": 14,
    "end": 25,
    "label": "ADDRESS"
  },
  {
    "start": 26,
    "end": 32,
    "label": "CITY"
  },
  {
    "start": 34,
    "end": 36,
    "label": "STATE"
  },
  {
    "start": 49,
    "end": 57,
    "label": "PHONE"
  },
  {
    "start": 252,
    "end": 256,
    "label": "TAX"
  },
  {
    "start": 264,
    "end": 269,
    "label": "TOTAL"
  },
  {
    "start": 86,
    "end": 96,
    "label": "DATE"
  },
  {
    "start": [
      140,
      153
    ],
    "end": "APPLES (1 lb)",
    "label": "ITEM_DESC"
  },
  {
    "start": [
      15,
      16
    ],
    "end": "2",
    "label": "QTY"
  },
  {
    "start": [
      154,
      158
    ],
    "end": "2.99",
    "label": "UNIT_PRICE"
  },
  {
    "start": [
      161,
      165
    ],
    "end": "5.98",
    "label": "TOTAL_PRICE"
  },
  {
    "start": [
      173,
      176
    ],
    "end": "1.0",
    "label": "DISCOUNT"
  },
  {
    "start": [
 

### THIS IS FROM THE DOCUMENTATION

In [16]:
"https://pypi.org/project/nervaluate/#:~:text=tab%20delimited%20strings.-,Prodigy%20spans,-true%20%3D%20%5B%0A%20%20%20%20%5B%7B%22label%22%3A%20%22PER"

true = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2},
     {"label": "LOC", "start": 3, "end": 4}]
]

pred = [
    [{"label": "PER", "start": 2, "end": 4}],
    [{"label": "LOC", "start": 1, "end": 2},
     {"label": "LOC", "start": 3, "end": 4}]
]

from nervaluate import Evaluator

evaluator = Evaluator(true, pred, tags=['LOC', 'PER'])

# Returns overall metrics and metrics for each tag

results, results_per_tag = evaluator.evaluate()

results

{'ent_type': {'correct': 3,
  'incorrect': 0,
  'partial': 0,
  'missed': 0,
  'spurious': 0,
  'possible': 3,
  'actual': 3,
  'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0},
 'partial': {'correct': 3,
  'incorrect': 0,
  'partial': 0,
  'missed': 0,
  'spurious': 0,
  'possible': 3,
  'actual': 3,
  'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0},
 'strict': {'correct': 3,
  'incorrect': 0,
  'partial': 0,
  'missed': 0,
  'spurious': 0,
  'possible': 3,
  'actual': 3,
  'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0},
 'exact': {'correct': 3,
  'incorrect': 0,
  'partial': 0,
  'missed': 0,
  'spurious': 0,
  'possible': 3,
  'actual': 3,
  'precision': 1.0,
  'recall': 1.0,
  'f1': 1.0}}

In [17]:
results_per_tag

{'LOC': {'ent_type': {'correct': 2,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 0,
   'possible': 2,
   'actual': 2,
   'precision': 1.0,
   'recall': 1.0,
   'f1': 1.0},
  'partial': {'correct': 2,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 0,
   'possible': 2,
   'actual': 2,
   'precision': 1.0,
   'recall': 1.0,
   'f1': 1.0},
  'strict': {'correct': 2,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 0,
   'possible': 2,
   'actual': 2,
   'precision': 1.0,
   'recall': 1.0,
   'f1': 1.0},
  'exact': {'correct': 2,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 0,
   'possible': 2,
   'actual': 2,
   'precision': 1.0,
   'recall': 1.0,
   'f1': 1.0}},
 'PER': {'ent_type': {'correct': 1,
   'incorrect': 0,
   'partial': 0,
   'missed': 0,
   'spurious': 0,
   'possible': 1,
   'actual': 1,
   'precision': 1.0,
   'recall': 1.0,
   'f1': 1.0},
  'partial': {'correct': 1,
   'incorrect': 0,
   'parti