Install the required Python modules. Depending on your environment, you may need additional modules.

In [None]:
# module required to parse PDF file
%pip install pdfplumber
# for the creation of tables, similar to R
%pip install pandas

Import the required dependencies:

In [None]:
import os
import pdfplumber
from dotenv import load_dotenv
import anthropic
from IPython.display import Markdown, display, update_display
import json
import pandas as pd

Define the path to the folder containing your PDF files:

In [None]:
# Folder that holds the receipt PDFs; replace the placeholder with your own path.
# The raw-string prefix (r"...") keeps the Windows backslashes literal.
folderPath = r"C:\your\folder"

Add the names of PDF files to a list:

In [None]:
def listPDFs(path):
    """Return the sorted names of the PDF files located directly in *path*.

    The extension check ignores case, so "scan.PDF" is found as well as
    "scan.pdf". Sub-folders are skipped even when their name ends in ".pdf",
    because they cannot be opened as documents. The result is sorted so the
    processing order is reproducible (os.listdir gives no ordering guarantee).

    Only file names are returned, not full paths.
    """
    return sorted(
        name
        for name in os.listdir(path)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(path, name))
    )

In [None]:
# Names (not full paths) of the entries in folderPath that listPDFs identifies as PDFs
listOfReceipts = listPDFs(folderPath)

This code uses an API key for Anthropic. You can substitute another LLM, though I haven’t tested alternatives yet. For background and examples, see Ed Donner’s Udemy course and GitHub repository (linked in the README). Load your API key below:

In [None]:
# Read variables from a local .env file into the process environment;
# override=True lets values from .env replace variables that are already set.
load_dotenv(override=True)
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')

# Warn straight away if the key is missing, instead of letting every receipt
# fail later with an authentication error inside the processing loop.
if not anthropic_api_key:
    print("ANTHROPIC_API_KEY is not set - add it to your .env file or environment before calling the API.")

Extract all text from the PDF files and add it to a dictionary:

In [None]:
# read every listed PDF in a folder and collect its text, keyed by file name
def extractTextFromReceipts(folderPath, listOfReceipts):
    """Extract the text of each listed PDF and return it in a dictionary.

    Args:
        folderPath: Folder that contains the PDF files.
        listOfReceipts: File names (relative to folderPath) to process.

    Returns:
        A dict mapping each file name to the text of all its pages, with a
        newline appended after every page that yielded text. The value is an
        empty string when no page yielded text, and None when opening or
        reading the PDF raised an exception (the error is printed).
    """
    textByReceipt = {}

    for receiptName in listOfReceipts:
        filePath = os.path.join(folderPath, receiptName)

        try:
            with pdfplumber.open(filePath) as pdf:
                # Pages without extractable text give a falsy result and are skipped
                pageTexts = (page.extract_text() for page in pdf.pages)
                receiptText = "".join(
                    pageText + "\n" for pageText in pageTexts if pageText
                )
        except Exception as e:
            print(f"Error processing {filePath}: {e}")
            # Any failure discards partial text so the caller sees a clear None
            receiptText = None

        textByReceipt[receiptName] = receiptText

    return textByReceipt

In [None]:
# Dictionary mapping each receipt's file name to its extracted text
# (None where the PDF could not be read)
allExtractedReceipts = extractTextFromReceipts(folderPath, listOfReceipts)

Connect to Anthropic:

In [None]:
# Create the Anthropic client used for the API calls further down.
# NOTE(review): no api_key is passed here, so the client presumably picks up
# ANTHROPIC_API_KEY from the environment - confirm against the SDK docs.
claude = anthropic.Anthropic()

Set the system message to guide the LLM’s behavior and response style. Ensure the output format is structured in a way that can later be converted into a Python dictionary.

In [None]:
# System prompt, part 1: describes the extraction task and how to deal with
# poor OCR, empty text, and multiple totals or dates.
system_message = """
    You are an expert in interpreting receipts and bills from text extracts. Your task is to identify the transaction date, total spend, and the overall type of expense. Assume all amounts are in British Pounds unless the text clearly indicates otherwise.
    You understand that OCR quality may be poor — for example, digits like '1' and '7' may be confused. If the receipt contains multiple line items, calculate their sum and compare it to any stated total. If multiple totals are present, use judgment to determine the most likely correct figure.
    If the receipt text is empty or unreadable, return "NA" for all fields.
    If multiple dates appear, select the earliest one that plausibly represents the transaction date. Format your response in JSON as shown below:\n """
# System prompt, part 2: an example of the JSON object the model should return.
# Its keys match the fallback record that is built when a receipt cannot be parsed.
system_message += """
    {"name of file": "nameOfPDFFile.pdf", 
        "date": "15 December 2021",
        "total spend": "£18.43",
        "type": "train ticket"}
    """

Loop through each entry in the dictionary of parsed receipt texts and call the LLM to extract the transaction date, total amount, and expense type. Combine the results into a DataFrame, which can later be joined with bank statement data (not shown).

In [None]:
# Initialise list to collect parsed outputs
parsedReceipts = []


def _emptyReceiptRecord(filename):
    """Return the placeholder row used when a receipt cannot be interpreted."""
    return {
        "name of file": filename,
        "date": "NA",
        "total spend": "NA",
        "type": "NA",
    }


def _parseReceiptJson(responseText):
    """Return the JSON object embedded in the model's reply as a dict.

    The reply may wrap the object in a Markdown code fence or surround it
    with explanatory prose, so everything outside the outermost pair of
    braces is ignored before parsing.

    Raises:
        ValueError: if the reply contains no "{...}" span, or the span is not
            valid JSON (json.JSONDecodeError is a subclass of ValueError).
    """
    start = responseText.find("{")
    end = responseText.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model response")
    return json.loads(responseText[start:end + 1])


# Loop through each receipt
for filename, text in allExtractedReceipts.items():
    # Extraction failed (None) or produced no text: the system prompt asks for
    # "NA" in every field in that case, so record it directly and save the API call.
    if not text or not text.strip():
        print(f"Skipping {filename}: no text extracted")
        parsedReceipts.append(_emptyReceiptRecord(filename))
        continue

    user_prompt = f"""
        Please extract the transaction date, total spend, and type of expense from the receipt text below. The name of the PDF file may offer clues about the date or nature of the transaction.
      
        name of file: {filename}

        Text extract of receipt:
        {text}
        """

    try:
        # Call Claude; temperature=0 keeps the extraction as repeatable as possible
        message = claude.messages.create(
            model="claude-3-haiku-20240307",
            max_tokens=400,
            temperature=0,
            system=system_message,
            messages=[{"role": "user", "content": user_prompt}],
        )

        # Pull the JSON object out of the reply and parse it
        parsed = _parseReceiptJson(message.content[0].text)

        # Use the real file name rather than whatever the model echoed back,
        # so rows can be matched to files reliably
        parsed["name of file"] = filename

        # Append to list
        parsedReceipts.append(parsed)

    except Exception as e:
        # Keep going with the remaining receipts; the failed one gets an "NA" row
        print(f"Error processing {filename}: {e}")
        parsedReceipts.append(_emptyReceiptRecord(filename))

# Convert to DataFrame
df_receipts = pd.DataFrame(parsedReceipts)

# Preview
print(df_receipts.head())


You can save the resulting DataFrame as a CSV file for downstream use.

In [None]:
# Save the data-frame as a CSV file in the receipts folder. Building the path
# from folderPath means the folder only has to be configured in one place.
# "utf-8-sig" writes a byte-order mark, which helps spreadsheet programs
# recognise the file as UTF-8 (relevant for the "£" sign).
outputPath = os.path.join(folderPath, "receiptsSummary.csv")
df_receipts.to_csv(outputPath, index=False, encoding="utf-8-sig")