In [None]:
import json
import os
import subprocess

import numpy as np
import pandas as pd

from base64 import b64encode
from io import BytesIO
from zipfile import ZipFile

from anthropic import Anthropic
from requests import get
from tqdm import tqdm

In [None]:
_dir = "./PREM Historical records/"


def pdf_output_dir(pdf_path):
    """Return the directory that receives the page images of ``pdf_path``.

    The directory is the PDF path without its extension. The extension is
    matched case-insensitively, so a ``.PDF`` file does not map onto its own
    path (which is what ``removesuffix(".pdf")`` would return for it).

    Args:
        pdf_path: path of a PDF file, as a string.

    Returns:
        ``pdf_path`` with a trailing ``.pdf``/``.PDF`` removed; the unchanged
        path when it carries no PDF extension.
    """
    stem, ext = os.path.splitext(pdf_path)
    return stem if ext.lower() == ".pdf" else pdf_path


# every PDF found below _dir, as "<root>/<name>" paths in os.walk order
files = [
    root + "/" + name
    for root, _subdirs, names in os.walk(_dir)
    for name in names
    if name.endswith((".pdf", ".PDF"))
]

for file in files:
    out_dir = pdf_output_dir(file)
    # exist_ok keeps the cell re-runnable after an earlier or partial run
    os.makedirs(out_dir, exist_ok=True)
    # ghostscript installation required
    results = subprocess.run(
        [
            "gs",
            "-dNOPAUSE",
            "-sDEVICE=jpeg",
            "-r250",
            # one JPEG per page, written as -01.jpg, -02.jpg, ... inside out_dir
            f"-sOutputFile={out_dir}/-%02d.jpg",
            f"{file}",
            "-dBATCH",
        ],
        stdout=subprocess.PIPE,
    )
    print(results.stdout.decode())
    if results.returncode != 0:
        # surface conversion failures instead of silently moving on
        print(f"ghostscript failed for {file} (exit code {results.returncode})")

In [None]:
# NOTE(review): this cell creates "PREM Historical records", the folder the PDF
# conversion cell walks, so it has to be run before that cell.

# The key is read from the environment so it is never saved in the notebook.
client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
image_media_type = "image/jpeg"


# https://evidence-hou.se/events/big-llm-hack-24/data/correspondence.html
# Download and unpack only when the records folder is missing, so that
# re-running the cell neither re-downloads the archive nor fails on the rename.
if not os.path.isdir("PREM Historical records"):
    response = get(
        "https://evidence-house-public.s3.eu-west-2.amazonaws.com/national_archive_records.zip",
        timeout=300,
    )
    # fail here with the HTTP error rather than handing an error page to ZipFile
    response.raise_for_status()
    content = response.content

    with ZipFile(BytesIO(content), "r") as f:
        f.extractall("./")
    # drop the leading space from the folder name (presumably how the archive's
    # top-level folder is named — confirm against the zip contents)
    os.rename(" PREM Historical records", "PREM Historical records")

In [None]:
# Metadata requested from the model for every page. Each entry is written as
# (title, tag, description) and expanded into a dict with those three keys;
# "tag" is the JSON key name the model is told to use.
fields = [
    {"title": title, "tag": tag, "description": description}
    for title, tag, description in (
        (
            "Date",
            "date",
            "The date the letter was written in YYYY-MM-DD format, or N/A if unknown. If the year and month are known but not the day, use the first day of the month.",
        ),
        (
            "Sender",
            "sender",
            "The person or department which sent the letter, or N/A if unknown",
        ),
        (
            "Recipient",
            "recipient",
            "The person or department which received the letter, or N/A if unknown",
        ),
        (
            "Subject",
            "subject",
            "A one-line subject of the letter if present, otherwise infer this yourself from the context",
        ),
        (
            "Document type",
            "document-type",
            "State the category type of the document (letter, meeting minutes, balance sheet etc)",
        ),
        # TODO: a boolean field for handwritten-or-not was forgotten here
    )
]

In [None]:
# One bullet per requested field: its title, its description and the JSON key
# the model should use for it.
bullet_template = "- {}: {}. Use the key name '{}' ."
fields_string = "\n".join(
    [
        bullet_template.format(field["title"], field["description"], field["tag"])
        for field in fields
    ]
)

# Instruction paragraphs separated by blank lines; strip() removes the
# surrounding whitespace (including the trailing gap when there are no fields).
prompt = "\n\n".join(
    [
        'Transcribe the text in this image in full, in json format, with the key "text".',
        "Please also extract the following fields:",
        fields_string,
    ]
).strip()

print(prompt)

In [None]:
# NOTE(review): `files` from the conversion cell lists the source PDFs, but the
# request cell sends the payload as image/jpeg and strips a ".jpg" suffix from
# each path, so the page images written under _dir are collected here instead.
# Sorted so the page order does not depend on os.walk's listing order.
files = sorted(
    root + "/" + name
    for root, _subdirs, names in os.walk(_dir)
    for name in names
    if name.endswith(".jpg")
)

# base64 text of every page image, in the same order as `files`
image_data = []
for file in files:
    with open(file, "rb") as f:
        # the import cell brings in `b64encode` directly; the `base64` module
        # name itself is not in scope
        image_data.append(b64encode(f.read()).decode("utf-8"))

In [None]:
# Ask the model to transcribe every page image and store the raw reply next to
# the image as "<image path without .jpg>.txt". Pages that already have a reply
# on disk are skipped, so the cell can be re-run after an interruption.
# zip() has no length, so tqdm needs total= to draw a real progress bar.
for file, image in tqdm(
    zip(files, image_data), total=min(len(files), len(image_data))
):
    # removesuffix drops exactly ".jpg"; rstrip(".jpg") strips any run of the
    # characters '.', 'j', 'p', 'g' and can eat into the file name itself
    txt_path = file.removesuffix(".jpg") + ".txt"
    if os.path.exists(txt_path):
        print("skip")
        continue
    print("api")
    message = client.messages.create(
        model="claude-3-haiku-20240307",
        # NOTE(review): long transcriptions may be cut off at this limit — confirm
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": image_media_type,
                            "data": image,
                        },
                    },
                    {"type": "text", "text": prompt},
                ],
            }
        ],
    )
    # the reply text is taken from the first content block of the message
    parsed_doc = message.content[0].text
    with open(txt_path, "w") as f:
        f.write(parsed_doc)

In [None]:
# NOTE(review): this walks the parent directory ("../") whereas the earlier
# cells use "./" — presumably it was run from a sub-folder; confirm the
# working directory before relying on it.
files = [
    root + "/" + name
    for root, _subdirs, names in os.walk("../")
    for name in names
    if name.endswith(".jpg")
]

# stand-in record used when a reply contains no JSON object at all
EMPTY_RECORD = '{\n    "text": "",\n    "date": "N/A",\n    "sender": "N/A",\n    "recipient": "N/A",\n    "subject": "N/A",\n    "document-type": "N/A"\n}'


def parse_response(raw):
    """Parse the JSON object embedded in one saved model reply.

    Text before the first "{" and after the last "}" is discarded. A reply
    without any "{" is replaced by ``EMPTY_RECORD``. When the remaining text is
    not valid JSON and its first "}" comes before the "date" key, the reply is
    treated as two consecutive objects and they are merged into one.

    Args:
        raw: full text of a saved reply.

    Returns:
        The parsed dict, or None when the reply is incomplete (no closing
        brace, or no ``"date":`` key).

    Raises:
        ValueError: the text is complete but still is not valid JSON.
    """
    start = raw.find("{")
    data = EMPTY_RECORD if start == -1 else raw[start:]
    # keep everything up to the last closing brace ("" when there is none)
    data = data[: data.rfind("}") + 1]
    if "}" not in data or '"date":' not in data:
        return None
    try:
        # valid as-is: do not run the repair below, it would break replies
        # whose transcribed text happens to contain a "}"
        return json.loads(data)
    except ValueError:
        pass
    first_close = data.index("}")
    if first_close < data.index('"date":'):
        second_open = data.find("{", 1)
        if second_open != -1:
            # '{"text": ...}{"date": ...}' -> '{"text": ...,"date": ...}'
            data = data[:first_close] + "," + data[second_open + 1 :]
    return json.loads(data)


json_list = []
for file in files:
    txt_path = file.removesuffix(".jpg") + ".txt"
    try:
        with open(txt_path, "r") as f:
            raw = f.read()
    except FileNotFoundError:
        # image without a saved reply: report it and keep going
        print(f"{file} has no transcription file")
        continue
    try:
        json_f = parse_response(raw)
    except ValueError:
        print(txt_path + " could not process file")
        continue
    if json_f is None:
        print(f"{file} is not complete")
        continue
    json_f["source"] = file.removeprefix("../PREM Historical records/")
    json_list.append(json_f)

# missing delimiters (",") after a kv pair is also an issue

In [None]:
# One row per parsed page; the "N/A" placeholder strings become real missing
# values so that dropna() works on them.
records_frame = pd.DataFrame.from_records(json_list)
df = records_frame.replace(to_replace="N/A", value=np.nan)
# Disabled: zero-pad the file name in "source" and sort the rows by it.
# df["source"] = df["source"].apply(lambda x: "/".join(x.split("/")[:-1]) + "/" + x.split("/")[-1].zfill(8))
# df = df.sort_values(["source"]).reset_index(drop=True)

In [None]:
# keep only the rows where neither the sender nor the recipient is missing
subset = df.loc[df[["sender", "recipient"]].notna().all(axis=1)]

In [None]:
# Rows that name both a sender and a recipient and that the model labelled as
# a letter ("Letter" or "letter"), written to CSV without the index column.
subset = (
    df.dropna(subset=["sender", "recipient"])
    .loc[lambda frame: frame["document-type"].isin(["Letter", "letter"])]
)
subset.to_csv("single-page-letters.csv", index=False)