In [39]:
from docx import Document
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")


In [40]:
doc = Document("test_documents/93A_demand_letter_sample.docx")

for pnum, para in enumerate(doc.paragraphs):
    for rnum, run in enumerate(para.runs):
        if "date" in run.text.lower():
            run.text = run.text.replace("date", "DATE")
        print((pnum, rnum), run.text, run.style.name)

(0, 0) Your name Default Paragraph Font
(1, 0) Your address Default Paragraph Font
(2, 0) Your telephone  Default Paragraph Font
(2, 1) number Default Paragraph Font
(3, 0) Date Default Paragraph Font
(4, 0) Name of Merchant Default Paragraph Font
(5, 0) Merchant's address Default Paragraph Font
(7, 0) Dear Merchant: Default Paragraph Font
(9, 0) Under the provisions of Massachusetts General Laws, Chapter 93A, Section 9, I hereby make written Default Paragraph Font
(10, 0) demand for relief as outlined in that statute. Default Paragraph Font
(12, 0) On or about  Default Paragraph Font
(12, 1) {DATE} Default Paragraph Font
(12, 2) , the following unfair or deceptive act occurred: Default Paragraph Font
(12, 3)   Default Paragraph Font
(13, 0) {EXPLAIN WHAT HAPPENED} Default Paragraph Font
(15, 0) This unfair or deceptive act or practice is, in my opinion, declared unlawful by Section 2 of Chapter 93A, Default Paragraph Font
(16, 0) (you may want to give regulation number, if applicable)

In [90]:
role_description = """
You will process a DOCX document and return a JSON structure that turns the DOCX file into a template based on the following guidelines and examples.

Steps:
1. Analyze the document. Identify placeholder text that should be replaced with a variable name.
2. Mark optional paragraphs with conditional Jinja2 tags.
3. Text intended for verbatim output in the final document will remain unchanged.
4. The result will be a JSON structure that indicates which paragraphs and runs in the DOCX require modifications.

Example input, with paragraph and run numbers indicated:
[
    [0, 1, "Dear John Smith"],
    [1, 0, "This sentence can stay as is in the output and will not in the reply."],
    [2, 0, "[Optional: if you are a tenant, include this paragraph]"],
]

Example reply, indicating paragraph, run, the new text, the starting position and ending position of the text that will be replaced, and whether a new paragraph should be inserted (for conditional text):
{
    "results": [
        [0, 1, "{{ other_parties[0] }}", 5, 15, false],
        [2, 0, "{%p if is_tenant %}", 0, 0, true],
        [3, 0, "{%p endif %}", 0, 0, true],
    ]
}

The reply ONLY contains the modified text and starting/ending positions for the replacements.
"""

rules = """
Rules for variable names:
    1. Variables usually refer to people or their attributes.
    2. People are stored in lists.
    3. We use Docassemble objects and conventions.
    4. Use variable names and patterns from the list below. Invent new variable names when it is appropriate.

List names for people:
    users (for the person benefiting from the form, especially when for a pro se filer)
    other_parties (the opposing party in a lawsuit or transactional party)
    plaintiffs
    defendants
    petitioners
    respondents
    children
    spouses
    parents
    caregivers
    attorneys
    translators
    debt_collectors
    creditors
    witnesses
    guardians_ad_litem
    guardians
    decedents
    interested_parties

    Name Forms:
        users (full name of all users)
        users[0] (Full name)
        users[0].name.first (First name only)
        users[0].name.middle (Middle name only)
        users[0].name.middle_initial() (First letter of middle name)
        users[0].name.last (Last name only)
        users[0].name.suffix (Suffix of user's name only)

Attribute names (replace `users` with the appropriate list name):
    Demographic Data:
        users[0].birthdate (Birthdate)
        users[0].age_in_years() (Calculated age based on birthdate)
        users[0].gender (Gender)
        users[0].gender_female (User is female, for checkbox field)
        users[0].gender_male (User is male, for checkbox field)
        users[0].gender_other (User is not male or female, for checkbox field)
        users[0].gender_nonbinary (User identifies as nonbinary, for checkbox field)
        users[0].gender_undisclosed (User chose not to disclose gender, for checkbox field)
        users[0].gender_self_described (User chose to self-describe gender, for checkbox field)
        user_needs_interpreter (User needs an interpreter, for checkbox field)
        user_preferred_language (User's preferred language)

    Addresses:
        users[0].address.block() (Full address, on multiple lines)
        users[0].address.on_one_line() (Full address on one line)
        users[0].address.line_one() (Line one of the address, including unit or apartment number)
        users[0].address.line_two() (Line two of the address, usually city, state, and Zip/postal code)
        users[0].address.address (Street address)
        users[0].address.unit (Apartment, unit, or suite)
        users[0].address.city (City or town)
        users[0].address.state (State, province, or sub-locality)
        users[0].address.zip (Zip or postal code)
        users[0].address.county (County or parish)
        users[0].address.country (Country)

    Other Contact Information:
        users[0].phone_number (Phone number)
        users[0].mobile_number (A phone number explicitly labeled as the "mobile" number)
        users[0].phone_numbers() (A list of both mobile and other phone numbers)
        users[0].email (Email)

    Signatures:
        users[0].signature (Signature)
        signature_date (Date the form is completed)

    Information about Court and Court Processes:
        trial_court (Court's full name)
        trial_court.address.county (County where court is located)
        trial_court.division (Division of court)
        trial_court.department (Department of court)
        docket_number (Case or docket number)
        docket_numbers (A comma-separated list of docket numbers)
        
When No Existing Variable Name Exists:
    1. Craft short, readable variable names in python snake_case.
    2. Represent people with lists, even if only one person.
    3. Use valid Python variable names within complete Jinja2 tags, like: {{ new_variable_name }}.

    Special endings:
        Suffix _date for date values.
        Suffix _value or _amount for currency values.

    Examples: 
      "(State the reason for eviction)" transforms into `{{ eviction_reason }}`.
"""

In [81]:
doc = Document("test_documents/93A_demand_letter_sample.docx")

items = []
for pnum, para in enumerate(doc.paragraphs):
    for rnum, run in enumerate(para.runs):
        items.append([pnum, rnum, run.text])

In [93]:
import tiktoken
import json
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
token_count = len(encoding.encode(role_description + rules + repr(items)))
schema_json = json.dumps(schema)

# Define the schema variable

response = openai.ChatCompletion.create(
  model="gpt-4", # gpt-3.5-turbo fails at this task
  messages=[
    {
        "role": "system",
        "content": role_description + rules
    },
    {
      "role": "user",
      "content": repr(items)
    }
  ],
  temperature=.5,
  max_tokens=3900 - token_count,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0,
)

print(response)

{
  "id": "chatcmpl-8AOTTzxgE3VkKhZzBYzH9TKOOeqpz",
  "object": "chat.completion",
  "created": 1697488051,
  "model": "gpt-4-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "{\n    \"results\": [\n        [0, 0, \"{{ users[0].name.full() }}\", 0, 9, false],\n        [1, 0, \"{{ users[0].address.block() }}\", 0, 12, false],\n        [2, 0, \"{{ users[0].phone_number }}\", 0, 13, false],\n        [3, 0, \"{{ signature_date }}\", 0, 4, false],\n        [4, 0, \"{{ other_parties[0].name.full() }}\", 0, 14, false],\n        [5, 0, \"{{ other_parties[0].address.block() }}\", 0, 17, false],\n        [7, 0, \"Dear {{ other_parties[0].name.first }}:\", 0, 13, false],\n        [12, 1, \"{{ incident_date }}\", 0, 6, false],\n        [13, 0, \"{{ incident_description }}\", 0, 20, false],\n        [16, 0, \"{{ applicable_regulation_number }}\", 0, 50, false],\n        [17, 0, \"{{ quoted_text_or_section }}\", 0, 89, false],\n        

In [94]:
import json
json_string = response.choices[0]["message"]["content"]

# Convert the string to a JSON object
json_object = json.loads(json_string)

# Access the data in the JSON object
results = json_object["results"]

for item in results:
    print(item[2])


{{ users[0].name.full() }}
{{ users[0].address.block() }}
{{ users[0].phone_number }}
{{ signature_date }}
{{ other_parties[0].name.full() }}
{{ other_parties[0].address.block() }}
Dear {{ other_parties[0].name.first }}:
{{ incident_date }}
{{ incident_description }}
{{ applicable_regulation_number }}
{{ quoted_text_or_section }}
{{ injury_or_loss_description }}
{{ demanded_relief_description }}
{{ users[0].name.full() }}


In [95]:
def replace_text(original, new_text, start_pos, end_pos):
    # Insert new_text at start_pos and delete text from start_pos to end_pos
    before = original[:start_pos]
    after = original[end_pos:]
    return before + new_text + after    

new_paragraphs = []

for item in results:
    if item[5]:
        new_paragraphs.append(item)
    else:
        pnum = item[0]
        rnum = item[1]
        new_text = item[2]
        start_pos = item[3]
        end_pos = item[4]
        para = doc.paragraphs[pnum]
        run = para.runs[rnum]
        run.text = replace_text(run.text, new_text, start_pos, end_pos)

# Don't do the new paragraphs just yet, that will come next

doc.save("test_documents/93A_demand_letter_sample_output.docx")