In [None]:
import zipfile
from lxml import etree
import re
import string

import re
from collections import defaultdict

In [None]:
def process_footnote_body(elem, inside_ins=False, inside_del=False):
    """
    Recursively flatten a footnote's XML subtree into a marked-up string.

    Elements are recognised by their exact local name (the part of the tag
    after the ``{namespace}`` prefix):

    * ``ins`` / ``del``  -- emit nothing themselves; their children are
      processed with ``inside_ins`` / ``inside_del`` switched on.
    * ``p``              -- emits ``<f_break>`` before its content.
    * ``instrText`` / ``delInstrText`` -- emitted as ``<field>...</field>``.
    * ``t``              -- wrapped in ``<trace>...</trace>`` inside an
      insertion; inside a deletion replaced by ``<trace.deleted/>`` unless the
      text contains "Drafter's Note"; otherwise emitted verbatim.
    * ``delText``        -- inside a deletion replaced by ``<trace.deleted/>``
      unless the text contains "Drafter's Note"; otherwise emitted verbatim.

    Every other element contributes only what its descendants produce.

    Args:
        elem: ElementTree-style element (needs ``tag``, ``text`` and child
            iteration) to process.
        inside_ins: True when an ancestor ``ins`` element has been entered.
        inside_del: True when an ancestor ``del`` element has been entered.

    Returns:
        The footnote body as a single string.
    """
    tag = elem.tag
    # Comment / processing-instruction nodes expose a callable ``tag`` rather
    # than a string; they hold no footnote text, so contribute nothing.
    if not isinstance(tag, str):
        return ''

    # Match on the exact local name. Suffix matching (``tag.endswith('t')``)
    # also catches unrelated elements such as ``delText``, ``posOffset`` or
    # ``top``, which made the ``delText`` branch unreachable and leaked
    # non-text values / spurious breaks into the output.
    local = tag.rsplit('}', 1)[-1]

    # Tracked insertion / deletion wrappers: only adjust the context flags.
    if local == 'ins':
        return ''.join(
            process_footnote_body(child, inside_ins=True, inside_del=inside_del)
            for child in elem
        )
    if local == 'del':
        return ''.join(
            process_footnote_body(child, inside_ins=inside_ins, inside_del=True)
            for child in elem
        )

    result = []
    text = elem.text

    if local == 'p':
        # Paragraph break
        result.append('<f_break>')
    elif local in ('instrText', 'delInstrText') and text:
        # Field code (live or deleted)
        result.append(f'<field>{text}</field>')
    elif local == 't' and text:
        if inside_ins:
            result.append(f'<trace>{text}</trace>')
        elif inside_del:
            # Drafter's notes are kept even when deleted.
            result.append(text if "Drafter's Note" in text else '<trace.deleted/>')
        else:
            result.append(text)
    elif local == 'delText' and text:
        # Deleted run text: collapse to a marker unless it is a drafter's note.
        if inside_del and "Drafter's Note" not in text:
            result.append('<trace.deleted/>')
        else:
            result.append(text)

    # Recursively process children with the current context.
    for child in elem:
        result.append(process_footnote_body(child, inside_ins=inside_ins, inside_del=inside_del))

    return ''.join(result)

In [None]:
def extract_full_text_with_footnotes_track(doc_tree, footnote_tree):
    """
    Flatten a WordprocessingML document tree into marked-up text.

    The tree is walked depth-first and elements are recognised by their exact
    local name (the part of the tag after the ``{namespace}`` prefix):

    * ``ins``  -- content wrapped in ``<trace>...</trace>`` (nothing is
      emitted when the insertion produces no output).
    * ``del``  -- replaced by a single ``<trace.deleted/>`` unless the deleted
      content contains "Drafter's Note", in which case it is kept as is.
    * ``p``    -- emits a newline before its content.
    * ``instrText`` / ``delInstrText`` -- emitted as ``<field>...</field>``;
      a field code containing ``url.ref.id`` makes the next emitted text
      element come out as ``<url>...</url>``.
    * ``footnoteReference`` -- emits
      ``<footnote><footnote.body>...</footnote.body></footnote>`` with the
      body built by ``process_footnote_body`` from the footnote whose ``w:id``
      matches (empty body if there is no such footnote).
    * ``t`` / ``delText`` -- emitted verbatim, except that text sitting in the
      same run (``r``) as a ``footnoteReference`` is dropped.
    * ``br``   -- emits ``<br>``.

    Args:
        doc_tree: root element of ``word/document.xml``.
        footnote_tree: root element of ``word/footnotes.xml``; ``None`` is
            treated as "no footnotes".

    Returns:
        The flattened document as one string.
    """
    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    ns = {'w': w_ns}
    id_attr = '{%s}id' % w_ns

    # Index footnotes by their w:id so references can be resolved directly.
    footnote_map = {}
    if footnote_tree is not None:
        for fn in footnote_tree.findall('.//w:footnote', namespaces=ns):
            footnote_map[fn.get(id_attr)] = fn

    def local_name(node):
        """Return the tag without its namespace, or None for comment/PI nodes."""
        tag = node.tag
        if not isinstance(tag, str):
            return None
        return tag.rsplit('}', 1)[-1]

    def process_element(elem, inside_ins=False, inside_del=False, state=None,
                        in_footnote_run=False):
        """Return the list of output fragments for ``elem`` and its subtree.

        ``state`` is shared across the whole walk and carries the pending
        "wrap next text in <url>" flag. ``in_footnote_run`` is True only for
        direct children of a run that also holds a footnoteReference.
        """
        if state is None:
            state = {'wrap_next_t_with_url': False}

        # Exact local-name matching: suffix matching also caught unrelated
        # elements (e.g. ``posOffset`` text, ``top`` borders).
        local = local_name(elem)
        if local is None:
            # Comment / processing instruction: no document text.
            return []

        if local == 'ins':
            ins_parts = []
            for child in elem:
                ins_parts.extend(process_element(child, inside_ins=True, inside_del=inside_del, state=state))
            if not ins_parts:
                return []
            return ['<trace>'] + ins_parts + ['</trace>']

        if local == 'del':
            del_parts = []
            for child in elem:
                del_parts.extend(process_element(child, inside_ins=inside_ins, inside_del=True, state=state))
            if not del_parts:
                return []
            # Substring test on the joined text: ``in`` on the list itself
            # only matched a fragment that was *exactly* "Drafter's Note".
            if "Drafter's Note" in ''.join(del_parts):
                return del_parts
            return ['<trace.deleted/>']

        result = []

        if local == 'p':
            result.append('\n')
        elif local in ('instrText', 'delInstrText'):
            if elem.text:
                result.append(f'<field>{elem.text}</field>')
                if 'url.ref.id' in elem.text:
                    state['wrap_next_t_with_url'] = True
        elif local == 'footnoteReference':
            footnote_elem = footnote_map.get(elem.get(id_attr))
            if footnote_elem is not None:
                traced_body = process_footnote_body(footnote_elem, inside_ins, inside_del)
            else:
                traced_body = ''
            result.append(f'<footnote><footnote.body>{traced_body}</footnote.body></footnote>')
        elif local in ('t', 'delText'):
            # Text in the same run as a footnoteReference is not printed.
            if elem.text and not in_footnote_run:
                if state.get('wrap_next_t_with_url', False):
                    # Only the first text after the field code consumes the flag.
                    result.append(f'<url>{elem.text}</url>')
                    state['wrap_next_t_with_url'] = False
                else:
                    result.append(elem.text)
        elif local == 'br':
            # Just add <br>, don't touch the url flag
            result.append('<br>')

        # Tell direct children of a run whether that run carries a footnote
        # reference (equivalent to inspecting the text element's parent).
        children = list(elem)
        has_footnote_ref = local == 'r' and any(
            local_name(child) == 'footnoteReference' for child in children
        )
        for child in children:
            result.extend(process_element(
                child,
                inside_ins=inside_ins,
                inside_del=inside_del,
                state=state,
                in_footnote_run=has_footnote_ref,
            ))

        return result

    return ''.join(process_element(doc_tree))

Execution starts here

In [None]:
# Path of the .docx file to convert; fill this in before running the cell.
docx_path = r''

with zipfile.ZipFile(docx_path, 'r') as docx_zip:
    xml_content = docx_zip.read('word/document.xml')
    # A package without footnotes has no footnotes part, and reading a missing
    # member raises KeyError; fall back to an empty footnotes root instead.
    if 'word/footnotes.xml' in docx_zip.namelist():
        footnote_xml_content = docx_zip.read('word/footnotes.xml')
    else:
        footnote_xml_content = (
            b'<w:footnotes xmlns:w="http://schemas.openxmlformats.org/'
            b'wordprocessingml/2006/main"/>'
        )

doc_tree = etree.fromstring(xml_content)
footnote_tree = etree.fromstring(footnote_xml_content)

whole_text = extract_full_text_with_footnotes_track(doc_tree, footnote_tree)

# Pretty-print the main document XML and save it for inspection. The already
# parsed doc_tree is reused so footnote_tree keeps pointing at the footnotes.
pretty_doc_xml = etree.tostring(
    doc_tree,
    pretty_print=True,
    encoding='utf-8',
    xml_declaration=True
)

with open("doc_xml.xml", 'wb') as f:
    f.write(pretty_doc_xml)

# `whole_text` can now be used in another application; here it is written to a file
with open('test_doc_output.txt', 'w', encoding="utf-8") as f:
    f.write(whole_text)