Converting RTF to DOCX

In [None]:
import pypandoc
from docx import Document
import zipfile
from lxml import etree

In [None]:
def convert_rtf_to_docx(input_file, output_file):
    """Convert an RTF file to DOCX using pypandoc.

    Args:
        input_file: Path of the RTF document to convert.
        output_file: Path the DOCX result is written to.

    Returns:
        None. The outcome is reported on stdout; any exception raised while
        converting is caught and printed instead of being propagated.
    """
    try:
        pypandoc.convert_file(input_file, 'docx', outputfile=output_file)
        success_message = f"Conversion successful: {output_file}"
        print(success_message)
    except Exception as err:
        failure_message = f"An error occurred: {err}"
        print(failure_message)

In [None]:
def extract_xml_from_docx(docx_path, xml_filename):
    """Parse one XML member of a DOCX (zip) archive.

    Args:
        docx_path: Path (or file-like object) of the DOCX archive.
        xml_filename: Name of the archive member to read,
            e.g. 'word/document.xml'.

    Returns:
        The parsed root element, or None when the archive has no member
        with that name.
    """
    with zipfile.ZipFile(docx_path, 'r') as archive:
        if xml_filename not in archive.namelist():
            return None
        return etree.XML(archive.read(xml_filename))

Execution starts here

In [None]:
def main_process(rtf_path, output_docx_file, text_output_file, namespaces):
    """Convert an RTF file to DOCX and dump its main document XML to a file.

    Args:
        rtf_path: Path of the source RTF document.
        output_docx_file: Path where the converted DOCX is written and then
            read back from.
        text_output_file: Path of the file that receives the pretty-printed
            'word/document.xml' content (written as UTF-8 bytes).
        namespaces: Accepted for the caller's convenience; this function
            does not use it.

    Returns:
        None. Nothing is written when the DOCX has no 'word/document.xml'.
    """
    convert_rtf_to_docx(rtf_path, output_docx_file)
    document_root = extract_xml_from_docx(output_docx_file, 'word/document.xml')
    if document_root is None:
        return

    # Save the XML content to a file.
    with open(text_output_file, 'wb') as out_file:
        out_file.write(
            etree.tostring(
                document_root,
                pretty_print=True,
                xml_declaration=True,
                encoding='UTF-8',
            )
        )
    print(f"XML content saved to {text_output_file}")

In [None]:
from xml.dom.minidom import parseString
import xml.etree.ElementTree as ET

def pretty_print_xml(input_file_path, output_file_path):
    """Re-indent an XML file and save the result as UTF-8 text.

    Args:
        input_file_path: Path of the file containing the XML to format.
        output_file_path: Path the indented XML is written to.

    Raises:
        OSError: If either file cannot be opened.
        xml.parsers.expat.ExpatError: If the input is not well-formed XML.
    """
    # Read raw bytes so the parser decodes the document itself (honouring a
    # BOM or the encoding named in the XML declaration) instead of relying on
    # the platform's default text encoding.
    with open(input_file_path, 'rb') as file:
        xml_content = file.read()

    # Parse the XML content
    dom = parseString(xml_content)

    # Pretty print the XML with indentation
    pretty_xml_as_string = dom.toprettyxml(indent="  ")

    # The generated declaration carries no encoding attribute, which XML
    # readers interpret as UTF-8, so the file is written as UTF-8 explicitly.
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write(pretty_xml_as_string)

# Example usage: re-indent a previously saved XML dump into a new file.
# NOTE(review): the input path uses a Windows-style backslash separator while
# the other cells in this notebook use forward slashes — confirm the path
# resolves on the platform this notebook runs on.
# NOTE(review): this call needs the two-argument pretty_print_xml; a later cell
# defines a one-argument function with the same name, so this cell only works
# when run before that one (or after re-running the definition cell).
input_file = r'NYLB\NYLB32revisioncopy.txt'  # Path to your input text file containing XML
output_file = 'vendor_output_32.xml'  # Path to save the pretty printed XML file

pretty_print_xml(input_file, output_file)

In [None]:
# Inputs for the RTF -> DOCX -> XML dump run.
# Prefix map for the WordprocessingML namespace; handed to main_process below.
namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
rtf_path= 'NYLB/NYLB 32 (revision copy).rtf'
# The DOCX is written next to the RTF, with only the extension swapped.
output_docx_file = rtf_path.replace('.rtf', '.docx')
# The XML dump goes to the working directory: "text_" + DOCX base name with a .txt extension.
text_output_file = "text_" + output_docx_file.split('/')[-1].replace('.docx', '.txt')

main_process(rtf_path, output_docx_file,text_output_file,namespaces)



In [None]:
# import PyRTF
from PyRTF import parser


In [None]:
# from pyrtf.parser import RtfParser
from PyRTF.Elements import Document, Text
import PyRTF
import xml.etree.ElementTree as ET
#import parser
from PyRTF.parser import RtfParser

In [None]:

def extract_field_codes_from_rtf(rtf_path):
    """Collect the text of parsed RTF elements that look like field codes.

    Args:
        rtf_path: Path of the RTF file to read.

    Returns:
        A list with the ``text`` of every ``Text`` element in the parsed
        document whose text starts with 'FIELD', in document order.

    NOTE(review): relies on ``parser.RtfParser``, ``Document`` and ``Text`` as
    imported from PyRTF in earlier cells — confirm that the installed PyRTF
    really provides a parser with this API.
    """
    # Read the RTF file
    with open(rtf_path, 'r') as file:
        rtf_content = file.read()

    # Bind the parser instance to its own name. Assigning it to `parser`
    # would make that name local to this function, so the lookup of the
    # imported `parser` module on the same line would fail with
    # UnboundLocalError.
    rtf_parser = parser.RtfParser()
    doc = Document()
    rtf_parser.parse(rtf_content, doc)

    # Assuming field codes have a specific pattern in the RTF: only Text
    # elements whose text starts with 'FIELD' are kept.
    field_codes = [
        element.text
        for element in doc.content
        if isinstance(element, Text) and element.text.startswith('FIELD')
    ]

    return field_codes

def field_codes_to_xml(field_codes):
    """Serialise field codes as a flat XML document.

    Args:
        field_codes: Iterable of field-code strings.

    Returns:
        A unicode XML string with a <FieldCodes> root holding one
        <FieldCode> child per input item, in input order.
    """
    root = ET.Element("FieldCodes")
    for code in field_codes:
        child = ET.SubElement(root, "FieldCode")
        child.text = code
    xml_string = ET.tostring(root, encoding='unicode')
    return xml_string

def main():
    """Extract field codes from the sample RTF, print them as XML and save them.

    The XML is printed to stdout and written to 'field_codes.xml' in the
    working directory.
    """
    rtf_path = r'NYLB/NYLB 32 (revision copy).rtf'  # Replace with your RTF document path
    xml_data = field_codes_to_xml(extract_field_codes_from_rtf(rtf_path))

    # Show the XML and keep a copy on disk.
    print(xml_data)
    with open('field_codes.xml', 'w') as xml_file:
        xml_file.write(xml_data)


if __name__ == "__main__":
    main()

In [None]:
import re
import xml.etree.ElementTree as ET

def clean_rtf_text(text):
    """Strip RTF control words and group braces from a text fragment.

    A control word is a backslash followed by letters, optional digits and
    any trailing whitespace. The result has leading and trailing whitespace
    removed.
    """
    rtf_markup = r'\\[a-zA-Z]+\d*\s*|[{}]'
    without_markup = re.sub(rtf_markup, '', text)
    return without_markup.strip()

def extract_rtf_fields_to_xml(rtf_file_path, xml_output_path):
    """Extract fields from an RTF file and save them as XML.

    Every regex match becomes a <Field id="N"> element (N starting at 1) with
    an <Instruction> child and, when a result was captured, a <Result> child.
    Both texts are passed through clean_rtf_text first.

    Args:
        rtf_file_path: Path of the RTF file (read as UTF-8, undecodable
            bytes ignored).
        xml_output_path: Path of the XML file to write.

    Returns:
        None. Progress and errors are printed; exceptions are not propagated.
    """
    # Pattern to match fields with optional results
    field_pattern = r'\\field\s*\{[^}]*\\fldinst\s*\{([^}]*)\}[^}]*(?:\\fldrslt\s*\{([^}]*)\})?[^}]*\}'

    try:
        with open(rtf_file_path, 'r', encoding='utf-8', errors='ignore') as rtf_file:
            rtf_content = rtf_file.read()

        matches = re.findall(field_pattern, rtf_content, re.IGNORECASE | re.DOTALL)
        print("MAtches found:", matches)

        # Build the XML structure, numbering the fields from 1.
        root = ET.Element('RTFFields')
        for field_no, (instruction, result) in enumerate(matches, start=1):
            field_node = ET.SubElement(root, 'Field', id=str(field_no))

            instruction_node = ET.SubElement(field_node, 'Instruction')
            instruction_node.text = clean_rtf_text(instruction)

            # The result group is optional in the pattern; skip it when empty.
            if result:
                result_node = ET.SubElement(field_node, 'Result')
                result_node.text = clean_rtf_text(result)

        ET.ElementTree(root).write(xml_output_path, encoding='utf-8', xml_declaration=True)

        print(f"Extracted {len(matches)} fields from '{rtf_file_path}' to '{xml_output_path}'")

    except FileNotFoundError:
        print(f"Error: RTF file '{rtf_file_path}' not found.")
    except Exception as err:
        print(f"Error: {str(err)}")

def main():
    """Run the regex-based field extraction on the sample RTF document.

    NOTE: this definition shares its name with the `main` defined in an
    earlier cell of this notebook and replaces it once this cell runs.
    """
    # HARDCODED VALUES - Change these as needed
    source_rtf = 'NYLB/NYLB 32 (revision copy).rtf'
    target_xml = 'fields_output.xml'

    extract_rtf_fields_to_xml(source_rtf, target_xml)


if __name__ == "__main__":
    print("Running simple RTF field extraction...")
    main()

In [None]:
import re

def _wrap_field_instruction(match):
    """Return the matched RTF field with its instruction text wrapped in <field> tags."""
    instruction = match.group(1)
    if not instruction:
        # An empty instruction carries nothing to mark up.
        return match.group(0)
    whole = match.group(0)
    start = match.start(1) - match.start(0)
    end = match.end(1) - match.start(0)
    return f'{whole[:start]}<field>{instruction}</field>{whole[end:]}'


def clean_rtf_and_enclose_fields(rtf_content):
    """Strip RTF markup from a document while keeping field instructions.

    Each field instruction (the text inside ``\\fldinst{...}``) is wrapped in
    ``<field>...</field>``. ``\\par `` becomes ``--para--``, ``\\line``
    becomes a newline, and every remaining lower-case control word, every
    brace and every ``\\'3f`` escape is removed.

    Args:
        rtf_content: Raw RTF text.

    Returns:
        The cleaned text with ``<field>`` markers.
    """
    # Regular expression to match RTF field codes
    field_code_pattern = r'\\field\s*\{[^}]*\\fldinst\s*\{([^}]*)\}[^}]*(?:\\fldrslt\s*\{([^}]*)\})?[^}]*\}'

    # Wrap each instruction where it was matched. A plain str.replace() on
    # the instruction text would also tag unrelated occurrences of the same
    # text, re-wrap repeated instructions, and — for an empty instruction —
    # insert tags between every character of the document.
    rtf_content = re.sub(
        field_code_pattern,
        _wrap_field_instruction,
        rtf_content,
        flags=re.IGNORECASE | re.DOTALL,
    )

    # Paragraph handling: drop \pard first so it is not mistaken for \par.
    rtf_content = re.sub(r'\\pard', '', rtf_content)
    rtf_content = re.sub(r'\\par ', '--para--', rtf_content)

    # Line handling: drop \linex0 first so it is not turned into a newline.
    rtf_content = re.sub(r'\\linex0', '', rtf_content)
    rtf_content = re.sub(r'\\line', '\n', rtf_content)

    # Section defaults carry no text.
    rtf_content = re.sub(r'\\sectdefaultcl', '', rtf_content)
    rtf_content = re.sub(r'\\sectd', '', rtf_content)

    # Remove all remaining lower-case control words (with an optional numeric
    # parameter and trailing whitespace), then the group braces.
    cleaned_content = re.sub(r'\\[a-z]+(\d+)?(\s+)?', '', rtf_content)
    cleaned_content = re.sub(r'\{|\}', '', cleaned_content)
    # Drop the \'3f hex escape.
    cleaned_content = re.sub(r'\\\'3f', '', cleaned_content)

    # Collapse runs of directly adjacent opening (or closing) tags into one.
    # `[^>]*` keeps each tag match inside a single tag; `.*?` could stretch
    # across a whole "<field>…</field> <field>" span and delete the first
    # field together with its content.
    cleaned_content = re.sub(r'(<field[^>]*>)(\s*<field[^>]*>)+', "<field>", cleaned_content)
    cleaned_content = re.sub(r'(</field[^>]*>)(\s*</field[^>]*>)+', "</field>", cleaned_content)

    return cleaned_content


In [None]:
def clean_junk_content(content):
    """Trim everything outside the document body of an RTF dump.

    Text before the first '<?phoenix' marker is dropped, and so is everything
    from the first '{\\*\\themedata' group to the end of the input. Input
    without either marker is returned unchanged.
    """
    leading_junk = r'^.*?(?=<\?phoenix)'
    trailing_junk = r'{\\\*\\themedata[\s\S]*$'

    # Remove all content above <?phoenix
    body = re.sub(leading_junk, '', content, flags=re.DOTALL)
    # Remove the theme data group and everything after it
    return re.sub(trailing_junk, '', body, flags=re.DOTALL)

In [None]:
# def enclose_tags(content):
#     #para tags
#     content = re.sub(r'(\\par\s*)', r'<para>\1</para>', content)
#     content = re.sub(r'(\\pard\s*)', r'<para>\1</para>', content)
    
#     return content


In [None]:
# Example usage
RTF_FILE_PATH = 'NYLB/NYLB 32 (revision copy).rtf'

# Bytes that are not valid UTF-8 are dropped while reading (errors='ignore').
with open(RTF_FILE_PATH, 'r', encoding='utf-8', errors='ignore') as file:
    rtf_content = file.read()

# Trim the content outside the document body, then strip the RTF markup while
# keeping field instructions wrapped in <field> tags.
clean_rtf_content = clean_junk_content(rtf_content)
cleaned_rtf = clean_rtf_and_enclose_fields(clean_rtf_content)

# Write with an explicit encoding: the text was decoded as UTF-8, so relying on
# the platform default here could fail on, or mis-encode, non-ASCII characters.
with open('cleaned_file.txt', 'w', encoding='utf-8') as cleaned_file:
    cleaned_file.write(cleaned_rtf)

# modified_content = enclose_tags(clean_rtf_content)

# Clean RTF content while preserving field codes

# print(cleaned_rtf)

In [None]:
# Split the cleaned RTF text into lines and preview the first five.
# `lines` is a notebook-level name that later cells read (and one later cell reassigns).
lines = cleaned_rtf.split('\n')
print(lines[:5])

In [None]:
# Build the opening of the chapter XML from the first ten cleaned lines.
# NOTE(review): "</front>" is appended for every non-empty line that is not the
# running-head line, and the <scope.statement.block> opened together with the
# outline name is never closed here — confirm the intended structure.
log_text = []
line_count = 0
final_text_list = []
for i, line in enumerate(lines[:10]):
    if i == 0:
        # The first line is copied through and opens the chapter.
        final_text_list.append(line.strip())
        final_text_list.append("<chapter>")
        continue

    # Blank lines are passed through unchanged and counted.
    if line.strip() == "":
        final_text_list.append(line)
        line_count += 1
        continue

    # Check whether the first <field> on the line carries a ch.rh value.
    # A non-empty line without any <field> must not abort the loop.
    fields = re.findall(r'<field>(.*?)</field>', line, re.DOTALL)
    field = fields[0] if fields else ""
    if fields:
        print(f"Field found: {field}")

    if 'ch.rh' in field:
        # Value after "ch.rh=" up to the next space, without quotes; empty
        # when the "=" form is absent.
        metadata_value = field.partition('ch.rh=')[2].split(' ')[0].replace('"', '').replace("'", "")
        final_text_list.append(f"<metadata.block><metadata field=\"right.running.head\"><value>{metadata_value}</value></metadata></metadata.block>")
        final_text_list.append("<front>")
        if "Chapter" in line:
            remainder_line = line.split("Chapter")[-1].strip().split(" ")
            designator = remainder_line[0]
            if len(remainder_line) > 1:
                chapter_name = remainder_line[1]
            else:
                # Heading consists of the designator only.
                chapter_name = ""
                log_text.append("Chapter name might not be present")
            final_text_list.append(f"<outline.name.block><label>Chapter</label><designator>{designator}</designator><name>{chapter_name}</name></outline.name.block><scope.statement.block>")
        else:
            log_text.append("Chapter name might not be present")
        continue

    if "Scope Statement" in line:
        final_text_list.append("<scope.statement.block>")

        final_text_list.append("</scope.statement.block>")

    final_text_list.append("</front>")

final_text_list.append("</chapter>")


In [None]:
final_text_list

In [None]:
import zipfile
from lxml import etree

import zipfile
from lxml import etree

def extract_full_text_with_breaks(docx_path):
    """Flatten the main document part of a DOCX into text with line breaks.

    Elements are visited in document order and matched on their exact local
    name (namespace stripped):

    * ``p`` and ``br`` each contribute a newline;
    * ``instrText`` contributes its text wrapped in ``<field>...</field>``;
    * ``t``, ``delText`` and ``delInstrText`` contribute their text as is.

    Matching the whole local name, rather than a suffix of the tag, keeps
    unrelated elements whose names merely end in "p" or "t" (for example
    ``top`` or ``posOffset``) from adding stray newlines or text.

    Args:
        docx_path: Path (or file-like object) of the DOCX archive.

    Returns:
        The concatenated text as a single string.
    """
    # Open the .docx file as a zip archive and read the main document part.
    with zipfile.ZipFile(docx_path, 'r') as docx_zip:
        xml_content = docx_zip.read('word/document.xml')

    # Parse the XML content
    tree = etree.fromstring(xml_content)

    plain_text_names = ('t', 'delText', 'delInstrText')
    full_text = []
    for elem in tree.iter():
        tag = elem.tag
        if not isinstance(tag, str):
            # Comments and processing instructions have no string tag.
            continue
        local_name = tag.rsplit('}', 1)[-1]

        if local_name in ('p', 'br'):
            # Paragraph starts and explicit line breaks become newlines.
            full_text.append('\n')
        elif local_name == 'instrText':
            if elem.text:
                full_text.append("<field>" + elem.text + '</field>')
        elif local_name in plain_text_names:
            if elem.text:
                full_text.append(elem.text)

    # Join all extracted text into a single string
    return ''.join(full_text)
# Usage: flatten the converted DOCX into one string.
docx_path = 'NYLB/doc_NYLB 32 (revision copy).docx'
whole_text = extract_full_text_with_breaks(docx_path)

# `whole_text` can now be used by later cells; a copy is also saved to disk as UTF-8.
with open('doc_iiioutput.txt', 'w', encoding="utf-8") as f:
    f.write(whole_text)

In [None]:
#Main execution starts here

In [None]:
def extract_text_between_tags(text):
    """Return the contents of every <field>...</field> pair in ``text``.

    Matching is non-greedy and does not cross line breaks. The contents are
    returned in order of appearance; the list is empty when there are none.
    """
    return re.findall(r'<field>(.*?)</field>', text)

In [None]:
def remove_text_between_tags(text):
    """Return ``text`` with every <field>...</field> pair removed.

    Both the tags and the text between them are deleted. Matching is
    non-greedy and does not cross line breaks.
    """
    field_span = re.compile(r'<field>.*?</field>')
    return field_span.sub('', text)

In [None]:
def get_ending_treated(text):
    """Wrap lines as paragraphs until a "Research References" heading.

    Args:
        text: Sequence of lines to scan.

    Returns:
        A tuple ``(paragraphs, index)``. ``paragraphs`` holds every line
        before the heading, stripped and wrapped in
        ``<para><para.text>...</para.text></para>``. ``index`` is the position
        of the heading line (matched case-insensitively at the start of the
        line); when no heading is found it is the index of the last line, and
        it is 0 for empty input.
    """
    return_text = []
    # Pre-set so that empty input returns an index instead of raising
    # UnboundLocalError.
    i = 0
    # Collect the lines until one starts with "Research References".
    for i, line in enumerate(text):
        if line.lower().startswith("Research References".lower()):
            break
        return_text.append("<para><para.text>" + line.strip() + "</para.text></para>")
    return return_text, i

In [None]:
def find_block_ending(text, opening_tag_name, closing_tag_name):
    """Wrap the leading run of lines that contain a <field> marker.

    Args:
        text: Sequence of lines to scan.
        opening_tag_name: Text placed before each collected line.
        closing_tag_name: Text placed after each collected line.

    Returns:
        A tuple ``(wrapped, index)``. ``wrapped`` holds each leading line
        containing "<field>", stripped and enclosed in the given tags.
        ``index`` is the position of the first line without "<field>"; when
        every line has one it is the index of the last line, and it is 0 for
        empty input.
    """
    return_text = []
    # Pre-set so that empty input returns an index instead of raising
    # UnboundLocalError.
    i = 0
    for i, line in enumerate(text):
        if "<field>" not in line:
            break
        return_text.append(opening_tag_name + line.strip() + closing_tag_name)
    return return_text, i

In [None]:
# def merge_text(headings_list):
#     # Initialize an empty list to store the result
#     result = []
#     target_text = "Some text here"  # The text to retain only the first occurrence of consecutive matches
    
#     # Initialize a flag to track consecutive matches
#     previous_was_target = False
    
#     # Iterate over each element in the list
#     for element in headings_list:
#         if element == target_text:
#             if not previous_was_target:
#                 # Append the first occurrence of consecutive matches
#                 result.append(element)
#                 previous_was_target = True
#         else:
#             # Append non-matching elements
#             result.append(element)
#             previous_was_target = False
    
#     return result

    

In [None]:
def merge_consecutive_text(input_dict):
    """Collapse runs of the placeholder value into their first entry.

    Entries are visited in dictionary order. An entry whose value is the
    placeholder "Some text here" is dropped when the previously kept value
    was the placeholder too; every other entry is kept unchanged.

    Args:
        input_dict: Mapping of keys to text values.

    Returns:
        A new dict with the surviving entries in their original order.
    """
    placeholder = "Some text here"
    merged = {}
    last_kept = None

    for key, value in input_dict.items():
        is_repeat = value == placeholder and value == last_kept
        if not is_repeat:
            merged[key] = value
            last_kept = value

    return merged

In [None]:
def filter_lines_without_field(lines):
    """Map line indices to their text, masking lines that carry markers.

    Blank (whitespace-only) lines are skipped. A line containing '<field>'
    or '<>' is replaced by the placeholder "Some text here"; any other line
    is stored unchanged.

    Args:
        lines: Sequence of text lines.

    Returns:
        A dict keyed by each non-blank line's index within ``lines``.
    """
    placeholder = "Some text here"
    kept = {}

    for position, raw_line in enumerate(lines):
        if raw_line.strip() == "":
            continue
        has_marker = '<field>' in raw_line or '<>' in raw_line
        kept[position] = placeholder if has_marker else raw_line

    return kept


# Skip the first line, then mask the lines that carry field markers.
# The keys of `result` are positions within lines[index:], i.e. they are
# offset by `index` from positions in `lines`.
index = 1
result = filter_lines_without_field(lines[index:])
#pretty print the result with indentation
# llm_input = merge_text(result.values())
# for i in llm_input:
#     print(i)
# for key, value in result.items():
#     if value.strip():
#         print(f"Index: {key}, Value: {value}")

# Collapse runs of the placeholder, then list the entries that remain.
clean_values = merge_consecutive_text(result)
for key, value in clean_values.items():
    if value.strip():
        #pretty print dictionary
        print(f"Index: {key}, Value: {value}")



In [None]:
# Assemble the front matter of the chapter XML from the DOCX text.
lines = whole_text.split('\n')
# Find the first line that carries a <field> marker; `i` and `line` keep
# pointing at it after the loop.
for i,line in enumerate(lines):
    if "<field>" in line:
        # print(i)
        break

# Start with the text after the last </field> on that line (the "phoenix" line).
xml_text = lines[i].split("</field>")[-1].strip()

#Chapter start
xml_text += "\n<chapter>"

#Adding Metadata block
# NOTE(review): raises IndexError when the next line has no <field> — confirm
# the input always provides one.
metadata_text = extract_text_between_tags(lines[i+1])[0]
if metadata_text.startswith("ch.rh"):
    metadata_value = metadata_text.split("=")[1].replace('"', '').replace("'", "").strip()
    xml_text += f"\n<metadata.block><metadata field=\"right.running.head\"><value>{metadata_value}</value></metadata></metadata.block>"
xml_text += "\n<front>"


# Adding outline name block
if "Chapter".lower() in lines[i+1].lower():
    # Only the first two space-separated tokens after "Chapter" are used.
    remainder_line = lines[i+1].split("Chapter")[-1].strip().split(" ")
    xml_text += f"\n<outline.name.block><label>Chapter</label><designator>{remainder_line[0]}</designator><name>{remainder_line[1]}</name></outline.name.block>"
    i += 1  # Skip the next line as it has been processed

if "Scope Statement".lower() in lines[i+1].lower():
    xml_text += "\n<scope.statement.block>"
    xml_text += f"\n<para><para.text>{lines[i+2]}</para.text></para>"
    xml_text += "\n</scope.statement.block>"
    i += 2  # Skip the next two lines as they have been processed

if "Treated Elsewhere".lower() in lines[i+1].lower():
    xml_text += "\n<treated.elsewhere.block>"
    treated_text, index = get_ending_treated(lines[i+2:])
    index = i + 2 + index  # Adjust index to account for the lines processed
    # Join outside the f-string: a backslash and nested double quotes inside
    # an f-string expression are a syntax error before Python 3.12.
    # NOTE(review): the joined entries are already wrapped in <para> elements,
    # so this nests paragraphs — confirm that is intended.
    treated_joined = "\n".join(treated_text)
    xml_text += f"\n<para><para.text>{treated_joined}</para.text></para>"
    xml_text += "\n</treated.elsewhere.block>"
    i += 2  # Skip the next two lines as they have been processed

# NOTE(review): `line` still refers to the line found by the search loop above,
# not to lines[i+1], and find_block_ending receives a single string rather than
# a list of lines — confirm whether this branch is still needed.
if line.strip().lower() == "Research References".lower():
    xml_text += "<research.reference.block>"
    reference_text, j = find_block_ending(lines[i+1],"<reference.entry><ref.text>","</ref.text></reference.entry>")
    xml_text += "\n".join(reference_text)
    xml_text += "\n</research.reference.block>"
    i += j+1 # Skip the next j lines as they have been processed




In [None]:
# Scan from `index` onward and emit one research-reference block for every
# "Research References" heading. `start=index` makes `i` the position in
# `lines` directly.
for i, line in enumerate(lines[index:], start=index):
    if line.strip().lower() == "Research References".lower():
        xml_text += "\n<research.reference.block>"
        # Collect the run of <field> lines that follows the heading. Those
        # lines can never equal the heading themselves, so nothing has to be
        # skipped afterwards (reassigning the loop variable would not skip
        # iterations of a for loop anyway).
        reference_text, _ = find_block_ending(lines[i+1:],"<reference.entry><ref.text>","</ref.text></reference.entry>")
        xml_text += "\n".join(reference_text)
        xml_text += "\n</research.reference.block>"



In [None]:
print(xml_text)

In [None]:
from lxml import etree
import re
import xml.dom.minidom


def preprocess_xml(xml_string):
    """Escape every ampersand in ``xml_string`` as ``&amp;``.

    Entity references such as ``&dblsect;`` or ``&mdash;`` therefore reach
    the parser as literal text instead of as references it would need to
    resolve. References that are already valid (``&amp;``, ``&lt;``, ...) are
    escaped as well.

    An earlier approach, since abandoned, replaced each known custom entity
    with its own ``__name__`` placeholder token.

    Args:
        xml_string: XML text, possibly containing custom entity references.

    Returns:
        The text with each '&' replaced by '&amp;'.
    """
    return xml_string.replace('&', '&amp;')

def pretty_print_xml(xml_string):
    """Indent an XML string, print it, and save it to 'pretty_output.xml'.

    Ampersands are escaped with preprocess_xml before parsing, so entity
    references in the input appear as literal text in the output.

    NOTE: this one-argument function shares its name with the two-argument
    pretty_print_xml defined in an earlier cell of this notebook and replaces
    it once this cell runs.

    Args:
        xml_string: The XML text to format.
    """
    # Escape the entity references so the parser accepts the input.
    escaped_xml = preprocess_xml(xml_string)

    document = xml.dom.minidom.parseString(escaped_xml)
    indented_xml = document.toprettyxml(indent="  ")

    print(indented_xml)
    with open('pretty_output.xml', 'w', encoding='utf-8') as out_file:
        out_file.write(indented_xml)

# Example: load an XML file and pretty-print it.
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# exact file exists; consider a path relative to the notebook's data directory.
with open(r"C:\Users\6122060\Downloads\231373p.xml", 'r', encoding='utf-8') as file:
    xml_string = file.read()

pretty_print_xml(xml_string)

In [1]:
t = ['►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>The New York tax upon and with respect to personal income<footnote><footnote.body><f_break>1►<field>fn.fnref="1"</field><f_break>►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>See ►<field>st.ref.id="I05MHWY"</field>NYTL &ss;§§601 et seq.<f_break></footnote.body></footnote> imposes a tax on the income of every resident individual of the state.<footnote><footnote.body><f_break>2►<field>fn.fnref="2"</field><f_break>►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>See ►<field>st.ref.id="I05JOS2"</field>NYTL &s;§601.<f_break></footnote.body></footnote> The term &ldquo;“resident individual&rdquo;” applies only to natural persons. Partnerships, as such, are not subject to this tax.<footnote><footnote.body><f_break>3►<field>fn.fnref="3"</field><f_break>►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>See ►<field>st.ref.id="I05S46O"</field>NYTL &s;§605.<trace> </trace><f_break><trace>For discussion of the federal tax provisions relating to agricultural cooperative associations, see </trace><field>x.ref.id="I047H79"</field><trace>&ss;</trace><trace>§§</trace><trace>32:3 et seq.</trace><f_break>Research References<f_break>►<field>rc.ref.id="I05S46L"</field>State income tax treatment of partnerships and partners, 2 ALR 6th 1.<f_break></footnote.body></footnote>', '<trace.deleted/>']


In [2]:
t[0]

'►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>The New York tax upon and with respect to personal income<footnote><footnote.body><f_break>1►<field>fn.fnref="1"</field><f_break>►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>See ►<field>st.ref.id="I05MHWY"</field>NYTL &ss;§§601 et seq.<f_break></footnote.body></footnote> imposes a tax on the income of every resident individual of the state.<footnote><footnote.body><f_break>2►<field>fn.fnref="2"</field><f_break>►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>See ►<field>st.ref.id="I05JOS2"</field>NYTL &s;§601.<f_break></footnote.body></footnote> The term &ldquo;“resident individual&rdquo;” applies only to natural persons. Partnerships, as such, are not subject to this tax.<footnote><footnote.body><f_break>3►<field>fn.fnref="3"</field><f_break>►<field>p.ct.id="6d0bf250b92611ef8747b569df12a67d|7|32:7"</field>See ►<field>st.ref.id="I05S46O"</field>NYTL &s;§605.<trace> </trace><f_break><tr

In [None]:
blocks =[['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'Service Information 1', 'Service Information 2', 'Service Information 3', 'Service Information 4', 'Service Information 5', 'Service Information 6', 'Service Information 7', 'Service Information 8', 'Service Information 9', 'Service Information 10'], ['Table of Cases 1', 'Table of Cases 2', 'Table of Cases 3', 'Table of Cases 4', 'Table of Cases 5', 'Table of Cases 6', 'Table of Cases 7', 'Table of Cases 8', 'Table of Cases 9', 'Table of Cases 10', 'Table of Cases 11', 'Table of Cases 12', 'Table of Cases 13', 'Table of Cases 14', 'Table of Cases 15', 'Table of Cases 16', 'Table of Cases 17', 'Table of Cases 18', 'Table of Cases 19', 'Table of Cases 20', 'Table of Cases 21', 'Table of Cases 22', 'Table of Cases 23', 'Table of Cases 24', 'Table of Cases 25', 'Table of Cases 26', 'Table of Cases 27', 'Table of Cases 28', 'Table of Cases 29', 'Table of Cases 30', 'Table of Cases 31', 'Table of Cases 32', 'Table of Cases 33', 'Table of Cases 34', 'Table of Cases 35', 'Table of Cases 36', 'Table of Cases 37', 'Table of Cases 38', 'Table of Cases 39', 'Table of Cases 40', 'Table of Cases 41', 'Table of Cases 42', 'Table of Cases 43', 'Table of Cases 44', 'Table of Cases 45', 'Table of Cases 46', 'Table of Cases 47', 'Table of Cases 48', 'Table of Cases 49', 'Table of Cases 50', 'Table of Cases 51', 'Table of Cases 52', 'Table of Cases 53', 'Table of Cases 54', 'Table of Cases 55', 'Table of Cases 56', 'Table of Cases 57', 'Table of Cases 58', 'Table of Cases 59', 'Table of Cases 60', 'Table of Cases 61', 'Table of Cases 62', 'Table of Cases 63', 'Table of Cases 64', 'Table of Cases 65', 'Table of Cases 66', 'Table of Cases 67', 'Table of Cases 68', 'Table of Cases 69', 'Table of Cases 70', 'Table of Cases 71', 'Table of Cases 72', 'Table of Cases 73', 'Table of Cases 74', 'Table of Cases 75', 'Table of Cases 76', 'Table of Cases 77', 'Table of Cases 78', 'Table of Cases 79', 'Table of Cases 80', 
'Table of Cases 81', 'Table of Cases 82', 'Table of Cases 83', 'Table of Cases 84', 'Table of Cases 85', 'Table of Cases 86', 'Table of Cases 87', 'Table of Cases 88', 'Table of Cases 89', 'Table of Cases 90', 'Table of Cases 91', 'Table of Cases 92', 'Table of Cases 93', 'Table of Cases 94', 'Table of Cases 95', 'Table of Cases 96', 'Table of Cases 97', 'Table of Cases 98', 'Table of Cases 99', 'Table of Cases 100', 'Table of Cases 101', 'Table of Cases 102', 'Table of Cases 103', 'Table of Cases 104', 'Table of Cases 105', 'Table of Cases 106', 'Table of Cases 107', 'Table of Cases 108', 'Table of Cases 109', 'Table of Cases 110', 'Table of Cases 111', 'Table of Cases 112', 'Table of Cases 113', 'Table of Cases 114', 'Table of Cases 115', 'Table of Cases 116', 'Table of Cases 117', 'Table of Cases 118', 'Table of Cases 119', 'Table of Cases 120', 'Table of Cases 121', 'Table of Cases 122', 'Table of Cases 123', 'Table of Cases 124', 'Table of Statutes 1', 'Table of Statutes 2', 'Table of Statutes 3', 'Table of Statutes 4', 'Table of Statutes 5', 'Table of Statutes 6', 'Table of Statutes 7', 'Table of Statutes 8', 'Table of Statutes 9', 'Table of Statutes 10', 'Table of Statutes 11', 'Table of Statutes 12', 'Table of Statutes 13', 'Table of Statutes 14', 'Table of Statutes 15', 'Table of Statutes 16', 'Table of Statutes 17', 'Table of Statutes 18', 'Table of Statutes 19', 'Table of Statutes 20', 'Table of Statutes 21', 'Table of Statutes 22', 'Table of Statutes 23', 'Table of Statutes 24', 'Table of Statutes 25', 'Table of Statutes 26', 'Table of Statutes 27', 'Table of Statutes 28', 'Table of Statutes 29', 'Table of Statutes 30', 'Table of Statutes 31', 'Table of Statutes 32', 'Table of Statutes 33', 'Table of Statutes 34', 'Table of Statutes 35', 'Table of Statutes 36', 'Table of Statutes 37', 'Table of Statutes 38', 'Table of Statutory Instruments 1', 'Table of Statutory Instruments 2', 'Table of Statutory Instruments 3', 'Table of Statutory Instruments 4'], 
['A1,130/3', 'A1,130/4'], ['A1,130/5'], ['A1,130/6'], ['A1,130/7'], ['A1,130/8'], ['A2,081', 'A2,082'], ['A2,082/1'], ['A2,082/2'], ['A2,082/3'], ['A2,082/4'], ['A4,023', 'A4,024'], ['A4,097', 'A4,098'], ['A4,098/1'], ['A4,098/2'], ['A4,132/1', 'A4,132/2', 'A4,132/3', 'A4,132/4'], ['A4,132/5'], ['A4,132/6'], ['A4,132/7'], ['A4,132/8'], ['A4,362/1', 'A4,362/2'], ['A4,391', 'A4,392'], ['A4,392/1'], ['A4,392/2'], ['A4,392/3'], ['A4,392/4'], ['A4,545', 'A4,546', 'A4,547', 'A4,548'], ['A4,549', 'A4,550', 'A4,550/1', 'A4,550/2', 'A4,550/3', 'A4,550/4'], ['A4,550/5'], ['A4,550/6'], ['A5,059', 'A5,060'], ['A5,060/1'], ['A5,060/2'], ['A5,060/3'], ['A5,060/4'], ['A5,060/5'], ['A5,060/6'], ['A5,060/7'], ['A5,060/8'], ['A6,061', 'A6,062', 'A6,063', 'A6,064', 'A6,065', 'A6,066', 'A6,067', 'A6,068', 'A6,069', 'A6,070', 'A6,071', 'A6,072', 'A6,073', 'A6,074'], ['A6,074/1'], ['A6,074/2'], ['A6,079', 'A6,080', 'A6,081', 'A6,082', 'A6,082/1', 'A6,082/2'], ['A6,147', 'A6,148'], ['A6,148/1'], ['A6,148/2'], ['A6,157', 'A6,158'], ['A8,099', 'A8,100', 'A8,101', 'A8,102', 'A8,103', 'A8,104', 'A8,105', 'A8,106', 'A8,107', 'A8,108', 'A8,109', 'A8,110', 'A8,111', 'A8,112', 'A8,113', 'A8,114', 'A8,115', 'A8,116', 'A8,117', 'A8,118', 'A8,119', 'A8,120', 'A8,121', 'A8,122', 'A8,123', 'A8,124', 'A8,125', 'A8,126', 'A8,127', 'A8,128', 'A8,129', 'A8,130', 'A8,131', 'A8,132', 'A8,133', 'A8,134', 'A8,135', 'A8,136', 'A8,137', 'A8,138', 'A8,139', 'A8,140', 'A8,141', 'A8,142', 'A8,143', 'A8,144', 'A8,145', 'A8,146', 'A8,147', 'A8,148'], ['A10,043', 'A10,044'], ['A10,044/1'], ['A10,044/2'], ['A10,044/3'], ['A10,044/4'], ['A10,044/5'], ['A10,044/6'], ['A10,044/7'], ['A10,044/8'], ['A10,044/9'], ['A10,044/10'], ['A10,049', 'A10,050'], ['A10,051', 'A10,052', 'A10,052/1', 'A10,052/2', 'A10,052/3', 'A10,052/4', 'A10,052/5', 'A10,052/6', 'A10,052/7', 'A10,052/8', 'A10,052/9', 'A10,052/10', 'A10,052/11', 'A10,052/12'], ['A10,052/13'], ['A10,052/14'], ['A10,115', 'A10,116'], ['A10,116/1'], ['A10,116/2'], 
['A10,116/3'], ['A10,116/4'], ['A10,116/5'], ['A10,116/6'], ['B1,098/3', 'B1,098/4', 'B1,098/5', 'B1,098/6'], ['B1,098/7'], ['B1,098/8'], ['B1,120/2/1', 'B1,120/2/2', 'B1,120/2/3', 'B1,120/2/4', 'B1,120/2/5', 'B1,120/2/6'], ['B1,120/2/7'], ['B1,120/2/8'], ['B1,254/13', 'B1,254/14'], ['B1,254/15'], ['B1,254/16'], ['B1,345', 'B1,346'], ['B1,346/1'], ['B1,346/2'], ['B1,346/3'], ['B1,346/4'], ['B1,354/7', 'B1,354/8'], ['B1,404/1', 'B1,404/2'], ['B1,448/5', 'B1,448/6', 'B1,448/7', 'B1,448/8'], ['B1,448/9'], ['B1,448/10'], ['B1,459', 'B1,460', 'B1,460/1', 'B1,460/2', 'B1,460/3', 'B1,460/4'], ['B1,460/5'], ['B1,460/6'], ['B1,460/7'], ['B1,460/8'], ['B2,006/1', 'B2,006/2', 'B2,006/3', 'B2,006/4', 'B2,006/5', 'B2,006/6', 'B2,006/7', 'B2,006/8', 'B2,006/9', 'B2,006/10'], ['B2,006/11'], ['B2,006/12'], ['B2,014/2/9', 'B2,014/2/10', 'B2,014/2/11', 'B2,014/2/12', 'B2,014/2/13', 'B2,014/2/14', 'B2,014/2/15', 'B2,014/2/16', 'B2,014/2/17', 'B2,014/2/18', 'B2,014/2/19', 'B2,014/2/20', 'B2,014/2/21', 'B2,014/2/22', 'B2,014/2/23', 'B2,014/2/24'], ['B3,002/5', 'B3,002/6', 'B3,002/7', 'B3,002/8', 'B3,002/9', 'B3,002/10', 'B3,002/11', 'B3,002/12', 'B3,002/13', 'B3,002/14'], ['B3,002/15'], ['B3,002/16'], ['B3,009', 'B3,010'], ['B3,010/1'], ['B3,010/2'], ['B4,028/9', 'B4,028/10'], ['B4,028/11'], ['B4,028/12'], ['B6,036/1', 'B6,036/2', 'B6,036/3', 'B6,036/4', 'B6,036/5', 'B6,036/6', 'B6,036/7', 'B6,036/8'], ['B6,036/9'], ['B6,036/10'], ['B6,036/11'], ['B6,036/12'], ['B6,091', 'B6,092'], ['B6,092/1'], ['B6,092/2'], ['B6,092/3'], ['B6,092/4'], ['Index 1', 'Index 2', 'Index 3', 'Index 4', 'Index 5', 'Index 6', 'Index 7', 'Index 8', 'Index 9', 'Index 10', 'Index 11', 'Index 12', 'Index 13', 'Index 14', 'Index 15', 'Index 16', 'Index 17', 'Index 18', 'Index 19', 'Index 20', 'Index 21', 'Index 22', 'Index 23', 'Index 24', 'Index 25', 'Index 26', 'Index 27', 'Index 28', 'Index 29', 'Index 30', 'Index 31', 'Index 32', 'Index 33', 'Index 34', 'Index 35', 'Index 36', 'Index 37', 'Index 38', 'Index 
39', 'Index 40', 'Index 41', 'Index 42', 'Index 43', 'Index 44', 'Index 45', 'Index 46', 'Index 47', 'Index 48', 'Index 49', 'Index 50', 'Index 51', 'Index 52', 'Index 53', 'Index 54', 'Index 55', 'Index 56', 'Index 57', 'Index 58', 'Index 59', 'Index 60', 'Index 61', 'Index 62', 'Index 63', 'Index 64']]

In [None]:
from collections import defaultdict

# Two parallel lists of page labels. Each position in `start_self` is either a
# key or '' (no key of its own); `start` always carries a label.
start_self = [
    'i', 'Service Information 1', 'Table of Cases 1', 'Table of Statutes 1', 'Table of Statutory Instruments 1',
    'A1,130/3', '', '', '', '', 'A2,081', '', '', '', '', 'A4,023', 'A4,097', '', '', 'A4,132/1', '', '', '', '', 'A4,362/1',
    'A4,391', '', '', '', '', 'A4,545', 'A4,549', '', '', 'A5,059', '', '', '', '', '', '', '', '', 'A6,061', '', '', 'A6,079',
    'A6,147', '', '', 'A6,157', 'A8,099', 'A10,043', '', '', '', '', '', '', '', '', '', '', 'A10,049', 'A10,051', '', '',
    'A10,115', '', '', '', '', '', '', 'B1,098/3', '', '', 'B1,120/2/1', '', '', 'B1,254/13', '', '', 'B1,345', '', '', '', '',
    'B1,354/7', 'B1,404/1', 'B1,448/5', '', '', 'B1,459', '', '', '', '', 'B2,006/1', '', '', 'B2,014/2/9', 'B3,002/5', '', '',
    'B3,009', '', '', 'B4,028/9', '', '', 'B6,036/1', '', '', '', '', 'B6,091', '', '', '', '', 'Index 1'
]

start = [
    'i', 'Service Information 1', 'Table of Cases 1', 'Table of Statutes 1', 'Table of Statutory Instruments 1',
    'A1,130/3', 'A1,130/5', 'A1,130/6', 'A1,130/7', 'A1,130/8', 'A2,081', 'A2,082/1', 'A2,082/2', 'A2,082/3', 'A2,082/4',
    'A4,023', 'A4,097', 'A4,098/1', 'A4,098/2', 'A4,132/1', 'A4,132/5', 'A4,132/6', 'A4,132/7', 'A4,132/8', 'A4,362/1',
    'A4,391', 'A4,392/1', 'A4,392/2', 'A4,392/3', 'A4,392/4', 'A4,545', 'A4,549', 'A4,550/5', 'A4,550/6', 'A5,059', 'A5,060/1',
    'A5,060/2', 'A5,060/3', 'A5,060/4', 'A5,060/5', 'A5,060/6', 'A5,060/7', 'A5,060/8', 'A6,061', 'A6,074/1', 'A6,074/2',
    'A6,079', 'A6,147', 'A6,148/1', 'A6,148/2', 'A6,157', 'A8,099', 'A10,043', 'A10,044/1', 'A10,044/2', 'A10,044/3',
    'A10,044/4', 'A10,044/5', 'A10,044/6', 'A10,044/7', 'A10,044/8', 'A10,044/9', 'A10,044/10', 'A10,049', 'A10,051',
    'A10,052/13', 'A10,052/14', 'A10,115', 'A10,116/1', 'A10,116/2', 'A10,116/3', 'A10,116/4', 'A10,116/5', 'A10,116/6',
    'B1,098/3', 'B1,098/7', 'B1,098/8', 'B1,120/2/1', 'B1,120/2/7', 'B1,120/2/8', 'B1,254/13', 'B1,254/15', 'B1,254/16',
    'B1,345', 'B1,346/1', 'B1,346/2', 'B1,346/3', 'B1,346/4', 'B1,354/7', 'B1,404/1', 'B1,448/5', 'B1,448/9', 'B1,448/10',
    'B1,459', 'B1,460/5', 'B1,460/6', 'B1,460/7', 'B1,460/8', 'B2,006/1', 'B2,006/11', 'B2,006/12', 'B2,014/2/9', 'B3,002/5',
    'B3,002/15', 'B3,002/16', 'B3,009', 'B3,010/1', 'B3,010/2', 'B4,028/9', 'B4,028/11', 'B4,028/12', 'B6,036/1', 'B6,036/9',
    'B6,036/10', 'B6,036/11', 'B6,036/12', 'B6,091', 'B6,092/1', 'B6,092/2', 'B6,092/3', 'B6,092/4', 'Index 1'
]

# NOTE(review): `mapping` is created here but nothing in this cell fills or
# reads it; it is kept only so the name stays defined for any other cell.
mapping = defaultdict(list)

# Walk both lists in step. A non-empty `start_self` entry becomes the current
# key; a blank one reuses the previous key. Because each assignment overwrites,
# every key ends up holding the LAST `start` label of its run.
result = {}
last_key = None
for own_label, label in zip(start_self, start):
    last_key = own_label if own_label != '' else last_key
    # '' is the fallback bucket for blanks that come before any real key.
    result[last_key if last_key is not None else ''] = label

# Show the finished key -> label table, one pair per line.
for group_key, last_label in result.items():
    print(f"{group_key}: {last_label}")

In [None]:
# Two parallel lists of page labels. Each position in `end_self` is either a
# key or '' (no key of its own); `end` always carries a label.
end_self = ['x', 'Service Information 10', 'Table of Cases 124', 'Table of Statutes 38', 'Table of Statutory Instruments 4', 'A1,130/4', '', '', '', '', 'A2,082', '', '', '', '', 'A4,024', 'A4,098', '', '', 'A4,132/4', '', '', '', '', 'A4,362/2', 'A4,392', '', '', '', '', 'A4,548', 'A4,550/4', '', '', 'A5,060', '', '', '', '', '', '', '', '', 'A6,074', '', '', 'A6,082/2', 'A6,148', '', '', 'A6,158', 'A8,148', 'A10,044', '', '', '', '', '', '', '', '', '', '', 'A10,050', 'A10,052/12', '', '', 'A10,116', '', '', '', '', '', '', 'B1,098/6', '', '', 'B1,120/2/6', '', '', 'B1,254/14', '', '', 'B1,346', '', '', '', '', 'B1,354/8', 'B1,404/2', 'B1,448/8', '', '', 'B1,460/4', '', '', '', '', 'B2,006/10', '', '', 'B2,014/2/24', 'B3,002/14', '', '', 'B3,010', '', '', 'B4,028/10', '', '', 'B6,036/8', '', '', '', '', 'B6,092', '', '', '', '', 'Index 64']
end = ['x', 'Service Information 10', 'Table of Cases 124', 'Table of Statutes 38', 'Table of Statutory Instruments 4', 'A1,130/4', 'A1,130/5', 'A1,130/6', 'A1,130/7', 'A1,130/8', 'A2,082', 'A2,082/1', 'A2,082/2', 'A2,082/3', 'A2,082/4', 'A4,024', 'A4,098', 'A4,098/1', 'A4,098/2', 'A4,132/4', 'A4,132/5', 'A4,132/6', 'A4,132/7', 'A4,132/8', 'A4,362/2', 'A4,392', 'A4,392/1', 'A4,392/2', 'A4,392/3', 'A4,392/4', 'A4,548', 'A4,550/4', 'A4,550/5', 'A4,550/6', 'A5,060', 'A5,060/1', 'A5,060/2', 'A5,060/3', 'A5,060/4', 'A5,060/5', 'A5,060/6', 'A5,060/7', 'A5,060/8', 'A6,074', 'A6,074/1', 'A6,074/2', 'A6,082/2', 'A6,148', 'A6,148/1', 'A6,148/2', 'A6,158', 'A8,148', 'A10,044', 'A10,044/1', 'A10,044/2', 'A10,044/3', 'A10,044/4', 'A10,044/5', 'A10,044/6', 'A10,044/7', 'A10,044/8', 'A10,044/9', 'A10,044/10', 'A10,050', 'A10,052/12', 'A10,052/13', 'A10,052/14', 'A10,116', 'A10,116/1', 'A10,116/2', 'A10,116/3', 'A10,116/4', 'A10,116/5', 'A10,116/6', 'B1,098/6', 'B1,098/7', 'B1,098/8', 'B1,120/2/6', 'B1,120/2/7', 'B1,120/2/8', 'B1,254/14', 'B1,254/15', 'B1,254/16', 'B1,346', 'B1,346/1', 'B1,346/2', 'B1,346/3', 'B1,346/4', 'B1,354/8', 'B1,404/2', 'B1,448/8', 'B1,448/9', 'B1,448/10', 'B1,460/4', 'B1,460/5', 'B1,460/6', 'B1,460/7', 'B1,460/8', 'B2,006/10', 'B2,006/11', 'B2,006/12', 'B2,014/2/24', 'B3,002/14', 'B3,002/15', 'B3,002/16', 'B3,010', 'B3,010/1', 'B3,010/2', 'B4,028/10', 'B4,028/11', 'B4,028/12', 'B6,036/8', 'B6,036/9', 'B6,036/10', 'B6,036/11', 'B6,036/12', 'B6,092', 'B6,092/1', 'B6,092/2', 'B6,092/3', 'B6,092/4', 'Index 64']

# The loop below has always written to `e_result`, but the cell used to
# initialise (and print) `result` instead, so it raised NameError on a fresh
# kernel. Initialise the dict the loop actually fills.
# NOTE(review): this cell no longer resets `result` to {}; confirm no later
# cell relied on that reset.
e_result = {}
last_key = None

# A non-empty `end_self` entry becomes the current key; a blank one reuses the
# previous key. Each assignment overwrites, so every key ends up holding the
# LAST `end` label of its run.
for s_self, s in zip(end_self, end):
    if s_self != '':
        last_key = s_self
    if last_key is not None:
        e_result[last_key] = s
    else:
        # Edge case: blanks before any real key are filed under ''.
        e_result[''] = s

# Print the resulting mapping
for key, value in e_result.items():
    print(f"{key}: {value}")

In [1]:
def _numbered(prefix, first, last):
    """Return the labels prefix+first .. prefix+last (inclusive) as a list of strings."""
    return [f"{prefix}{number}" for number in range(first, last + 1)]


# Groups of page labels. Long consecutive runs are generated rather than typed
# out; irregular groups stay as literals. The previous hand-written literal had
# a string broken across a physical line, which is a syntax error.
blocks = [
    ['nan'],
    ['nan'],
    ['iii', 'iv', 'v', 'vi', 'vii', 'viii'],
    _numbered('Service Information ', 1, 6),
    _numbered('Table of Cases ', 1, 110),
    _numbered('Table of Statutes ', 1, 50),
    _numbered('Table of Statutory Instruments ', 1, 10),
    ['331', '332', '333', '334', '335', '336', '337', '338', '338/1', '338/2'],
    _numbered('342/', 7, 24),
    # 342/25 .. 342/30 each form their own single-item block.
    *([label] for label in _numbered('342/', 25, 30)),
    ['363', '364', '365', '366', '366/1', '366/2', '366/2/1', '366/2/2'],
    ['421', '422', '422/1', '422/2', '422/2/1', '422/2/2'],
    ['467', '468'],
    ['501', '502'],
    ['503', '504', '505', '506', '506/1', '506/2', '506/2/1', '506/2/2'],
    ['513', '514', '515', '516', '517', '518'],
    ['525', '526'],
    ['539', '540'],
    ['576/1', '576/2', '576/3', '576/4', '576/4/1', '576/4/2'],
    ['[TOC App-1]'],
    ['[TOC App-2]'],
    ['729', '730'],
    ['765', '766', '767', '768'],
    ['802/9', '802/10'],
    # 802/11 .. 802/14 each form their own single-item block.
    *([label] for label in _numbered('802/', 11, 14)),
    _numbered('', 1027, 1052),
    # 1053 .. 1062 each form their own single-item block.
    *([label] for label in _numbered('', 1053, 1062)),
    _numbered('Index ', 1, 86),
]

In [2]:
# Dump each group under a "Block:" heading, one "  - item" line per entry,
# followed by an empty line so consecutive groups are visually separated.
for group in blocks:
    print("Block:", *(f"  - {entry}" for entry in group), sep="\n")
    print()

Block:
  - nan

Block:
  - nan

Block:
  - iii
  - iv
  - v
  - vi
  - vii
  - viii

Block:
  - Service Information 1
  - Service Information 2
  - Service Information 3
  - Service Information 4
  - Service Information 5
  - Service Information 6

Block:
  - Table of Cases 1
  - Table of Cases 2
  - Table of Cases 3
  - Table of Cases 4
  - Table of Cases 5
  - Table of Cases 6
  - Table of Cases 7
  - Table of Cases 8
  - Table of Cases 9
  - Table of Cases 10
  - Table of Cases 11
  - Table of Cases 12
  - Table of Cases 13
  - Table of Cases 14
  - Table of Cases 15
  - Table of Cases 16
  - Table of Cases 17
  - Table of Cases 18
  - Table of Cases 19
  - Table of Cases 20
  - Table of Cases 21
  - Table of Cases 22
  - Table of Cases 23
  - Table of Cases 24
  - Table of Cases 25
  - Table of Cases 26
  - Table of Cases 27
  - Table of Cases 28
  - Table of Cases 29
  - Table of Cases 30
  - Table of Cases 31
  - Table of Cases 32
  - Table of Cases 33
  - Table of Cases 34
  - T