In [1]:
import json
import pandas as pd
from anytree import Node

In [2]:
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

In [3]:
data = read_jsonl(
    'tree_text_sample_16032023/0c2cf15bf82ae16d05ba78dc3c64b36cb87cc263.jsonl')

In [10]:
def create_tree(data):
    root = Node('Root')  # Create root node
    for i, entry in enumerate(data):
        Node(f'Child_{i}', parent=root, data=entry)  # Create child node with data
    return root

tree = create_tree(data)

In [11]:
print(tree)

Node('/Root')


In [8]:
print(tree.children[2].data)  # Print data of the first child node

[{'page_count': 3, 'block_report_level_count': 81, 'block_page_level_count': 0, 'block_report_level_xy_cut_count': -1, 'block_page_level_xy_cut_count': -1, 'block_number': 2, 'block_type': 1, 'image_width': 932, 'image_height': 977, 'size': 0.0, 'font': '', 'color': 0, 'page_height': 576.0, 'page_width': 720.0, 'bboxes': [], 'bbox': [0, 52.961639404296875, 464.5950927734375, 541.120849609375], 'original_bbox': [-1.070294737815857, 52.961639404296875, 464.5950927734375, 541.120849609375], 'adescenders': None, 'texts': [], 'text_str': '', 'divided_block': False, 'parent_id': 'b72', 'children_ids': [], 'self_id': 'i6', 'tree_level': 5, 'type': 'image', 'as_child_score': 1, 'as_parent_scores': [], 'children_nodes': [], 'summarization_for_tree': '', 'outside_pps_type': '', 'inside_pps_type': [], 'overlap_pps_type': '', 'table_original_block_json': None, 'table_recognition_result': None, 'links': []}, {'page_count': 3, 'block_report_level_count': 82, 'block_page_level_count': 1, 'block_repor

In [12]:
print(len(tree.children))

73


In [14]:
def read_jsonl(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            json_line = json.loads(line.strip())
            if isinstance(json_line, list):
                data.extend(json_line)
            else:
                data.append(json_line)
    return data

In [15]:
data = read_jsonl('tree_text_sample_16032023/0c2cf15bf82ae16d05ba78dc3c64b36cb87cc263.jsonl')
id_to_node = {node['self_id']: node for node in data}

def recursively_build_tree(node):
    children = [id_to_node[child_id] for child_id in node.get('children_ids', [])]
    node['children_nodes'] = [recursively_build_tree(child) for child in children]

    if node['type'] == 'text':
        node['name'] = node['text_str']
    elif node['type'] == 'image':
        node['name'] = 'image_{}_{}'.format(node['image_width'], node['image_height'])
    else:
        node['name'] = node['type']

    return node

root_node = next(node for node in data if node['type'] == 'root')
root_node = recursively_build_tree(root_node)

In [16]:
def print_tree(node, indent=0):
    print('  ' * indent + node['name'])
    for child in node.get('children_nodes', []):
        print_tree(child, indent+1)

print_tree(root_node)

root
  image_1373_419
  ALLETE
  Sustainability Report 2020
  1
    INTRODUCTION
    CLIMATE ENVIRONMENTAL MANAGEMENT ENERGY EFFICIENCY SECURITY RELIABILITY CULTURE & ENGAGEMENT CORPORATE GOVERNANCE SASB 2019 EEI REPORT 2019 STATEMENTS REFERENCES
  Introduction ..................................................................................................2
  Task Force on Climate-related Financial Disclosures (TCFD) ............4
    Governance and Management ...................................................................................
    6
    Strategy ..............................................................................................................................
    7
    Risk Management ..........................................................................................................
    13
    Metrics and Targets ......................................................................................................
    28
  Environmental Management Sy

In [19]:
from anytree import Node, RenderTree

def build_anytree(node):
    children = [build_anytree(child_node) for child_node in node.get('children_nodes', [])]
    name = node.get('text_str', '')
    if node.get('type') == 'root':
        name = 'root'
    return Node(name, children=children)

root_anytree_node = build_anytree(root_node)
for pre, fill, node in RenderTree(root_anytree_node):
    print("%s%s" % (pre, node.name))

root
├── 
├── ALLETE
├── Sustainability Report 2020
├── 1
│   ├── INTRODUCTION
│   └── CLIMATE ENVIRONMENTAL MANAGEMENT ENERGY EFFICIENCY SECURITY RELIABILITY CULTURE & ENGAGEMENT CORPORATE GOVERNANCE SASB 2019 EEI REPORT 2019 STATEMENTS REFERENCES
├── Introduction ..................................................................................................2
├── Task Force on Climate-related Financial Disclosures (TCFD) ............4
│   ├── Governance and Management ...................................................................................
│   ├── 6
│   ├── Strategy ..............................................................................................................................
│   ├── 7
│   ├── Risk Management ..........................................................................................................
│   ├── 13
│   ├── Metrics and Targets ......................................................................................................
│ 

In [20]:
def build_tree(data: list) -> dict:
    """
    Build a tree structure from the list of dictionaries.

    :param data: A list of lists where each sublist contains a dictionary with node information.
    :return: A dictionary mapping node id to node object.
    """
    id_to_node = {}

    # First, create nodes
    for sublist in data:
        for node_dict in sublist:
            node_id = node_dict['self_id']
            node_text = node_dict.get('text_str', "")[:100]
            
            # Skip nodes with empty "text_str" unless they are the root node
            if node_id != 'r0' and node_text == "":
                print(f"Skipping node {node_id} because it has an empty 'text_str'")
                continue
            
            id_to_node[node_id] = Node(node_id, parent=None, text_str=node_text)

    # Second, link nodes to their parents
    for sublist in data:
        for node_dict in sublist:
            node_id = node_dict['self_id']
            parent_id = node_dict['parent_id']

            # Skip if the node or its parent were skipped earlier
            if node_id not in id_to_node:
                print(f"Skipping node {node_id} because it was not included in the first pass")
                continue
            if parent_id and parent_id not in id_to_node:
                print(f"Skipping node {node_id} because its parent {parent_id} was not included in the first pass")
                continue
            
            node = id_to_node[node_id]
            if parent_id:
                node.parent = id_to_node[parent_id]

    return id_to_node
