In [1]:
from bs4 import BeautifulSoup
from bs4 import Tag
import json
import re
import os
import glob
from tqdm.std import tqdm

In [2]:
# extracting the text and table data
def extract_text(soup):
    extracted_data = {}
    positive_data = []
    negative_data = []
    seen_texts = set()

    entity_types = ['integerItemType','monetaryItemType','perShareItemType','percentItemType','sharesItemType']
    
    for element in soup.find_all(["p", "div", "span"]):
        text = element.get_text(separator=" ", strip=True) ## to get the content in the text or table

        # to set up the recursive_option parameter based on the content from the text or table, 
        # if it is from the table, the recursive_option is false, otherwise, it is true.
        inside_table_cell = element.find_parent(["td", "th"]) is not None
        if inside_table_cell:
            continue

        # looking up "ix:nonFraction"， and the name has to start with "us-gaap:"
        numeric_entities = []
        seen = {}

        # recursive_option is used to determine if looking up all child tags as well as the current tag "ix:nonfraction" in a line
        # if this line is from the text, we just obtain the "ix:nonfraction" in the current "ix:nonfraction" tag, so the recyrsive_option is flase
        # if this line is from the table, we should obtain all child and current "ix:nonfraction" tag
        for non_fraction in element.find_all("ix:nonfraction",recursive=False):

            # to ensure that it is the last layer of this tag "ix:nonfraction"
            if not non_fraction.find("ix:nonfraction"):
                concept = non_fraction.get("name", "None")
                value = non_fraction.text.strip().replace(",", "")

                if concept.startswith("us-gaap:"):  # to ensure the concept is from the us gaap**
                    if value != "": ## filter out some None string
                        if (value, concept) not in seen: ## drop duplication
                            seen[(value, concept)] = True

                            processed_concept = concept.split(":")[-1].strip()
                            entity_type = taxonomy[processed_concept]
                            if entity_type in entity_types:
                                numeric_entities.append({"value": value, "type":entity_type, "concept": concept})

        
        # Only add if (ix:nonFraction and us-gaap concept) is met
        if numeric_entities:
            if len(text) > 20 and text not in seen_texts:
                seen_texts.add(text)
                extracted_entry = {"text": text}
                extracted_entry["numeric_entities"] = numeric_entities
                positive_data.append(extracted_entry)
        else:
            if len(text) > 20 and text not in seen_texts:
                extracted_entry = {"text": text}
                extracted_entry["numeric_entities"] = []
                negative_data.append(extracted_entry)
    
    extracted_data = {"pos": positive_data, "neg": negative_data}
    return extracted_data




def extract_table(soup):
    extracted_data = {}
    positive_data = []
    negative_data = []
    seen_texts = set()
    entity_types = ['integerItemType','monetaryItemType','perShareItemType','percentItemType','sharesItemType']
    
    for table in soup.find_all("table"):  # Process each table as a whole
        table_rows = []
        numeric_entities = []
        seen = {}
        
        for table_row in table.find_all("tr", recursive=False):  # Ensure we only get direct children rows
            extracted_cells = []
            
            for cell in table_row.find_all(["th", "td"], recursive=False):  # Ensure original order
                extracted_text_parts = []
                
                for sub_element in cell.find_all(["p", "div", "span"], recursive=True):
                    extracted_text_parts.append(sub_element.get_text(separator=" ", strip=True))
                    
                    for non_fraction in sub_element.find_all("ix:nonfraction", recursive=True):
                        concept = non_fraction.get("name", "None")
                        value = non_fraction.text.strip().replace(",", "")
                        
                        if concept.startswith("us-gaap:") and value:
                            if value != "": ## filter out some None string
                                if (value, concept) not in seen: ## drop duplication
                                    seen[(value, concept)] = True
        
                                    processed_concept = concept.split(":")[-1].strip()
                                    entity_type = taxonomy[processed_concept]
                                    if entity_type in entity_types:
                                        numeric_entities.append({"value": value, "type":entity_type, "concept": concept})
                
                extracted_text = " ".join(extracted_text_parts) if extracted_text_parts else cell.get_text(separator=" ", strip=True)
                extracted_cells.append(f"<{cell.name}>{extracted_text}</{cell.name}>")
            
            table_rows.append(f"<tr>{''.join(extracted_cells)}</tr>")
        
        table_text = "".join(table_rows)
        
        # if table_text and table_text not in seen_texts:
        #     seen_texts.add(table_text)
        #     extracted_entry = {"text": f"<table>{table_text}</table>"}
        #     if numeric_entities:
        #         extracted_entry["numeric_entities"] = numeric_entities
        #         positive_data.append(extracted_entry)
        #     else:
        #         extracted_entry["numeric_entities"] = []
        #         negative_data.append(extracted_entry)

        if numeric_entities:
            if table_text and table_text not in seen_texts:
                seen_texts.add(table_text)
                extracted_entry = {"text": f"<table>{table_text}</table>"}
                extracted_entry["numeric_entities"] = numeric_entities
                positive_data.append(extracted_entry)
        else:
            if table_text and table_text not in seen_texts:
                extracted_entry = {"text": f"<table>{table_text}</table>"}
                extracted_entry["numeric_entities"] = []
                negative_data.append(extracted_entry)
    
    extracted_data = {"pos": positive_data, "neg": negative_data}
    return extracted_data

In [3]:
# loading the taxonomy dictionary
taxomony_path = "taxonomy/us-gaap-2024_taxonomy.json"
with open(taxomony_path, "r") as f:
    taxonomy = json.load(f)

In [4]:
filefolder = [os.path.join("report_data_for_bert/", d) for d in os.listdir("report_data_for_bert/")]

In [6]:
filefolder

['report_data_for_bert/0000766704-25-000009-xbrl',
 'report_data_for_bert/0000064803-25-000007-xbrl',
 'report_data_for_bert/0001141391-25-000011-xbrl',
 'report_data_for_bert/0001628280-25-005002-xbrl',
 'report_data_for_bert/.ipynb_checkpoints',
 'report_data_for_bert/0001013857-25-000024-xbrl',
 'report_data_for_bert/0000906163-25-000011-xbrl',
 'report_data_for_bert/0000732717-25-000013-xbrl',
 'report_data_for_bert/0000045012-25-000010-xbrl',
 'report_data_for_bert/0001522727-25-000010-xbrl',
 'report_data_for_bert/0000021175-25-000008-xbrl']

In [7]:
# reading the ixbrl file

text_datas = []
table_datas = []

for file in tqdm(filefolder):
    try:
        xsd_file = glob.glob(os.path.join(file, "*.xsd"))[0]
        base_name = os.path.splitext(os.path.basename(xsd_file))[0]
        htm_file = os.path.join(file, f"{base_name}.htm")
        try:    
            with open(htm_file, "r", encoding="utf-8") as file:
                content = file.read()
            # parsing HTML
            soup = BeautifulSoup(content, "lxml")
            text_data = extract_text(soup)
            table_data = extract_table(soup)
        
            text_datas.append(text_data)
            table_datas.append(table_data)
        except:
            print(htm_file)
    except:
        print(file)
        print("skip this folder")
        continue

  soup = BeautifulSoup(content, "lxml")
 36%|███▋      | 4/11 [00:10<00:14,  2.05s/it]

report_data_for_bert/.ipynb_checkpoints
skip this folder


100%|██████████| 11/11 [00:16<00:00,  1.54s/it]


In [8]:
with open("annotation/simple_data/for_bert_training_annotation.json", "w") as f:
    json.dump({
    "text_data": text_datas,
    "table_data": table_datas
}, f,indent=4, ensure_ascii=False)

## statistic entity type

In [9]:
len(text_datas)

10

In [10]:
len(table_datas)

10

In [8]:
entity_types = set()

In [9]:
for line in text_datas:
    for entity in line.get("numeric_entities"):
        entity_types.add(entity.get("type"))

In [10]:
entity_types

{'integerItemType',
 'monetaryItemType',
 'perShareItemType',
 'percentItemType',
 'sharesItemType'}