In [None]:
import xml.etree.ElementTree as ET
import json

# Define instructions and response formats for queries

In [None]:
# Per-category guidance text. The keys are the query categories (the same keys
# appear in `response_format`); `extract_xbrl_tags` copies the entry for a
# query's `category2` into the "Additional Instructions" field of its result.
# Adjacent string literals inside each parenthesised value are concatenated by
# Python into one string, which is why every fragment ends with a space.
instructions = {
    # Guidance for computing a financial ratio from reported figures.
    "formula_calculation": (
        "The value must be calculated based on the exact numbers found in the source document. "
        "The value must be consolidated from the main company, excluding any segment or scenario-specific data. "
        "Ensure the correct context reference is used for the calculation (e.g., consolidated financials). "
        "Provide the result either as a numeric value or a percentage, depending on the financial ratio."
    ),
    # Guidance for answering with a single XBRL tag.
    "xbrl_tags": (
        "The answer must be the exact consolidated tag found in the source document. "
        "Also take any company-created or customized tags into consideration if they are relevant. "
        "Provide the final XBRL tag as it appears in the source."
    ),
    # Guidance for answering with a single reported monetary figure.
    "value": (
        "The value must be consolidated from the main company and should not include values from any segments or specific scenarios. "
        "The answer must be the exact figure found in the source document. "
        "Provide the exact value and format it as a monetary value in $ billion. Do not round the figure."
    ),
    # Guidance for answering with a ratio formula written in XBRL tags.
    "formula_formatted_with_tags": (
        "The formula must consist of the relevant XBRL tags found in the source document. "
        "Take any company-created or customized tags into consideration if applicable. "
        "Provide the full formula according to the structure of the specific financial ratio, "
        "using the exact XBRL tags as they appear in the source."
    )
}

In [None]:
# Answer template per query category, keyed like `instructions`;
# `extract_xbrl_tags` copies the entry for a query's `category2` into the
# "Response Formats" field of its result. The braces/brackets are literal
# placeholder text — these strings are never passed through str.format().
# NOTE(review): "value" uses `$[value]` with square brackets while the other
# templates use `{...}` placeholders — confirm this difference is intentional.
response_format = {
    "formula_calculation": "For numeric values: \"Answer: {number}\" For percentage values: \"Answer: {number}%\"",
    "xbrl_tags": "Answer: {XBRL tag}",
    "value": "Answer: $[value] billion",
    "formula_formatted_with_tags": "Answer: {Complete formula with XBRL tags}"
}

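# Function to extract XBRL tags from XML files

Extracts the XML elements whose `id` is listed in a query, together with neighbouring elements for context, from the query's source document. As called in this notebook, only `formula_calculation` queries are processed.

**Args**

- `queries` (list): List of queries with associated metadata.

**Returns**

- list: Extracted and formatted results for queries.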

In [None]:
def extract_xbrl_tags(queries, category='formula_calculation', max_results=10,
                      base_dir='./DowJones30', context_budget=100):
  """
  Build prompt records for queries of one category from their XBRL documents.

  For each matching query the referenced XML document is parsed, every element
  carrying an ``id`` attribute is listed in document order, and the elements
  whose ``id`` is named by the query are emitted together with their
  neighbours inside a symmetric window. Overlapping windows are merged, so an
  element appears at most once in a record's context.

  Relies on the module-level ``instructions`` and ``response_format`` dicts
  and on ``format_element_data`` for rendering each element.

  Args:
      queries (list): Query dicts. The keys read are ``category2``,
          ``doc_path`` (relative to ``base_dir``), ``id`` (a list of element
          ids, or a single id string) and ``query``.
      category (str): Only queries whose ``category2`` equals this value are
          processed; it must be a key of ``instructions`` and
          ``response_format``.
      max_results (int): Stop once this many records have been produced.
      base_dir (str): Directory prefix joined to each ``doc_path`` with "/".
      context_budget (int): Approximate number of context elements per query;
          each target id gets a window of
          ``max(1, context_budget // number_of_ids)`` elements.

  Returns:
      list: One dict per query that yielded context, with keys ``id``
      (1-based position in the result list), ``Query``, ``Context``,
      ``Additional Instructions`` and ``Response Formats``.

  Raises:
      KeyError: If ``category`` has no entry in ``instructions`` or
          ``response_format``, or if a processed query lacks ``doc_path`` or
          ``query``.
  """
  # Look these up first so an unsupported category fails before any file is parsed.
  additional_instructions = instructions[category]
  expected_format = response_format[category]

  results = []

  for query in queries:
    if len(results) >= max_results:
      break
    # .get(): a query without a category is skipped like any non-matching one.
    if query.get('category2') != category:
      continue

    doc_path = f"{base_dir}/{query['doc_path']}"
    print(f"Processing file: {doc_path}")

    tags = query.get('id', [])
    if isinstance(tags, str):
      # A bare string would otherwise be split into single characters by set().
      tags = [tags]
    if not tags:
      continue

    try:
      root = ET.parse(doc_path).getroot()
    except FileNotFoundError:
      print(f"File not found: {doc_path}")
      continue
    except ET.ParseError as e:
      print(f"XML Parsing Error in {doc_path}: {e}")
      continue

    # Elements that carry an 'id' attribute, in document order.
    id_elements = [elem for elem in root.iter() if elem.get('id')]

    tag_set = set(tags)  # Set of target ids for fast lookup
    half_window = max(1, context_budget // len(tags)) // 2

    extracted_data = []
    next_unemitted = 0  # Index of the first element not yet written to the context
    for position, elem in enumerate(id_elements):
      if elem.get('id') not in tag_set:
        continue
      # Start after anything an earlier window already emitted, so elements
      # shared by overlapping windows are not repeated.
      start = max(0, position - half_window, next_unemitted)
      end = min(len(id_elements), position + half_window + 1)
      extracted_data.extend(
        format_element_data(current_elem) for current_elem in id_elements[start:end]
      )
      next_unemitted = end

    # Only queries that produced some context become records.
    if extracted_data:
      results.append({
        "id": len(results) + 1,
        "Query": query['query'],
        "Context": ''.join(extracted_data),
        "Additional Instructions": additional_instructions,
        "Response Formats": expected_format
      })

  return results

# Helper function to format XML element data

Formats one XML element (its tag, `contextRef`, `decimals`, `id` and `unitRef` attributes, and its text) into a readable string.

**Args**

- `elem` (Element): XML element to format.

**Returns**

- str: Formatted string representation of the element.

In [None]:
def format_element_data(elem):
  """
  Render one XML element as a two-line text snippet.

  The first line is ``file:<contextRef>.xml``. The second is a tag-like line
  holding the element's tag, its ``contextRef``, ``decimals``, ``id`` and
  ``unitRef`` attributes, and its text with surrounding whitespace stripped.
  A missing attribute, and missing or empty text, is written as the literal
  string ``None``.

  Args:
      elem (Element): XML element to format.

  Returns:
      str: The formatted snippet, terminated by a newline.
  """
  context_ref = elem.get('contextRef', 'None')
  decimals = elem.get('decimals', 'None')
  element_id = elem.get('id', 'None')
  unit_ref = elem.get('unitRef', 'None')

  if elem.text:
    body = elem.text.strip()
  else:
    body = 'None'

  tag = elem.tag
  attributes = (
    f"contextRef=\"{context_ref}\" "
    f"decimals=\"{decimals}\" "
    f"id=\"{element_id}\" "
    f"unitRef=\"{unit_ref}\""
  )
  return f"file:{context_ref}.xml\n<{tag} {attributes}>{body}</{tag}>\n"

# Load queries from a JSON file

In [None]:
# Read the query file as UTF-8 explicitly: without `encoding`, open() falls back
# to the platform's default text encoding, which is not UTF-8 everywhere.
with open('./XBRL.json', 'r', encoding='utf-8') as file:
  queries = json.load(file)

# Extract XBRL tags and related context


In [None]:
# Build the result records from the loaded `queries`.
formatted_results = extract_xbrl_tags(queries)

# Convert results to JSON format for saving or printing

In [None]:
# Serialize the records to a JSON string, pretty-printed with 4-space indentation.
formatted_json = json.dumps(formatted_results, indent=4)

# Save the results to a file

In [None]:
# Write the JSON text with an explicit encoding so the output file does not
# depend on the platform's default text encoding.
with open('./formula_calculation.json', 'w', encoding='utf-8') as outfile:
  outfile.write(formatted_json)

In [None]:
# Status message only; the file name repeats the path used when saving.
print("Results have been saved to 'formula_calculation.json'")