## Imports

In [1]:
import json
import pandas as pd
import numpy as np
import re

In [2]:
# Read the training records into memory.
# The encoding is passed explicitly: the file is assumed to be UTF-8 (the
# standard encoding for JSON), and relying on the platform default can fail
# or mis-decode non-ASCII characters on systems whose default is not UTF-8.
with open('../data/train.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

print(f"Loaded {len(data)} records from train.json.")

Loaded 3037 records from train.json.


In [3]:
# Preview the first data entry to see which keys and nested values a record carries
data[0]


{'pre_text': ['26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 .',
  'all revenue components within the segment experienced growth during fiscal 2008 .',
  'license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing system aimed at larger credit unions , experienced strong sales throughout the year .',
  'support and service revenue , which is the largest component of total revenues for the credit union segment , experienced 34 percent growth in eft support and 10 percent growth in in-house support .',
  'gross profit in this business segment increased $ 9344 in fiscal 2008 compared to fiscal 2007 , due primarily to the increase in license revenue , which carries the highest margins .',
  'liquidity and capital resources we have historically generated positive cash flow from operations and have generally used funds generated from operations

In [7]:
def create_prompt(entry):
    """Build a text prompt, question and answer from one dataset record.

    Two record layouts are supported:

    * ``entry['qa']`` is present: a single question with optional program
      ``steps``. The prompt lists the question, the context, the numbered
      steps and the answer.
    * ``entry['annotation']['dialogue_break']`` is present: a multi-turn
      dialogue paired position-by-position with ``exe_ans_list``. The last
      turn and the last answer are reported as the main question/answer.

    Args:
        entry: dict describing one record. The optional keys ``pre_text`` and
            ``post_text`` (lists of strings) and ``table_ori`` / ``table``
            (list of rows) are used to build the context section.

    Returns:
        dict with the keys ``"prompt"``, ``"question"`` and ``"answer"``;
        the answer is always converted with ``str``.

    Raises:
        ValueError: if the record has neither layout, or if
            ``dialogue_break`` is present but empty.
    """
    # `or []` also covers keys that are present but set to None.
    pre_text = " ".join(entry.get('pre_text') or [])
    post_text = " ".join(entry.get('post_text') or [])
    table_data = entry.get('table_ori') or entry.get('table') or []

    # Rows whose cells are all falsy (e.g. all empty strings) are skipped.
    # Cells are coerced to text so numeric or None cells do not break the join.
    table_str = "\n".join(
        "\t".join("" if cell is None else str(cell) for cell in row)
        for row in table_data
        if any(row)
    )

    context = f"Table:\n{table_str}\n\nPre-text: {pre_text}\n\nPost-text: {post_text}"

    # Case 1: Standard QA structure
    if 'qa' in entry:
        qa = entry['qa']
        question = qa.get('question', '[No question]')
        answer = qa.get('answer', '[No answer]')
        steps = qa.get('steps', [])

        step_lines = []
        for number, step in enumerate(steps, start=1):
            op = step['op']
            arg1 = step['arg1']
            arg2 = step['arg2']
            res = step['res']
            # Operation names containing "minus"/"divide" are shown under a
            # canonical name; any other operation keeps its original name.
            if "minus" in op:
                op_name = "subtract"
            elif "divide" in op:
                op_name = "divide"
            else:
                op_name = op
            step_lines.append(f"{number}. {op_name}({arg1}, {arg2}) = {res}\n")
        steps_str = "".join(step_lines)

        prompt = (
            f"Question: {question}\n\n"
            f"Context:\n{context}\n\n"
            f"Steps:\n{steps_str}"
            f"Answer: {answer}"
        )

    # Case 2: Use dialogue_break and exe_ans_list if 'qa' is missing
    elif 'annotation' in entry and 'dialogue_break' in entry['annotation']:
        annotation = entry['annotation']
        turns = annotation['dialogue_break']
        answers = annotation.get('exe_ans_list', [])

        # Without any turn there is no question to report; fail with a clear
        # message instead of an IndexError from turns[-1] below.
        if not turns:
            raise ValueError("dialogue_break is empty")

        dialogue_lines = []
        # zip stops at the shorter list, so unmatched turns/answers are omitted.
        for number, (q, a) in enumerate(zip(turns, answers), start=1):
            # Floats strictly between 0 and 1 are shown with four decimals;
            # every other value uses its plain str() form.
            if isinstance(a, float) and 0 < a < 1:
                a_fmt = f"{a:.4f}"
            else:
                a_fmt = str(a)
            dialogue_lines.append(f"Q{number}: {q}\nA{number}: {a_fmt}\n")
        dialogue_str = "".join(dialogue_lines)

        # The last question and answer become the main ones
        question = turns[-1]
        answer = answers[-1] if answers else "[No answer]"

        prompt = (
            f"Multi-turn Financial QA:\n{dialogue_str}\n"
            f"\nContext:\n{context}\n"
            f"\nFinal Answer: {answer}"
        )

    else:
        raise ValueError("No qa or dialogue_break found")

    return {
        "prompt": prompt,
        "question": question,
        "answer": str(answer)
    }


In [8]:
# Convert every record into a prompt dict. Records that cannot be converted
# are reported with their index and left out of the result.
formatted_data = []

for i, entry in enumerate(data):
    try:
        prompt_record = create_prompt(entry)
    except Exception as err:
        print(f"❌ Entry {i} skipped: {err}")
    else:
        formatted_data.append(prompt_record)



In [9]:
# Persist the formatted prompts as pretty-printed JSON (4-space indent).
with open('../data/finqa_train_prompts.json', 'w') as out_file:
    out_file.write(json.dumps(formatted_data, indent=4))

print("Formatted prompts saved successfully.")


Formatted prompts saved successfully.
