In [3]:
import json
import re
import pandas as pd

# Load the raw FinQA training prompts from disk.
# The path is machine-specific; point it at your own copy when running elsewhere.
# An explicit UTF-8 encoding keeps decoding independent of the OS locale.
with open("/Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/notebooks/finqa_train_prompts.json", "r", encoding="utf-8") as f:
    data = json.load(f)

def normalize_number(value):
    """Convert a formatted numeric value such as '$1,234.56%' to a float.

    For string input, every '$', '%' and ',' character is removed and the
    surrounding whitespace is stripped before parsing. Non-string input is
    handed to float() as-is.

    Args:
        value: The value to convert (typically a string answer).

    Returns:
        The parsed float, or the original ``value`` untouched when it cannot
        be interpreted as a number.
    """
    candidate = value
    if isinstance(candidate, str):
        for symbol in ("$", "%", ","):
            candidate = candidate.replace(symbol, "")
        candidate = candidate.strip()
    try:
        return float(candidate)
    except (TypeError, ValueError, OverflowError):
        # Not numeric: return the caller's input unchanged rather than the
        # symbol-stripped text, so free-text answers are not silently altered.
        return value

def clean_prompt(prompt):
    """Flatten a prompt onto a single line for easier viewing.

    Each newline becomes one space, then leading and trailing whitespace is
    trimmed from the result.
    """
    single_line = " ".join(prompt.split("\n"))
    return single_line.strip()

def extract_question(prompt_text, fallback_question):
    """Extract the question that follows a 'Question:' label in the prompt.

    The label is matched case-insensitively and the question runs to the end
    of its line. The line may be terminated by a newline or by the end of the
    prompt, so a question on the final line is still found.

    Args:
        prompt_text: Full prompt text to search.
        fallback_question: Value returned when no non-empty question is found.

    Returns:
        The stripped question text, or ``fallback_question`` when the label is
        absent or nothing but whitespace follows it.
    """
    match = re.search(r"Question:\s*(.*?)(?:\n|$)", prompt_text, re.IGNORECASE)
    if match:
        question = match.group(1).strip()
        # An empty capture carries no information; prefer the fallback.
        if question:
            return question
    return fallback_question

def preprocess_prompt_entry(entry):
    """Build the preprocessed record for one raw entry.

    Reads the "prompt", "question" and "answer" keys of ``entry`` (each
    defaulting to an empty string when missing) and returns a dict with:
      - "question": question extracted from the prompt, else the entry's own
      - "answer": the answer passed through normalize_number
      - "raw_answer": the answer exactly as stored in the entry
      - "cleaned_prompt": the prompt passed through clean_prompt
    """
    prompt_text = entry.get("prompt", "")
    raw_answer = entry.get("answer", "")

    record = {}
    record["question"] = extract_question(prompt_text, entry.get("question", ""))
    record["answer"] = normalize_number(raw_answer)
    record["raw_answer"] = raw_answer
    record["cleaned_prompt"] = clean_prompt(prompt_text)
    return record

# Run every raw entry through the preprocessing step
processed = list(map(preprocess_prompt_entry, data))
df = pd.DataFrame(processed)

# Persist the result twice: a CSV table and a pretty-printed JSON file
df.to_csv("preprocessed_finqa_prompts.csv", index=False)
with open("preprocessed_finqa_prompts.json", "w") as f:
    f.write(json.dumps(processed, indent=2))

print(f"✅ Done! Saved {len(processed)} entries to CSV and JSON.")


✅ Done! Saved 3037 entries to CSV and JSON.


In [5]:
from datasets import Dataset
import json
import os

# Location of the preprocessed JSON file to load
json_path = "/Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/notebooks/preprocessed_finqa_prompts.json"

# Destination folders for the saved datasets, all under one base directory
base_dir = "/Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/notebooks/"
full_dataset_path, train_dataset_path, test_dataset_path = (
    os.path.join(base_dir, folder)
    for folder in (
        "huggingface_finqa_dataset",
        "huggingface_finqa_train",
        "huggingface_finqa_test",
    )
)

# Step 1: Read the JSON records into memory
with open(json_path, "r") as f:
    raw_data = json.loads(f.read())

# Step 2: Convert all values to strings to avoid Arrow typing issues
def convert_all_to_strings(example):
    """Return a new dict with every non-string value of ``example`` passed through str().

    Values that are already strings are kept as they are; keys are unchanged.
    """
    stringified = {}
    for key, value in example.items():
        stringified[key] = value if isinstance(value, str) else str(value)
    return stringified

stringified_data = [convert_all_to_strings(e) for e in raw_data]

# Step 3: Create Hugging Face Dataset
dataset = Dataset.from_list(stringified_data)

# Step 4: View a sample
print("Sample entry:\n", dataset[0])

# Step 5: Save full dataset
dataset.save_to_disk(full_dataset_path)
print(f"✅ Full dataset saved to {full_dataset_path}")

# Step 6: Optional Train/Test split
# A fixed seed makes the shuffle deterministic, so re-running the notebook
# puts the same examples in train and test every time.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
split_dataset["train"].save_to_disk(train_dataset_path)
split_dataset["test"].save_to_disk(test_dataset_path)

print(f"✅ Train dataset saved to {train_dataset_path}")
print(f"✅ Test dataset saved to {test_dataset_path}")



Sample entry:
 {'question': 'what was the percentage change in the net cash from operating activities from 2008 to 2009', 'answer': '14.1', 'raw_answer': '14.1%', 'cleaned_prompt': 'Question: what was the percentage change in the net cash from operating activities from 2008 to 2009  Context: Table: \tYear ended June 30, 2009 2008\t2007 Net income\t$103,102\t$104,222\t$104,681 Non-cash expenses\t74,397\t70,420\t56,348 Change in receivables\t21,214\t(2,913)\t(28,853) Change in deferred revenue\t21,943\t5,100\t24,576 Change in other assets and liabilities\t(14,068)\t4,172\t17,495 Net cash from operating activities\t$206,588\t$181,001\t$174,247  Pre-text: 26 | 2009 annual report in fiscal 2008 , revenues in the credit union systems and services business segment increased 14% ( 14 % ) from fiscal 2007 . all revenue components within the segment experienced growth during fiscal 2008 . license revenue generated the largest dollar growth in revenue as episys ae , our flagship core processing s

Saving the dataset (0/1 shards):   0%|          | 0/3037 [00:00<?, ? examples/s]

✅ Full dataset saved to /Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/notebooks/huggingface_finqa_dataset


Saving the dataset (0/1 shards):   0%|          | 0/2733 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/304 [00:00<?, ? examples/s]

✅ Train dataset saved to /Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/notebooks/huggingface_finqa_train
✅ Test dataset saved to /Users/pinar.buke/Desktop/ConvFinQA-main/financial-qa-app/notebooks/huggingface_finqa_test
