# Notebook 2: Generate Training Data

## Goals
1. Read all cleaned CSVs from `../balance_sheet_clean_label` and `../income_statement_clean_label`
2. Generate context windows for each row (2 previous items, 2 next items)
3. Format input strings: `[PREV_2] [PREV_1] [SECTION] [RAW_NAME] [NEXT_1] [NEXT_2]`
4. Format target strings: `standardized_name` only (`is_calculated` is kept in each record's `metadata`, not in the target)
5. Save as `../data/training_data.jsonl` (one JSON object per line)

In [27]:
import pandas as pd
from pathlib import Path
import json

# Input and output locations, all resolved relative to the parent directory
base_path = Path('..')
bs_clean_path = base_path.joinpath('balance_sheet_clean_label')
is_clean_path = base_path.joinpath('income_statement_clean_label')
output_path = base_path.joinpath('data', 'training_data.jsonl')

# Make sure the folder that will hold the JSONL file exists
# (no error if it is already there; missing parents are created too)
output_path.parent.mkdir(exist_ok=True, parents=True)

print("Directories set up.")

Directories set up.


In [28]:
# Load all clean CSVs
# Each directory listing is sorted because glob order depends on the file
# system; without sorting, the row order of the generated dataset (and the
# sample printed later) can differ from machine to machine.
# Balance-sheet files still come before income-statement files.
all_files = sorted(bs_clean_path.glob('*.csv')) + sorted(is_clean_path.glob('*.csv'))
print(f"Found {len(all_files)} files to process.")

# Fail here with a clear message rather than with an obscure error in a
# later cell when the input directories are missing or empty.
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found in {bs_clean_path} or {is_clean_path}"
    )

dataframes = []
for csv_path in all_files:
    frame = pd.read_csv(csv_path)
    # Record the originating file name so each example can be traced back.
    # NOTE(review): only the file name is stored, so a name that exists in
    # both input directories is indistinguishable downstream - confirm this
    # is acceptable.
    frame['source_file'] = csv_path.name
    dataframes.append(frame)

print(f"Loaded {len(dataframes)} dataframes.")

Found 92 files to process.
Loaded 92 dataframes.


In [29]:
def create_context_string(row_idx, df, window=2):
    """
    Build the input string for the row at position ``row_idx`` of ``df``.

    The string lists the ``window`` preceding row names (farthest first), the
    row's section, the row's own name, and the ``window`` following row names
    (nearest first). Every item is wrapped in square brackets and items are
    separated by a single space. With the default ``window=2`` the format is:
        [PREV_2] [PREV_1] [SECTION] [RAW_NAME] [NEXT_1] [NEXT_2]

    Neighbours that would fall before the first row are rendered as
    ``<START>``; neighbours past the last row are rendered as ``<END>``.
    Row names are converted with ``str()`` and stripped of surrounding
    whitespace; the section value is inserted as-is.

    Args:
        row_idx: Zero-based position of the row within ``df``.
        df: DataFrame with ``row_name`` and ``section`` columns. Rows are
            addressed positionally, so the frame's current order defines
            which rows count as neighbours.
        window: Number of neighbouring rows to include on each side.
            Defaults to 2, which reproduces the original fixed format.

    Returns:
        The formatted context string.

    Raises:
        ValueError: If ``window`` is negative.
        KeyError: If ``df`` lacks a ``row_name`` or ``section`` column.
        IndexError: If ``row_idx`` is past the end of ``df``.
    """
    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")

    # Fetch the column once; positional access on a column avoids building a
    # full row Series for every neighbour lookup.
    names = df['row_name']
    n_rows = len(df)

    def _name_at(pos):
        # Normalised display name of the row at position `pos`.
        return str(names.iloc[pos]).strip()

    # Current item
    section = df['section'].iloc[row_idx]
    raw_name = _name_at(row_idx)

    # Previous items, farthest first (PREV_window ... PREV_1)
    prev_items = [
        _name_at(row_idx - offset) if row_idx >= offset else "<START>"
        for offset in range(window, 0, -1)
    ]

    # Next items, nearest first (NEXT_1 ... NEXT_window)
    next_items = [
        _name_at(row_idx + offset) if row_idx + offset < n_rows else "<END>"
        for offset in range(1, window + 1)
    ]

    # Construct string
    tokens = prev_items + [section, raw_name] + next_items
    return " ".join(f"[{token}]" for token in tokens)

In [30]:
# Generate Training Data
def _as_bool(value):
    """
    Coerce an `is_calculated` cell to a plain Python bool.

    A bare `bool()` is wrong for two cases pandas can produce when a CSV
    column is not cleanly boolean: `bool(float('nan'))` is True, and any
    non-empty string (including "False") is True.
    """
    # NOTE(review): a missing flag is treated as "not calculated" - confirm
    # this default matches how the labels were produced.
    if pd.isna(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() in {"true", "t", "yes", "y", "1"}
    return bool(value)


training_rows = []
for df in dataframes:
    # Rows are used in the order they were read from the CSV; the context
    # window of each example depends on that order.
    for i in range(len(df)):
        # Look the row up once instead of once per field.
        row = df.iloc[i]

        training_rows.append({
            "input": create_context_string(i, df),
            # Target: Just standardized_name
            "output": str(row['standardized_name']).strip(),
            "metadata": {
                "source": row['source_file'],
                # We keep is_calculated in metadata so we can build a lookup table later
                "is_calculated": _as_bool(row['is_calculated'])
            }
        })

print(f"Generated {len(training_rows)} training examples.")
# Only show a sample when there is one; indexing an empty list would raise.
if training_rows:
    print("Sample Input: ", training_rows[0]['input'])
    print("Sample Output:", training_rows[0]['output'])

Generated 2079 training examples.
Sample Input:  [<START>] [<START>] [current_assets] [Cash and cash equivalents] [Marketable securities (current)] [Accounts receivable, net]
Sample Output: cash_and_equivalents


In [31]:
# Remove Header Rows (optional step)
# Every example whose target label ends in '_header' is dropped from the
# training set. To keep header rows, skip this cell.

print(f"Rows before filtering headers: {len(training_rows)}")
training_rows = list(
    filter(
        lambda example: not example['output'].endswith('_header'),
        training_rows,
    )
)
print(f"Rows after filtering headers: {len(training_rows)}")

Rows before filtering headers: 2079
Rows after filtering headers: 2009


In [32]:
# Save to JSONL: one JSON-encoded training example per line
with open(output_path, mode='w', encoding='utf-8') as jsonl_file:
    jsonl_file.writelines(
        json.dumps(example) + '\n' for example in training_rows
    )

print(f"✓ Saved to {output_path}")

✓ Saved to ..\data\training_data.jsonl
