# 1. Data Preprocessing

In [1]:
# preprocess.py
# 
# This script converts the ATIS dataset into a single, general-purpose format
# that is suitable for classification, tagging, generation, and prompting models.
# Outputs:
#  - datasets/question_{train,dev,test}.jsonl    (records for classification & generation)
#  - datasets/query_{train,dev,test}.jsonl       (records for query-split classification)
#  - datasets/generation_{train,dev,test}.jsonl  (input-output pairs for seq2seq prompting)
#  - datasets/templates.json   (maps template_id -> SQL template with placeholders)
#  - datasets/tags_vocab.json  (list of all tag labels: 'O' plus placeholder names)
#  - datasets/default_values.json (default values per template for missing tags)

In [2]:
import json
import os
import re
from collections import OrderedDict, defaultdict

# ---------- Configuration ----------
# Where the raw ATIS JSON lives, and where every downstream file is written.
INPUT_FILE = './sources/atis.json'
OUTPUT_DIR = 'datasets'

# Create the output folder up front; a pre-existing folder is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------- Step 1: Load Raw Data ----------
# Parse the whole dataset into memory in one go.
with open(INPUT_FILE, 'r', encoding='utf-8') as raw_file:
    data = json.loads(raw_file.read())

# ---------- Step 2: Extract Unique SQL Templates ----------
# For every SQL group the shortest query is used as the template; ties are
# broken by picking the lexicographically smallest string.
template_to_id = OrderedDict()  # template SQL -> id, in first-seen order
id_to_template = {}             # inverse mapping: id -> template string
next_template_id = 0

# Container to hold one record per sentence
records = []


def _make_substituter(variables):
    """Return a function that fills whole-word placeholders in a string.

    All placeholders are replaced in a single regex pass, which gives three
    guarantees the naive "loop + replace" approach does not:

    * a placeholder never matches inside a longer identifier
      (``city_name1`` does not touch ``city_name10``);
    * text inserted for one placeholder is never re-scanned for another;
    * values are inserted literally, so backslashes or ``\\1``-like sequences
      in a value are not interpreted as regex replacement escapes.

    Args:
        variables: mapping placeholder name -> replacement string.

    Returns:
        A callable ``text -> text``. With an empty mapping it returns the
        text unchanged.
    """
    if not variables:
        return lambda text: text

    # Longest names first so the alternation prefers the most specific name.
    names = sorted(variables, key=len, reverse=True)
    pattern = re.compile(
        r"\b(?:" + "|".join(re.escape(name) for name in names) + r")\b"
    )

    def substitute(text):
        # A function replacement bypasses re.sub's template escape handling.
        return pattern.sub(lambda match: variables[match.group(0)], text)

    return substitute


def _tokenize_and_tag(raw_text, variables, substitute):
    """Whitespace-tokenize the filled question and label each token.

    Tags are derived from where the placeholders sit in ``raw_text`` rather
    than by searching for the values afterwards. This keeps two placeholders
    that share a value apart, and avoids tagging a word that merely happens
    to equal some variable's value.

    Args:
        raw_text: question text still containing placeholder names.
        variables: mapping placeholder name -> replacement string.
        substitute: callable produced by ``_make_substituter(variables)``.

    Returns:
        ``(tokens, tags)`` - two lists of equal length. A token gets the
        placeholder name as its tag when it came from a raw token that is
        exactly that placeholder, and 'O' otherwise.
    """
    tokens = []
    tags = []
    for raw_token in raw_text.split():
        label = raw_token if raw_token in variables else 'O'
        # A value may itself span several whitespace-separated tokens
        # (or none, when the value is empty).
        pieces = substitute(raw_token).split()
        tokens.extend(pieces)
        tags.extend([label] * len(pieces))
    return tokens, tags


for entry in data:
    sql_group = entry['sql']
    # choose the shortest SQL; if equal length, pick lexicographically first
    shortest_sql = min(sql_group, key=lambda s: (len(s), s))
    # assign a unique ID to each new template
    if shortest_sql not in template_to_id:
        template_to_id[shortest_sql] = next_template_id
        id_to_template[next_template_id] = shortest_sql
        next_template_id += 1
    tid = template_to_id[shortest_sql]
    query_split = entry.get('query-split', '')

    # process each sentence in this SQL group
    for sent in entry['sentences']:
        raw_text = sent['text']
        variables = sent['variables']      # mapping placeholder -> actual value
        question_split = sent.get('question-split', '')

        # One substituter per sentence, shared by the text and the SQL so
        # both are filled with exactly the same matching rules.
        substitute = _make_substituter(variables)

        # e.g., "list flights to airport_code0" -> "list flights to DAL"
        filled_text = substitute(raw_text)

        # Simple whitespace tokenization plus per-token labels
        # ('O' or the placeholder name; no B-/I- prefixes).
        tokens, tags = _tokenize_and_tag(raw_text, variables, substitute)

        # Fully instantiated SQL for generation and evaluation
        sql_filled = substitute(shortest_sql)

        # ---------- Assemble the record ----------
        records.append({
            'text': filled_text,
            'text_tokens': tokens,
            'tags': tags,
            'template_sql': shortest_sql,
            'template_id': tid,
            'sql_with_vars_filled': sql_filled,
            'variables': variables,
            'question_split': question_split,
            'query_split': query_split
        })

# ---------- Step 3: Build Tag Vocabulary ----------
# Every distinct tag label used by any record, stored in sorted order.
all_tags = {tag for record in records for tag in record['tags']}
tags_vocab = sorted(all_tags)

# Persist the vocabulary as a JSON list.
vocab_path = os.path.join(OUTPUT_DIR, 'tags_vocab.json')
with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
    json.dump(tags_vocab, vocab_file, indent=2, ensure_ascii=False)

# ---------- Step 4: Compute Default Values per Template ----------
# For each template, remember the first value seen for every placeholder,
# looking only at records whose question split is 'train'.
default_values = {}
for rec in records:
    if rec['question_split'] != 'train':
        continue
    # Keys are stringified template ids so the mapping serializes as-is.
    template_defaults = default_values.setdefault(str(rec['template_id']), {})
    for placeholder, real_val in rec['variables'].items():
        # setdefault keeps the earliest value and ignores later ones
        template_defaults.setdefault(placeholder, real_val)

# Save defaults to JSON
defaults_path = os.path.join(OUTPUT_DIR, 'default_values.json')
with open(defaults_path, 'w', encoding='utf-8') as defaults_file:
    json.dump(default_values, defaults_file, indent=2, ensure_ascii=False)

# ---------- Step 5: Write Out JSONL Splits ----------
# Three families of files, one per split name:
#   question_*.jsonl   - full records, routed by each record's question split
#   query_*.jsonl      - full records, routed by each record's query split
#   generation_*.jsonl - {'input', 'output'} pairs, routed by question split
# Records whose split value is not one of `splits` are skipped for that family.
splits = ['train', 'dev', 'test']
question_writers = {}
query_writers = {}
generation_writers = {}

try:
    # Open inside the try block so that a failure part-way through (a later
    # open() or any write) still closes every handle opened so far.
    for sp in splits:
        question_writers[sp] = open(os.path.join(OUTPUT_DIR, f'question_{sp}.jsonl'), 'w', encoding='utf-8')
        query_writers[sp] = open(os.path.join(OUTPUT_DIR, f'query_{sp}.jsonl'), 'w', encoding='utf-8')
        generation_writers[sp] = open(os.path.join(OUTPUT_DIR, f'generation_{sp}.jsonl'), 'w', encoding='utf-8')

    for rec in records:
        qsp = rec['question_split']
        gsp = rec['query_split']
        # Serialize once; the same line may go to both record files.
        rec_line = json.dumps(rec, ensure_ascii=False) + '\n'
        # Write to question-split files (for classification & generation)
        if qsp in question_writers:
            question_writers[qsp].write(rec_line)
        # Write to query-split files (for query-based classification)
        if gsp in query_writers:
            query_writers[gsp].write(rec_line)
        # Write input-output pairs for generation/prompting
        if qsp in generation_writers:
            pair = {'input': rec['text'], 'output': rec['sql_with_vars_filled']}
            generation_writers[qsp].write(json.dumps(pair, ensure_ascii=False) + '\n')
finally:
    # Close all file handles, including after an error
    for w in list(question_writers.values()) + list(query_writers.values()) + list(generation_writers.values()):
        w.close()

# ---------- Step 6: Save Templates Mapping ----------
# JSON object keys must be strings, so template ids are stringified here.
templates_by_id = {str(template_id): sql for template_id, sql in id_to_template.items()}

# template_id -> template SQL (placeholders left in place)
templates_path = os.path.join(OUTPUT_DIR, 'templates.json')
with open(templates_path, 'w', encoding='utf-8') as templates_file:
    json.dump(templates_by_id, templates_file, indent=2, ensure_ascii=False)

print('✔️ Preprocessing complete. Outputs are in:', OUTPUT_DIR)


✔️ Preprocessing complete. Outputs are in: datasets
