In [None]:
# ============================================================================
# PARAPHRASING AUGMENTATION PIPELINE (Model-Agnostic)
# ============================================================================

# STEP 1: Load Input Data
# ----------------------------------------------------------------------------
Input: Alpaca-format JSON file (the 'instruction' key was cleaned beforehand for simplicity)
  - Each item: { "id": str, "input": str, "output": str, ... }
  - output field contains JSON: {"<CATEGORY_NAME>": ["<SPAN_1>", "<SPAN_2>", ...]}

Load alpaca_data = json.load(input_file)


# STEP 2: Extract Context Window for Each Span
# ----------------------------------------------------------------------------
For each item in alpaca_data:
    original_text = item["input"]
    output_data = json.loads(item["output"])
    
    For each category_key, span_list in output_data.items():
        For each span in span_list:
            # Find span location in text (case-insensitive)
            span_match = find_span_in_text(original_text, span)
            
            # Extract context window: N words before + span + N words after
            context_window = extract_context_window(
                text=original_text,
                span=span,
                window_size=WINDOW_SIZE  # e.g., 20 words on each side; if fewer than 20 words are available, take all of them
            )
            # Result: "<CONTEXT_BEFORE> <SPAN> <CONTEXT_AFTER>"


# STEP 3: Build Paraphrasing Prompt
# ----------------------------------------------------------------------------
System Prompt Template:
"""
You are assisting in controlled paraphrasing of de-identified clinical text. 
Your goal is to rephrase the surrounding context of a verified indicator span 
while keeping the indicator phrase itself verbatim and preserving the clinical meaning of the entire segment (span + surrounding context). 
Do not invent new clinical information or identifiers.
"""
# ----------------------------------------------------------------------------
User Prompt Template:
"""
Below is a clinical note and a specific segment extracted from it that needs paraphrasing.

FULL NOTE (for context reference):
<FULL_NOTE_TEXT>
TARGET SEGMENT:
Original Context Window:
<CONTEXT_BEFORE> <SPAN> <CONTEXT_AFTER>
Target Span to Keep Unchanged: "<SPAN>"

Use the full note to maintain clinical coherence, but paraphrase only the surrounding context (before and after the span) in the target segment above. Keep the target span verbatim.
Now generate <AUG_INSTANCE_#> paraphrased versions of the surrounding context. Follow the same output format.
Rewritten Context Window:
"""
# ----------------------------------------------------------------------------

prompt = build_prompt(context_window, span)


# STEP 4: Model Interface
# ----------------------------------------------------------------------------
For each span, generate num_augmentations_per_span versions:
    For aug_num in range(num_augmentations_per_span):
        # Call model
        model_response = call_model(
            prompt=prompt,
            model_config={
                "temperature": 0.4,
                "top_k": 32,
                "max_tokens": 1024
            }
        )
        
        paraphrased_context = extract_text_from_response(model_response)


# STEP 5: Validate & Process Model Output
# ----------------------------------------------------------------------------
# Validation checks:
if span not in paraphrased_context:
    skip this augmentation
    
# Replace original context window in full text
augmented_text = original_text.replace(
    context_window,       # old: original context window extracted in STEP 2
    paraphrased_context,  # new: paraphrased context window from the model
    1                     # count: replace first occurrence only
)

# Verify change occurred
if augmented_text == original_text:
    skip this augmentation


# STEP 6: Create Augmented Item
# ----------------------------------------------------------------------------
new_item = {
    "input": augmented_text,  # Full note with paraphrased context
    "output": item["output"],  # Original output unchanged
    "id": f"{base_id}_aug_s{span_index}_a{aug_num+1}",
    "base_item_id": base_id,
    # ... other fields from original item
}


# STEP 7: Output Format
# ----------------------------------------------------------------------------
Output: JSON file containing only augmented items
  - Each augmented item is a complete Alpaca-format instance
  - Original items are NOT included in output

In [None]:
# ============================================================================
# SYNTHETIC GENERATION PIPELINE (Two-Round Cascaded Workflow)
# ============================================================================

# STEP 1: Load Input Data
# ----------------------------------------------------------------------------
Input: Stage 2 corpus (Alpaca-format JSON file)
  - Each item: { "id": str, "input": str, "output": str, ... }
  - output field contains JSON: {"<CATEGORY_NAME>": ["<SPAN_1>", "<SPAN_2>", ...]}

Load stage2_data = json.load(input_file)

# Group instances by category
category_instances = group_by_category(stage2_data)
# Result: { "<CATEGORY_NAME>": [item1, item2, ...], ... }


# STEP 2: Round 1 - Generation
# ----------------------------------------------------------------------------
For each category in category_instances:
    category_name = category
    category_def = load_category_definition(category_name)  # From Appendix 4-Table 2
    
    For each request in Round 1:
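        # Randomly select up to 3 demonstrations from Stage 2 corpus for this category
        # (capped at the number available, as in Round 2, because random.sample
        # raises ValueError when k exceeds the population size)
        demonstrations = random.sample(
            category_instances[category_name], 
            k=min(3, len(category_instances[category_name]))
        )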
        
        # Build prompt with demonstrations
        prompt = build_synthetic_prompt(
            category_name=category_name,
            category_definition=category_def,
            demonstrations=demonstrations,
            num_instances=K  # Generate K candidates per request
        )
        
        # Call model
        model_response = call_model(prompt=prompt)
        
        # Parse response to extract synthetic instances
        synthetic_candidates_R1 = parse_synthetic_response(model_response)
        # Result: List of { "input": str, "output": str } pairs


# STEP 3: Round 1 - Validation
# ----------------------------------------------------------------------------
For each candidate in synthetic_candidates_R1:
    # Apply validation checks (if any)
    if passes_validation(candidate):
        validation_queue_R1.append(candidate)

# Manual Validation (by 2 HIPAA-trained reviewers)
For each candidate in validation_queue_R1:
    reviewer1_decision = manual_validate(candidate)
    reviewer2_decision = manual_validate(candidate)
    
    if reviewer1_decision == "pass" and reviewer2_decision == "pass":
        verified_seeds_R1.append(candidate)
        candidate["id"] = f"{category_name}_syn_R1_{len(verified_seeds_R1)}"
        candidate["round"] = 1
        candidate["category"] = category_name


# STEP 4: Round 2 - Expansion
# ----------------------------------------------------------------------------
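For each category in category_instances:
    category_name = category
    category_def = load_category_definition(category_name)  # From Appendix 4-Table 2

    # Use verified seeds from Round 1 as demonstrations
    # (verified_seeds_R1 is a flat list; select this category's seeds by their "category" field)
    seeds_for_round2 = [seed for seed in verified_seeds_R1 if seed["category"] == category_name]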
    
    For each request in Round 2:
        # Select seed(s) for this request (use verified seeds as demonstrations)
        selected_seeds = random.sample(seeds_for_round2, k=min(3, len(seeds_for_round2)))
        
        # Build prompt
        prompt = build_synthetic_prompt(
            category_name=category_name,
            category_definition=category_def,
            demonstrations=selected_seeds,
            num_instances=K
        )
        
        # Call model
        model_response = call_model(prompt=prompt)
        
        # Parse response to extract synthetic instances
        synthetic_candidates_R2 = parse_synthetic_response(model_response)


# STEP 5: Round 2 - Validation
# ----------------------------------------------------------------------------
For each candidate in synthetic_candidates_R2:
    # Apply validation checks (if any)
    if passes_validation(candidate):
        validation_queue_R2.append(candidate)

# Manual Validation (by 2 HIPAA-trained reviewers)
For each candidate in validation_queue_R2:
    reviewer1_decision = manual_validate(candidate)
    reviewer2_decision = manual_validate(candidate)
    
    if reviewer1_decision == "pass" and reviewer2_decision == "pass":
        verified_final_R2.append(candidate)
        candidate["id"] = f"{category_name}_syn_R2_{len(verified_final_R2)}"
        candidate["round"] = 2
        candidate["category"] = category_name


# STEP 6: Output Format
# ----------------------------------------------------------------------------
# Combine all verified synthetic instances from both rounds
final_synthetic_data = verified_seeds_R1 + verified_final_R2

# Output: JSON file containing only synthetic items
#   - Each synthetic item is a complete Alpaca-format instance
#   - Only manually validated instances are included
#   - Original Stage 2 items are NOT included in this output

save_to_json(final_synthetic_data, output_file="stage3_synthetic_data.json")


# ============================================================================
# PROMPT TEMPLATE FUNCTIONS
# ============================================================================

def build_synthetic_prompt(category_name, category_definition, demonstrations, num_instances):
    """
    Assemble the system and user prompts for one synthetic-generation request
    (Table 3 prompt format).

    The user prompt is built from three parts, concatenated in order:
    the task header (target category, task overview, category guidelines),
    one numbered "Example" block per demonstration, and the closing
    generation request.

    Args:
        category_name: Name of the target category; interpolated into the
            user prompt.
        category_definition: Guideline text placed under the
            "KEY GUIDELINES" heading.
        demonstrations: Iterable of mappings providing 'input' and 'output'
            entries; each is rendered as a numbered example, starting at 1.
        num_instances: Number of new instances requested in the closing
            instruction.

    Returns:
        dict with keys "system" and "user" holding the two prompt strings.
    """
    # Fixed generation rules; a plain (non-f) string, so the braces in rule 5
    # are emitted literally.
    rules_text = """
    You are generating de-identified, clinically plausible home-health-care (HHC) instances, 
    each consisting of a note and its corresponding structured output span. Follow the rules below: 
    
    1. Write a HHC note as if authored by a clinician. Learn from the style of the demonstrations 
       provided and allow occasional typing or syntax errors typical of real notes. 
    2. Maintain accurate clinical meaning and realistic context typical of home settings. 
    3. Keep all content absent of personal identifiers. 
    4. Include the indicator text span verbatim within the note. 
    5. Output the JSON extraction exactly as: {"<CATEGORY_NAME>": ["<indicator text span>"]} 
    6. Do not repeat prior instances; create new variations consistent with the definition of the category.
    """

    # Category-specific task description that opens the user prompt.
    task_header = f"""
    TARGET CATEGORY: {category_name}
    
    TASK OVERVIEW:
    1. Generate Input: Create a plausible clinical note snippet (input) that realistically 
       incorporates information relevant to the TARGET CATEGORY: {category_name}.
    2. Generate Output: Based only on the input snippet you just generated, create the 
       corresponding output JSON string. This JSON maps the category name "{category_name}" 
       to a list of verbatim text spans extracted directly from your generated input. 
    
    KEY GUIDELINES FOR {category_name}:
    {category_definition}
    
    Below are validated real or paraphrased instances illustrating how this category appears 
    in home-health-care notes. Use these demonstrations to understand tone, structure, and 
    JSON output format. Strictly follow the Input/Output format demonstrated below.
    """

    # One numbered block per demonstration, in the order supplied.
    example_blocks = [
        f"""
    Example {position}
    Input: "{example['input']}"
    Output: {example['output']}
    """
        for position, example in enumerate(demonstrations, 1)
    ]

    # Final instruction telling the model how many instances to produce.
    closing_request = f"""
    Now generate {num_instances} entirely new synthetic instances following the same format. 
    
    GENERATE THE NEXT EXAMPLE:
    """

    return dict(
        system=rules_text,
        user="".join([task_header, *example_blocks, closing_request]),
    )