# In [1]:
import json
import os
import random
import string
from typing import Dict, List, Optional
class SpellingErrorGenerator:
    """Inject single-character typing mistakes into strings.

    Four error kinds are supported: insertion, omission, transposition of
    adjacent characters, and substitution by a keyboard neighbour. All
    randomness is drawn from the module-level ``random`` generator.
    """

    def __init__(self, seed: Optional[int] = None):
        """Initialize the spelling error generator with an optional seed.

        Args:
            seed: When not ``None`` it is passed to ``random.seed``. This
                seeds the shared module-level generator, so it also affects
                every other caller of ``random`` in the process.
        """
        if seed is not None:
            random.seed(seed)

        # Define keyboard layout for substitution errors (lowercase letters
        # mapped to the neighbouring letters that may replace them).
        self.keyboard_adjacency = {
            'q': ['w', 'a'], 'w': ['q', 'e', 's'], 'e': ['w', 'r', 'd'], 'r': ['e', 't', 'f'],
            't': ['r', 'y', 'g'], 'y': ['t', 'u', 'h'], 'u': ['y', 'i', 'j'], 'i': ['u', 'o', 'k'],
            'o': ['i', 'p', 'l'], 'p': ['o'],
            'a': ['q', 's', 'z'], 's': ['w', 'a', 'd', 'x'], 'd': ['e', 's', 'f', 'c'],
            'f': ['r', 'd', 'g', 'v'], 'g': ['t', 'f', 'h', 'b'], 'h': ['y', 'g', 'j', 'n'],
            'j': ['u', 'h', 'k', 'm'], 'k': ['i', 'j', 'l'], 'l': ['o', 'k'],
            'z': ['a', 'x'], 'x': ['s', 'z', 'c'], 'c': ['d', 'x', 'v'],
            'v': ['f', 'c', 'b'], 'b': ['g', 'v', 'n'], 'n': ['h', 'b', 'm'],
            'm': ['j', 'n']
        }

    def _insertion_error(self, word: str) -> str:
        """Insert a random lowercase ASCII letter at a random position.

        The position may be before the first or after the last character.
        An empty ``word`` is returned unchanged.
        """
        if not word:
            return word
        pos = random.randint(0, len(word))
        letter = random.choice(string.ascii_lowercase)
        return word[:pos] + letter + word[pos:]

    def _omission_error(self, word: str) -> str:
        """Delete the character at a random position.

        Words of length 0 or 1 are returned unchanged so the result is never
        emptied by this error.
        """
        if len(word) <= 1:
            return word
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]

    def _transposition_error(self, word: str) -> str:
        """Swap two adjacent characters.

        Only pairs of *different* characters are considered, because swapping
        two identical characters would leave the word unchanged. If no such
        pair exists (length < 2, or all characters equal) the word is
        returned as-is.
        """
        swappable = [i for i in range(len(word) - 1) if word[i] != word[i + 1]]
        if not swappable:
            return word
        pos = random.choice(swappable)
        return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]

    def _substitution_error(self, word: str) -> str:
        """Replace a letter with an adjacent letter on the keyboard.

        Only positions whose character (lowercased) has an entry in
        ``keyboard_adjacency`` are eligible, so digits, punctuation and
        spaces never cause a silent no-op. The case of the replaced
        character is preserved. If no position is eligible the word is
        returned unchanged.
        """
        eligible = [
            idx for idx, ch in enumerate(word)
            if ch.lower() in self.keyboard_adjacency
        ]
        if not eligible:
            return word
        pos = random.choice(eligible)
        original = word[pos]
        replacement = random.choice(self.keyboard_adjacency[original.lower()])
        # Preserve original case
        if original.isupper():
            replacement = replacement.upper()
        return word[:pos] + replacement + word[pos + 1:]

    def apply_error(self, word: str) -> str:
        """Apply one randomly chosen error kind to ``word``.

        The result can still equal ``word`` when the chosen error kind has no
        applicable position (for example omission on a one-character word).
        """
        error_functions = [
            self._insertion_error,
            self._omission_error,
            self._transposition_error,
            self._substitution_error
        ]
        error_function = random.choice(error_functions)
        return error_function(word)

    def generate_variants(self, text: str, num_variants: int = 20) -> List[str]:
        """
        Generate multiple variants of the input text with spelling errors.

        The text is split on whitespace; each variant corrupts 1, 2, 4 or 8
        distinct words (capped at the number of words) and is re-joined with
        single spaces.

        Args:
            text: Input text to generate variants for
            num_variants: Number of variants to generate (default: 20 as per paper)

        Returns:
            List of text variants with spelling errors
        """
        words = text.split()
        variants = []

        # Number of tokens to modify in each variant
        tokens_to_modify = [1, 2, 4, 8]

        for _ in range(num_variants):
            variant_words = words.copy()
            num_tokens = min(random.choice(tokens_to_modify), len(words))

            # Select random distinct positions to modify
            positions = random.sample(range(len(words)), num_tokens)

            # Apply random errors to selected positions
            for pos in positions:
                variant_words[pos] = self.apply_error(variant_words[pos])

            variants.append(' '.join(variant_words))

        return variants

def clean_question(question: str) -> str:
    """Return the bare question text without choice prompt or image tag.

    Everything from the first " Please choose" onward is dropped. When the
    question carries no choice prompt, a trailing "\\n<image>" tag is still
    removed so that re-adding the format later does not duplicate the tag.

    Args:
        question: Full question string, possibly followed by a choice prompt
            and/or an image tag.

    Returns:
        The question text preceding the choice prompt and image tag.
    """
    question_text = question.split(" Please choose")[0]
    # Without a choice prompt the split above leaves the image tag in place.
    return question_text.split("\n<image>")[0]

def get_choice_prompt(question: str) -> str:
    """Return the "Please choose ..." instruction found in the question.

    The text between the first " Please choose" and the image tag is
    returned with the leading space dropped. A yes/no prompt is returned
    when the question contains no such instruction.
    """
    marker = " Please choose"
    if marker not in question:
        return "Please choose from the following two options: [yes, no]"

    # Segment that follows the first marker, then cut at the image tag.
    after_marker = question.split(marker)[1]
    before_image = after_marker.split("\n<image>")[0]
    return "Please choose" + before_image

def add_format(question: str, prompt: str) -> str:
    """Join question and prompt with a space and append the image tag line."""
    template = "{} {}\n<image>"
    return template.format(question, prompt)

def process_json_file(input_file: str, output_dir: str, variants_per_question: int = 10) -> None:
    """Process the JSON file and create variants for each question.

    The input is read as one JSON object per line; each object must provide
    the keys 'question', 'answer' and 'image'. For every object a file
    ``question_<n>_variants.json`` is written to ``output_dir`` containing the
    original fields plus ``variation_1`` .. ``variation_<k>``.

    Args:
        input_file: Path to the line-delimited JSON input file.
        output_dir: Directory receiving one output file per question; created
            if missing.
        variants_per_question: Number of variations generated per question.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks 'question', 'answer' or 'image'.
    """
    # Create spelling error generator (seeds the module-level RNG, which the
    # random.random() call below also draws from).
    generator = SpellingErrorGenerator(seed=42)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Read the input file; blank lines (e.g. stray trailing newlines) are not
    # records and would otherwise make json.loads fail.
    with open(input_file, 'r', encoding='utf-8') as f:
        records = [line for line in f if line.strip()]

    # Process each line (each JSON object)
    for i, line in enumerate(records):
        data = json.loads(line)

        # Get the original question and prompt without the image tag
        original_question = clean_question(data['question'])
        original_prompt = get_choice_prompt(data['question'])

        # Create base output dictionary
        output_data = {
            "question": data['question'],
            "answer": data['answer'],
            "image": data['image']
        }

        # Generate variants
        for j in range(variants_per_question):
            # The question is always corrupted; the prompt is corrupted as
            # well with probability 0.7. The draw happens first so the
            # sequence of RNG calls stays the same for a given seed.
            corrupt_prompt = random.random() < 0.7

            # Generate variant for question and prompt. Each string is passed
            # whole to apply_error, i.e. it receives one character-level edit.
            variant_question = generator.apply_error(original_question)
            if corrupt_prompt:
                variant_prompt = generator.apply_error(original_prompt)
            else:
                variant_prompt = original_prompt

            # Combine with format
            full_variant = add_format(variant_question, variant_prompt)

            # Add variant to output data
            output_data[f"variation_{j+1}"] = full_variant

        # Write to output file
        output_file = os.path.join(output_dir, f"question_{i+1}_variants.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)

def main():
    """Generate the variant files using the paths configured below."""
    output_dir = "spell_error_question_variants"

    # Process the file
    process_json_file(
        # Change this to your input file path
        "/ephemeral/shashmi/posix_new_improved/Thesis/vanillah_iuxray_json.json",
        output_dir,
    )
    print(f"Generated variants have been saved to the '{output_dir}' directory.")

# Run the generation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

# Output:
# Generated variants have been saved to the 'spell_error_question_variants' directory.
