In [1]:
import random
from typing import List, Tuple
import string

class SpellingErrorGenerator:
    def __init__(self, seed: int = None):
        """Initialize the spelling error generator with an optional seed."""
        if seed is not None:
            random.seed(seed)
        
        # Define keyboard layout for substitution errors
        self.keyboard_adjacency = {
            'q': ['w', 'a'], 'w': ['q', 'e', 's'], 'e': ['w', 'r', 'd'], 'r': ['e', 't', 'f'],
            't': ['r', 'y', 'g'], 'y': ['t', 'u', 'h'], 'u': ['y', 'i', 'j'], 'i': ['u', 'o', 'k'],
            'o': ['i', 'p', 'l'], 'p': ['o'],
            'a': ['q', 's', 'z'], 's': ['w', 'a', 'd', 'x'], 'd': ['e', 's', 'f', 'c'],
            'f': ['r', 'd', 'g', 'v'], 'g': ['t', 'f', 'h', 'b'], 'h': ['y', 'g', 'j', 'n'],
            'j': ['u', 'h', 'k', 'm'], 'k': ['i', 'j', 'l'], 'l': ['o', 'k'],
            'z': ['a', 'x'], 'x': ['s', 'z', 'c'], 'c': ['d', 'x', 'v'],
            'v': ['f', 'c', 'b'], 'b': ['g', 'v', 'n'], 'n': ['h', 'b', 'm'],
            'm': ['j', 'n']
        }

    def _insertion_error(self, word: str) -> str:
        """Insert a random letter within the word."""
        if not word:
            return word
        pos = random.randint(0, len(word))
        letter = random.choice(string.ascii_lowercase)
        return word[:pos] + letter + word[pos:]

    def _omission_error(self, word: str) -> str:
        """Delete a letter at a random position."""
        if len(word) <= 1:
            return word
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]

    def _transposition_error(self, word: str) -> str:
        """Swap two adjacent letters."""
        if len(word) <= 1:
            return word
        pos = random.randint(0, len(word) - 2)
        return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]

    def _substitution_error(self, word: str) -> str:
        """Replace a letter with an adjacent letter on the keyboard."""
        if not word:
            return word
        pos = random.randint(0, len(word) - 1)
        char = word[pos].lower()
        if char in self.keyboard_adjacency:
            replacement = random.choice(self.keyboard_adjacency[char])
            # Preserve original case
            if word[pos].isupper():
                replacement = replacement.upper()
            return word[:pos] + replacement + word[pos + 1:]
        return word

    def apply_error(self, word: str) -> str:
        """Apply a random type of spelling error to the word."""
        error_functions = [
            self._insertion_error,
            self._omission_error,
            self._transposition_error,
            self._substitution_error
        ]
        error_function = random.choice(error_functions)
        return error_function(word)

    def generate_variants(self, text: str, num_variants: int = 20) -> List[str]:
        """
        Generate multiple variants of the input text with spelling errors.
        
        Args:
            text: Input text to generate variants for
            num_variants: Number of variants to generate (default: 20 as per paper)
            
        Returns:
            List of text variants with spelling errors
        """
        words = text.split()
        variants = []
        
        # Number of tokens to modify in each variant
        tokens_to_modify = [1, 2, 4, 8]
        
        for _ in range(num_variants):
            variant_words = words.copy()
            num_tokens = min(random.choice(tokens_to_modify), len(words))
            
            # Select random positions to modify
            positions = random.sample(range(len(words)), num_tokens)
            
            # Apply random errors to selected positions
            for pos in positions:
                variant_words[pos] = self.apply_error(variant_words[pos])
            
            variants.append(' '.join(variant_words))
        
        return variants

def main():
    # Example usage
    generator = SpellingErrorGenerator(seed=42)
    
    # Test text
    text = "The quick brown fox jumps over the lazy dog"
    
    print("Original text:", text)
    print("\nVariants with spelling errors:")
    variants = generator.generate_variants(text, num_variants=5)
    for i, variant in enumerate(variants, 1):
        print(f"{i}. {variant}")

if __name__ == "__main__":
    main()

Original text: The quick brown fox jumps over the lazy dog

Variants with spelling errors:
1. hTe quick brown fox jumps over the lazy dog
2. The cquick brownn fox jumps over the lazy dog
3. Yhe quick brown fo jumps over the lazy dog
4. Teh quick bown foxd jumps voer the lazy dog
5. vThe quick brown fox jumpjs pver hte lazy dog


## Lets go over all the files 

In [2]:
import os
from typing import List
import random
from pathlib import Path

class SpellingErrorGenerator:
    def __init__(self, seed: int = None):
        """Initialize the spelling error generator with an optional seed."""
        if seed is not None:
            random.seed(seed)
        
        # Define keyboard layout for substitution errors
        self.keyboard_adjacency = {
            'q': ['w', 'a'], 'w': ['q', 'e', 's'], 'e': ['w', 'r', 'd'], 'r': ['e', 't', 'f'],
            't': ['r', 'y', 'g'], 'y': ['t', 'u', 'h'], 'u': ['y', 'i', 'j'], 'i': ['u', 'o', 'k'],
            'o': ['i', 'p', 'l'], 'p': ['o'],
            'a': ['q', 's', 'z'], 's': ['w', 'a', 'd', 'x'], 'd': ['e', 's', 'f', 'c'],
            'f': ['r', 'd', 'g', 'v'], 'g': ['t', 'f', 'h', 'b'], 'h': ['y', 'g', 'j', 'n'],
            'j': ['u', 'h', 'k', 'm'], 'k': ['i', 'j', 'l'], 'l': ['o', 'k'],
            'z': ['a', 'x'], 'x': ['s', 'z', 'c'], 'c': ['d', 'x', 'v'],
            'v': ['f', 'c', 'b'], 'b': ['g', 'v', 'n'], 'n': ['h', 'b', 'm'],
            'm': ['j', 'n']
        }

    def _insertion_error(self, word: str) -> str:
        """Insert a random letter within the word."""
        if not word:
            return word
        pos = random.randint(0, len(word))
        letter = random.choice('abcdefghijklmnopqrstuvwxyz')
        return word[:pos] + letter + word[pos:]

    def _omission_error(self, word: str) -> str:
        """Delete a letter at a random position."""
        if len(word) <= 1:
            return word
        pos = random.randint(0, len(word) - 1)
        return word[:pos] + word[pos + 1:]

    def _transposition_error(self, word: str) -> str:
        """Swap two adjacent letters."""
        if len(word) <= 1:
            return word
        pos = random.randint(0, len(word) - 2)
        return word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:]

    def _substitution_error(self, word: str) -> str:
        """Replace a letter with an adjacent letter on the keyboard."""
        if not word:
            return word
        pos = random.randint(0, len(word) - 1)
        char = word[pos].lower()
        if char in self.keyboard_adjacency:
            replacement = random.choice(self.keyboard_adjacency[char])
            if word[pos].isupper():
                replacement = replacement.upper()
            return word[:pos] + replacement + word[pos + 1:]
        return word

    def apply_error(self, word: str) -> str:
        """Apply a random type of spelling error to the word."""
        error_functions = [
            self._insertion_error,
            self._omission_error,
            self._transposition_error,
            self._substitution_error
        ]
        return random.choice(error_functions)(word)

def process_batch_file(file_path: str, generator: SpellingErrorGenerator) -> List[str]:
    """Process a single batch file and return prompts with variations."""
    with open(file_path, 'r') as f:
        lines = [line.strip() for line in f.readlines() if line.strip()]
    
    result = []
    for i, line in enumerate(lines, 1):
        # Remove the leading number and period if present
        original = line.split('. ', 1)[-1].strip()
        
        # Create prompt set header
        result.append(f"Prompt Set {i}:")
        result.append(f"Original: {original}")
        
        # Generate 9 variations
        for j in range(1, 10):
            # Split into words and introduce errors
            words = original.split()
            # Randomly select 1-3 words to modify
            num_words_to_modify = random.randint(1, 3)
            positions = random.sample(range(len(words)), num_words_to_modify)
            
            for pos in positions:
                words[pos] = generator.apply_error(words[pos])
            
            variation = ' '.join(words)
            result.append(f"Variation {j}: {variation}")
        
        # Add blank line between prompt sets
        result.append("")
    
    return result

def convert_batch_files(input_dir: str, output_dir: str):
    """Convert all batch files in the input directory and save to output directory."""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Initialize spelling error generator
    generator = SpellingErrorGenerator(seed=42)
    
    # Process each batch file
    for filename in os.listdir(input_dir):
        if filename.startswith('batch_') and filename.endswith('.txt'):
            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, f"variations_{filename}")
            
            # Process the file and write results
            variations = process_batch_file(input_path, generator)
            with open(output_path, 'w') as f:
                f.write('\n'.join(variations))
            
            print(f"Processed {filename}")

if __name__ == "__main__":
    # Example usage
    input_directory = "/share/ssddata/sarimhashmi/iuxray_posix_prompts/text_batches"
    output_directory = "/share/ssddata/sarimhashmi/iuxray_posix_prompts/spell_error"
    
    convert_batch_files(input_directory, output_directory)

Processed batch_47.txt
Processed batch_36.txt
Processed batch_31.txt
Processed batch_6.txt
Processed batch_78.txt
Processed batch_53.txt
Processed batch_25.txt
Processed batch_19.txt
Processed batch_33.txt
Processed batch_35.txt
Processed batch_3.txt
Processed batch_18.txt
Processed batch_2.txt
Processed batch_13.txt
Processed batch_79.txt
Processed batch_51.txt
Processed batch_46.txt
Processed batch_58.txt
Processed batch_56.txt
Processed batch_38.txt
Processed batch_4.txt
Processed batch_29.txt
Processed batch_80.txt
Processed batch_23.txt
Processed batch_72.txt
Processed batch_26.txt
Processed batch_17.txt
Processed batch_7.txt
Processed batch_30.txt
Processed batch_32.txt
Processed batch_55.txt
Processed batch_8.txt
Processed batch_16.txt
Processed batch_74.txt
Processed batch_34.txt
Processed batch_40.txt
Processed batch_43.txt
Processed batch_42.txt
Processed batch_67.txt
Processed batch_59.txt
Processed batch_28.txt
Processed batch_20.txt
Processed batch_64.txt
Processed batch_6