In [1]:
from Levenshtein import distance
import os

def parse_prompt_sets(file_path):
    """Parse a prompt-variation text file into a list of prompt sets.

    The file is split on the marker ``Prompt Set``. Within each section the
    first line supplies the set number (the text before the first ``:``),
    the second line supplies the original prompt, and every later line
    that starts with ``Variation`` supplies one variation::

        Prompt Set 1:
        Original: <prompt text>
        Variation 1: <perturbed prompt text>

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of dicts, one per section, with keys ``'set_num'`` (int),
        ``'original'`` (str) and ``'variations'`` (list of str).

    Raises:
        IndexError: If a section has no second line, or a label line has
            no ``': '`` separator.
        ValueError: If the set number is not an integer.
    """
    # Decode explicitly so parsing does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split text into prompt sets
    prompt_sets = text.split("Prompt Set")[1:]  # Skip text before the first marker

    parsed_sets = []
    for prompt_set in prompt_sets:
        lines = prompt_set.strip().split('\n')
        set_num = int(lines[0].split(':')[0])

        # Split only on the FIRST ': ' so that prompt text which itself
        # contains ': ' is kept whole instead of being truncated.
        original = lines[1].split(': ', 1)[1]

        variations = [
            line.split(': ', 1)[1]
            for line in lines[2:]
            if line.startswith('Variation')
        ]

        parsed_sets.append({
            'set_num': set_num,
            'original': original,
            'variations': variations
        })

    return parsed_sets

def calculate_edit_distances(prompt_sets, file_name):
    """Compute and print the edit distance of each variation from its original.

    For every prompt set this prints the original prompt, the distance and
    text of each variation, and the average / maximum / minimum distance
    for the set.

    Args:
        prompt_sets: Iterable of dicts with keys ``'set_num'``,
            ``'original'`` and ``'variations'`` (as produced by
            ``parse_prompt_sets``).
        file_name: Name of the source file; printed in the header and
            stored in every result record.

    Returns:
        A list of dicts, one per variation, with keys ``'file_name'``,
        ``'set_num'``, ``'variation_num'`` (1-based), ``'edit_distance'``,
        ``'original'`` and ``'variation'``.
    """
    results = []

    print(f"\nFile: {file_name}")
    print("=" * 80)

    for prompt_set in prompt_sets:
        set_num = prompt_set['set_num']
        original = prompt_set['original']

        print(f"\nPrompt Set {set_num}:")
        print(f"Original: {original}")
        print("\nEdit distances:")

        set_distances = []  # Store distances for this set

        for i, variation in enumerate(prompt_set['variations'], 1):
            edit_dist = distance(original, variation)
            results.append({
                'file_name': file_name,
                'set_num': set_num,
                'variation_num': i,
                'edit_distance': edit_dist,
                'original': original,
                'variation': variation
            })
            set_distances.append(edit_dist)
            print(f"Variation {i}: {edit_dist} changes")
            print(f"Text: {variation}")

        # Print statistics for this set
        print(f"\nSet {set_num} Statistics:")
        if set_distances:
            avg_dist = sum(set_distances) / len(set_distances)
            print(f"Average edit distance: {avg_dist:.2f}")
            print(f"Maximum edit distance: {max(set_distances)}")
            print(f"Minimum edit distance: {min(set_distances)}")
        else:
            # A set without variations would otherwise divide by zero and
            # call max()/min() on an empty list.
            print("No variations found for this set.")
        print("-" * 80)

    return results

def process_directory(directory_path):
    """Process every ``.txt`` file in a directory and print summary statistics.

    Files are visited in sorted name order. Each one is parsed with
    ``parse_prompt_sets`` and scored with ``calculate_edit_distances``;
    a per-file summary is printed after each file and an overall summary
    after all files.

    Args:
        directory_path: Directory whose ``.txt`` entries are processed.

    Returns:
        The concatenated list of per-variation result dicts from all files
        (empty if the directory holds no ``.txt`` files or no variations).
    """
    all_results = []

    # Process each file in the directory
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue

        file_path = os.path.join(directory_path, filename)
        prompt_sets = parse_prompt_sets(file_path)
        results = calculate_edit_distances(prompt_sets, filename)
        all_results.extend(results)

        # Calculate per-file statistics
        distances = [r['edit_distance'] for r in results]
        print(f"\nSummary Statistics for {filename}:")
        if distances:
            print(f"Average edit distance: {sum(distances)/len(distances):.2f}")
            print(f"Maximum edit distance: {max(distances)}")
            print(f"Minimum edit distance: {min(distances)}")
        else:
            # A file with no variations would otherwise divide by zero.
            print("No variations found in this file.")
        print("=" * 80)

    # Calculate overall statistics
    print("\nOverall Summary Statistics:")
    all_distances = [r['edit_distance'] for r in all_results]
    if all_distances:
        print(f"Overall average edit distance: {sum(all_distances)/len(all_distances):.2f}")
        print(f"Overall maximum edit distance: {max(all_distances)}")
        print(f"Overall minimum edit distance: {min(all_distances)}")
    else:
        # Nothing was scored (no .txt files, or none contained variations).
        print("No variations found.")

    return all_results

# Directory containing the prompt-variation .txt files to analyse.
# NOTE: this is a machine-specific absolute path.
directory_path = "/share/ssddata/sarimhashmi/iuxray_posix_prompts/spell_error"  # Replace with your directory path

# Process all .txt files in the directory: prints per-set, per-file and
# overall edit-distance statistics, and keeps the per-variation records
# returned by process_directory in `results`.
results = process_directory(directory_path)


File: variations_batch_1.txt

Prompt Set 1:
Original: Is there evidence of focal consolidation in either lung?

Edit distances:
Variation 1: 3 changes
Text: Is there vidence of focal conolidation ib either lung?
Variation 2: 3 changes
Text: Is thee evidence or focal consolidation in either lujng?
Variation 3: 4 changes
Text: sI there evlidence of focal consolidation in eiither lung?
Variation 4: 3 changes
Text: sI there evidence of focal consolidation in either lungx?
Variation 5: 3 changes
Text: Ix there evidence o foal consolidation in either lung?
Variation 6: 2 changes
Text: Is here evidence of focal consolidation i either lung?
Variation 7: 1 changes
Text: Is there evidence of focal consolidation in either ung?
Variation 8: 1 changes
Text: Ihs there evidence of focal consolidation in either lung?
Variation 9: 2 changes
Text: Is there vidence of fodal consolidation in either lung?

Set 1 Statistics:
Average edit distance: 2.44
Maximum edit distance: 4
Minimum edit distance: 1
----