In [11]:
import json
import os
from pathlib import Path

def extract_titles_from_json_files(directory_path):
    """Extract all unique titles from the JSON files in a directory.

    Every ``*.json`` file directly inside ``directory_path`` is parsed and its
    titles are collected. Three layouts are recognised:

    * a dict with a ``"stories"`` list (only non-empty ``"title"`` values are kept),
    * a dict with a ``"texts"`` list (only non-empty ``"title"`` values are kept),
    * a top-level list of dicts, or a dict whose list values hold dicts, in
      which case every ``"title"`` value that is present is kept.

    A per-file listing, an alphabetical summary and a title-to-files
    distribution are printed along the way.

    Args:
        directory_path: Directory to scan (anything ``pathlib.Path`` accepts).

    Returns:
        tuple: ``(all_titles, file_titles)`` where ``all_titles`` is the set of
        unique titles and ``file_titles`` maps each file name to the list of
        titles found in it (an empty list for files that could not be read).
        When the directory holds no JSON file the result is ``(set(), {})``,
        so callers can always unpack it.
    """
    directory = Path(directory_path)
    all_titles = set()
    file_titles = {}

    print(f"Scanning directory: {directory_path}")
    print("=" * 60)

    # Get all JSON files in the directory
    json_files = list(directory.glob("*.json"))

    if not json_files:
        print("No JSON files found in the directory!")
        # Keep the return shape of the normal path: a bare `return` made
        # `titles, breakdown = extract_titles_from_json_files(...)` raise TypeError.
        return all_titles, file_titles

    print(f"Found {len(json_files)} JSON file(s):")

    for json_file in json_files:
        print(f"\nProcessing: {json_file.name}")
        file_titles[json_file.name] = []

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            titles_in_file = _collect_titles(data)
            all_titles.update(titles_in_file)

            file_titles[json_file.name] = titles_in_file
            print(f"  Found {len(titles_in_file)} titles:")
            for i, title in enumerate(titles_in_file, 1):
                print(f"    {i}. {title}")

        except json.JSONDecodeError as e:
            print(f"  Error reading JSON: {e}")
        except Exception as e:
            # Best effort: one malformed file must not stop the whole scan.
            print(f"  Unexpected error: {e}")

    # Summary
    print(f"\n{'='*60}")
    print(f"SUMMARY")
    print(f"{'='*60}")
    print(f"Total unique titles across all files: {len(all_titles)}")

    print(f"\nAll unique titles (alphabetically sorted):")
    for i, title in enumerate(sorted(all_titles), 1):
        print(f"{i:2d}. {title}")

    # Show which files contain each title
    print(f"\nTitle distribution across files:")
    for title in sorted(all_titles):
        files_with_title = [filename for filename, titles in file_titles.items() if title in titles]
        print(f"'{title}' appears in: {', '.join(files_with_title)}")

    return all_titles, file_titles


def _collect_titles(data):
    """Return the titles found in one parsed JSON document, in document order."""
    if isinstance(data, dict):
        # Known layouts; "stories" wins over "texts" when both are present.
        for key in ("stories", "texts"):
            if key in data:
                titles = []
                for entry in data[key]:
                    title = entry.get("title", "")
                    if title:
                        titles.append(title)
                return titles
        # Unknown dict layout: look inside every list value.
        candidates = [
            item
            for value in data.values()
            if isinstance(value, list)
            for item in value
        ]
    elif isinstance(data, list):
        candidates = data
    else:
        candidates = []

    return [item["title"] for item in candidates if isinstance(item, dict) and "title" in item]

# Run the analysis on the golden-answers directory.
# `unique_titles` is read again by the directory comparison in the next cell.
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
directory_path = "/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers"
unique_titles, file_breakdown = extract_titles_from_json_files(directory_path)

Scanning directory: /Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers
Found 4 JSON file(s):

Processing: seconda-elementare_with_answers.json
  Found 14 titles:
    1. una sciarpa per la scuola
    2. giovanni l’inventore
    3. una bella sorpresa
    4. un fantasma a pallini
    5. il mago dormiglione
    6. l'ora di punta
    7. tutti in gita!
    8. troppo bianchi
    9. zanzare
    10. dov'è biancaneve?
    11. un vento bellissimo
    12. un libro divertente
    13. dino resta a casa
    14. l'orologio

Processing: quinta-elementare_with_answers.json
  Found 17 titles:
    1. tre volte bau
    2. il gatto osiride
    3. verso i mari del sud
    4. mi piaci, sai
    5. la testimone oculare
    6. il lavoro di mio padre
    7. davanti al portone
    8. la scomparsa del lunedì
    9. i doni di mio padre
    10. vita in famiglia
    11. destinazione norvegia
    12. il gigante dahl
    13. la donna che volò sugli oceani
    14. henri matisse
    15. il conte dracu

In [12]:
# Analyze the second (user data) directory and compare its titles with the
# golden-answers titles collected in the previous cell (`unique_titles`).
print("\n" + "="*80)
print("ANALYZING SECOND DIRECTORY")
print("="*80)

# Run the analysis on the second directory
directory_path_2 = "/Users/Martina.Galletti/Downloads/dati-artis1/dati-json"
unique_titles_2, file_breakdown_2 = extract_titles_from_json_files(directory_path_2)

# Compare the two directories
print("\n" + "="*80)
print("COMPARISON BETWEEN DIRECTORIES")
print("="*80)

print(f"Golden answers directory (/json-with-correct-answers): {len(unique_titles)} unique titles")
print(f"User data directory (/dati-json): {len(unique_titles_2)} unique titles")

# Exact, case-sensitive set comparison of the raw titles
common_titles = unique_titles.intersection(unique_titles_2)
only_in_golden = unique_titles - unique_titles_2
only_in_user = unique_titles_2 - unique_titles

print(f"\nCommon titles between both directories: {len(common_titles)}")
for i, title in enumerate(sorted(common_titles), 1):
    print(f"  {i}. {title}")

print(f"\nTitles ONLY in golden answers directory: {len(only_in_golden)}")
for i, title in enumerate(sorted(only_in_golden), 1):
    print(f"  {i}. {title}")

print(f"\nTitles ONLY in user data directory: {len(only_in_user)}")
for i, title in enumerate(sorted(only_in_user), 1):
    print(f"  {i}. {title}")

# Title matching analysis
larger_title_count = max(len(unique_titles), len(unique_titles_2))
# Both sets can be empty (files without titles); report 0.0% instead of dividing by zero.
match_rate = (len(common_titles) / larger_title_count) * 100 if larger_title_count else 0.0
print(f"\nTITLE MATCHING ANALYSIS:")
print(f"Match rate: {len(common_titles)}/{larger_title_count} = {match_rate:.1f}%")


ANALYZING SECOND DIRECTORY
Scanning directory: /Users/Martina.Galletti/Downloads/dati-artis1/dati-json
Found 12 JSON file(s):

Processing: risultati-user12.json
  Found 9 titles:
    1. Una Bella Sorpresa
    2. Un Fantasma a Pallini
    3. Zanzare
    4. Tutti in Gita!
    5. Troppo Bianchi
    6. Un Vento Bellissimo
    7. L'Orologio
    8. Un Libro Divertente
    9. Dino resta a casa

Processing: risultati-user24.json
  Found 10 titles:
    1. il gatto osiride
    2. verso i mari del sud
    3. il lavoro di mio padre
    4. davanti al portone
    5. la scomparsa del lunedì
    6. i doni di mio padre
    7. vita in famiglia
    8. destinazione norvegia
    9. henri matisse
    10. il conte dracula

Processing: risultati-user1.json
  Found 12 titles:
    1. tre volte bau
    2. il gatto osiride
    3. verso i mari del sud
    4. la testimone oculare
    5. il lavoro di mio padre
    6. davanti al portone
    7. la scomparsa del lunedì
    8. i doni di mio padre
    9. vita in famigli

Still unmatched in golden answers:
  1. come la notte scese sul mondo
  2. david barbagrigia
  3. era una notte buia...
  4. festa di halloween
  5. giovanni l’inventore
  6. incontro col mammut
  7. la donna che volò sugli oceani
  8. la figlia dell'aria e le uova dell'aquila
  9. la stanza dello scrittoio
  10. la strega settimia
  11. l’orologio
  12. l’ultima estate, berlino 1961
  13. l’uomo con la barba
  14. medicina o veleno?
  15. mi piaci, sai
  16. tre sirene e un pirata
  17. troppo bianchi!
  18. un nuovo amico... anzi due

Still unmatched in user data:
  1. david barbarigia
  2. incontro con il mammut
  3. l'orologio
  4. la festa di halloween
  5. la figlia dell'aria e dell'aquila
  6. troppo bianchi

In [13]:
def normalize_title(title):
    """Return ``title`` without leading/trailing whitespace and in lowercase."""
    trimmed = title.strip()
    return trimmed.lower()

def create_title_mapping():
    """Create manual title mapping between golden answers and user data.

    Returns:
        dict: Maps a title as spelled in the golden-answers files to the
        spelling used in the user-data files. The entries are example
        mappings and need to be adjusted to the actual comparison output.
    """
    # Format: "Golden Answer Title": "User Data Title"
    # Each key appears exactly once: a repeated key in a dict literal is
    # silently overwritten, which hides copy-paste mistakes.
    title_mapping = {
        "Il dono di mio padre": "I Doni di mio Padre",
        "La testimone oculare": "La Testimone Oculare",
        "Il ritardatario": "Il Ritardatario",
        "Davanti al portone": "Davanti al Portone",
        "Il gatto Osiride": "Il Gatto Osiride",
        "La città di ghiaccio": "La Città di Ghiaccio",
        "A che cosa servono i denti?": "A cosa servono i denti?",
        "Il gigante Dahl": "Il Gigante Dahl",
        "Il lavoro di mio padre": "Il Lavoro di Mio Padre",
        "Il conte Dracula": "Il Conte Dracula",
        "Vita in famiglia": "Vita in Famiglia",
        "Il cane poeta": "Il Cane Poeta",
        "La scomparsa del lunedì": "La Scomparsa del Lunedì",
    }

    return title_mapping

def normalize_titles_to_lowercase(directory=None):
    """Normalize all titles in golden answers files to lowercase.

    Args:
        directory: Directory holding the golden-answers ``*.json`` files.
            Defaults to the notebook's original hard-coded location.

    Returns:
        None. Files whose titles change are rewritten in place; progress is
        printed.
    """
    if directory is None:
        directory = "/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers"

    print("NORMALIZING TITLES TO LOWERCASE IN GOLDEN ANSWERS FILES")
    print("=" * 60)

    _lowercase_titles_in_directory(Path(directory))

    print(f"\nLowercase normalization complete!")


def normalize_user_data_titles(directory=None):
    """Normalize all titles in user data files to lowercase.

    Args:
        directory: Directory holding the user-data ``*.json`` files.
            Defaults to the notebook's original hard-coded location.

    Returns:
        None. Files whose titles change are rewritten in place; progress is
        printed.
    """
    if directory is None:
        directory = "/Users/Martina.Galletti/Downloads/dati-artis1/dati-json"

    print("NORMALIZING TITLES TO LOWERCASE IN USER DATA FILES")
    print("=" * 55)

    _lowercase_titles_in_directory(Path(directory))

    print(f"\nUser data lowercase normalization complete!")


def _lowercase_titles_in_directory(directory):
    """Rewrite every ``*.json`` file in ``directory`` with ``normalize_title``-d titles."""
    for json_file in directory.glob("*.json"):
        print(f"\nProcessing: {json_file.name}")

        try:
            # Load the JSON file
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different JSON structures ("stories" takes precedence)
            if "stories" in data:
                entries = data["stories"]
            elif "texts" in data:
                entries = data["texts"]
            else:
                entries = []

            changes_made = 0
            for entry in entries:
                original_title = entry.get("title", "")
                if not original_title:
                    continue
                normalized_title = normalize_title(original_title)
                if original_title != normalized_title:
                    entry["title"] = normalized_title
                    print(f"  Changed: '{original_title}' -> '{normalized_title}'")
                    changes_made += 1

            # Save the modified file if changes were made
            if changes_made > 0:
                # Back up first, and only once, so the backup keeps the oldest state.
                backup_file = json_file.with_suffix('.json.backup')
                if not backup_file.exists():
                    # Byte copy: the backup is the untouched original file,
                    # not a re-parsed and re-formatted dump of it.
                    backup_file.write_bytes(json_file.read_bytes())
                    print(f"  Backup created: {backup_file.name}")

                # Save the normalized file
                with open(json_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                print(f"  File updated with {changes_made} changes")
            else:
                print(f"  No changes needed")

        except Exception as e:
            # Best effort: report the file and continue with the next one.
            print(f"  Error processing {json_file.name}: {e}")

# Execute lowercase normalization for both directories
print("STEP 1: NORMALIZE ALL TITLES TO LOWERCASE")
print("="*50)

# Normalize golden answers
normalize_titles_to_lowercase()

print("\n" + "="*50)

# Normalize user data
normalize_user_data_titles()

print("\n" + "="*50)
print("STEP 2: RE-ANALYZE AFTER LOWERCASE NORMALIZATION")
print("="*50)

# Re-run the analysis after lowercase normalization (the files on disk were just rewritten)
unique_titles_normalized, _ = extract_titles_from_json_files("/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers")
unique_titles_2_normalized, _ = extract_titles_from_json_files("/Users/Martina.Galletti/Downloads/dati-artis1/dati-json")

# Compare after normalization
common_normalized = unique_titles_normalized.intersection(unique_titles_2_normalized)
only_in_golden_normalized = unique_titles_normalized - unique_titles_2_normalized
only_in_user_normalized = unique_titles_2_normalized - unique_titles_normalized

larger_normalized_count = max(len(unique_titles_normalized), len(unique_titles_2_normalized))
# Both sets can be empty; report 0.0% instead of dividing by zero.
match_rate_normalized = (len(common_normalized) / larger_normalized_count) * 100 if larger_normalized_count else 0.0

print(f"\nResults after lowercase normalization:")
print(f"Golden answers titles: {len(unique_titles_normalized)}")
print(f"User data titles: {len(unique_titles_2_normalized)}")
print(f"Common titles: {len(common_normalized)}")
print(f"Match rate: {len(common_normalized)}/{larger_normalized_count} = {match_rate_normalized:.1f}%")

print(f"\nMatching titles after normalization:")
for i, title in enumerate(sorted(common_normalized), 1):
    print(f"  {i}. {title}")

if only_in_golden_normalized:
    print(f"\nStill unmatched in golden answers:")
    for i, title in enumerate(sorted(only_in_golden_normalized), 1):
        print(f"  {i}. {title}")

if only_in_user_normalized:
    print(f"\nStill unmatched in user data:")
    for i, title in enumerate(sorted(only_in_user_normalized), 1):
        print(f"  {i}. {title}")

STEP 1: NORMALIZE ALL TITLES TO LOWERCASE
NORMALIZING TITLES TO LOWERCASE IN GOLDEN ANSWERS FILES

Processing: seconda-elementare_with_answers.json
  No changes needed

Processing: quinta-elementare_with_answers.json
  No changes needed

Processing: terza-elementare_with_answers.json
  No changes needed

Processing: quarta-elementare_with_answers.json
  No changes needed

Lowercase normalization complete!

NORMALIZING TITLES TO LOWERCASE IN USER DATA FILES

Processing: risultati-user12.json
  Changed: 'Una Bella Sorpresa' -> 'una bella sorpresa'
  Changed: 'Un Fantasma a Pallini' -> 'un fantasma a pallini'
  Changed: 'Zanzare' -> 'zanzare'
  Changed: 'Tutti in Gita!' -> 'tutti in gita!'
  Changed: 'Troppo Bianchi' -> 'troppo bianchi'
  Changed: 'Un Vento Bellissimo' -> 'un vento bellissimo'
  Changed: 'L'Orologio' -> 'l'orologio'
  Changed: 'Un Libro Divertente' -> 'un libro divertente'
  Changed: 'Dino resta a casa' -> 'dino resta a casa'
  Backup created: risultati-user12.json.backup

In [14]:
def harmonize_json_files(mapping=None, directory=None):
    """Harmonize titles in the golden answers JSON files.

    Every title that is a key of ``mapping`` is replaced by the mapped value
    and the file is rewritten in place. Before the first rewrite of a file a
    ``<name>.json.backup2`` copy is created.

    Args:
        mapping: ``{current title: replacement title}``. When omitted,
            ``create_title_mapping()`` is called; the name is resolved at call
            time, so rebinding that function before the call still takes effect.
        directory: Directory holding the golden-answers ``*.json`` files.
            Defaults to the notebook's original hard-coded location.

    Returns:
        None. Progress is printed.
    """
    # Get the title mapping
    if mapping is None:
        mapping = create_title_mapping()

    if not mapping:
        print("No title mappings defined. Please add mappings to create_title_mapping() function.")
        return

    # Path to the golden answers directory
    if directory is None:
        directory = "/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers"
    golden_dir = Path(directory)

    print("HARMONIZING TITLES IN GOLDEN ANSWERS FILES")
    print("=" * 50)

    for json_file in golden_dir.glob("*.json"):
        print(f"\nProcessing: {json_file.name}")

        try:
            # Load the JSON file
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different JSON structures ("stories" takes precedence)
            if "stories" in data:
                entries = data["stories"]
            elif "texts" in data:
                entries = data["texts"]
            else:
                entries = []

            changes_made = 0
            for entry in entries:
                original_title = entry.get("title", "")
                if original_title in mapping:
                    new_title = mapping[original_title]
                    entry["title"] = new_title
                    print(f"  Changed: '{original_title}' -> '{new_title}'")
                    changes_made += 1

            # Save the modified file if changes were made
            if changes_made > 0:
                # Create backup first
                backup_file = json_file.with_suffix('.json.backup2')
                if not backup_file.exists():
                    # Read the source before touching the backup path: opening
                    # the backup for writing first left an empty backup behind
                    # on failure, which then blocked every later backup.
                    backup_file.write_bytes(json_file.read_bytes())
                    print(f"  Backup created: {backup_file.name}")

                # Save the harmonized file
                with open(json_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                print(f"  File updated with {changes_made} changes")
            else:
                print(f"  No changes needed")

        except Exception as e:
            # Best effort: report the file and continue with the next one.
            print(f"  Error processing {json_file.name}: {e}")

    print(f"\nHarmonization complete!")

# Update the create_title_mapping function with the remaining mismatches
def create_title_mapping_final():
    """Return the golden-title -> user-title mapping for the mismatches left after lowercasing."""
    # (golden answers spelling, user data spelling)
    remaining_mismatches = [
        ("david barbagrigia", "david barbarigia"),
        ("incontro col mammut", "incontro con il mammut"),
        ("festa di halloween", "la festa di halloween"),
        ("la figlia dell'aria e le uova dell'aquila", "la figlia dell'aria e dell'aquila"),
        ("troppo bianchi!", "troppo bianchi"),
        ("l’orologio", "l'orologio"),
    ]
    return dict(remaining_mismatches)

# STEP 3: APPLY FINAL HARMONIZATION
print("\n" + "="*60)
print("STEP 3: APPLY FINAL TITLE HARMONIZATION")
print("="*60)

# Temporarily replace the mapping function
original_create_mapping = create_title_mapping
create_title_mapping = create_title_mapping_final

try:
    # Execute the harmonization with the final mappings
    harmonize_json_files()
finally:
    # Restore original function even if harmonization raised, so later cells
    # never see the temporary replacement.
    create_title_mapping = original_create_mapping

# STEP 4: FINAL VERIFICATION
print("\n" + "="*60)
print("STEP 4: FINAL VERIFICATION")
print("="*60)

# Re-run the analysis to see final results (user-data titles come from the previous cell)
unique_titles_final, _ = extract_titles_from_json_files("/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers")
common_final = unique_titles_final.intersection(unique_titles_2_normalized)

larger_final_count = max(len(unique_titles_final), len(unique_titles_2_normalized))
# Both sets can be empty; report 0.0% instead of dividing by zero.
match_rate_final = (len(common_final) / larger_final_count) * 100 if larger_final_count else 0.0

print(f"\nFINAL RESULTS:")
print(f"Golden answers titles: {len(unique_titles_final)}")
print(f"User data titles: {len(unique_titles_2_normalized)}")
print(f"Common titles: {len(common_final)}")
print(f"Match rate: {len(common_final)}/{larger_final_count} = {match_rate_final:.1f}%")

print(f"\nFinal matching titles:")
for i, title in enumerate(sorted(common_final), 1):
    print(f"  {i}. {title}")

# Check for any remaining unmatched titles
only_in_golden_final = unique_titles_final - unique_titles_2_normalized
only_in_user_final = unique_titles_2_normalized - unique_titles_final

if only_in_golden_final:
    print(f"\nRemaining unmatched in golden answers:")
    for i, title in enumerate(sorted(only_in_golden_final), 1):
        print(f"  {i}. {title}")

if only_in_user_final:
    print(f"\nRemaining unmatched in user data:")
    for i, title in enumerate(sorted(only_in_user_final), 1):
        print(f"  {i}. {title}")

print(f"\n🎉 HARMONIZATION COMPLETE! You should now have {len(common_final)} matching texts instead of 4!")


STEP 3: APPLY FINAL TITLE HARMONIZATION
HARMONIZING TITLES IN GOLDEN ANSWERS FILES

Processing: seconda-elementare_with_answers.json
  No changes needed

Processing: quinta-elementare_with_answers.json
  No changes needed

Processing: terza-elementare_with_answers.json
  No changes needed

Processing: quarta-elementare_with_answers.json
  No changes needed

Harmonization complete!

STEP 4: FINAL VERIFICATION
Scanning directory: /Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers
Found 4 JSON file(s):

Processing: seconda-elementare_with_answers.json
  Found 14 titles:
    1. una sciarpa per la scuola
    2. giovanni l’inventore
    3. una bella sorpresa
    4. un fantasma a pallini
    5. il mago dormiglione
    6. l'ora di punta
    7. tutti in gita!
    8. troppo bianchi
    9. zanzare
    10. dov'è biancaneve?
    11. un vento bellissimo
    12. un libro divertente
    13. dino resta a casa
    14. l'orologio

Processing: quinta-elementare_with_answers.json
  Fo

In [15]:
def harmonize_question_texts(user_dir=None, golden_dir=None, question_corrections=None):
    """Harmonize question texts in both user data and golden answers files to fix typos.

    For every ``*.json`` file in the two directories, each question whose
    stripped ``"question"`` text is a key of ``question_corrections`` is
    replaced by the mapped text. Changed files are rewritten in place after a
    one-time ``<name>.json.questions_backup`` copy has been made.

    Args:
        user_dir: Directory with the user-data files. Defaults to the
            notebook's original hard-coded location.
        golden_dir: Directory with the golden-answers files. Defaults to the
            notebook's original hard-coded location.
        question_corrections: ``{misspelled question: corrected question}``.
            Defaults to the single known typo.

    Returns:
        None. Progress is printed.
    """
    # Define question text corrections
    if question_corrections is None:
        question_corrections = {
            "Cosa fa la penna quando l'autirce cerca di fermarsi?": "Cosa fa la penna quando l'autrice cerca di fermarsi?",
            # Add more question corrections here as needed
        }
    if user_dir is None:
        user_dir = "/Users/Martina.Galletti/Downloads/dati-artis1/dati-json"
    if golden_dir is None:
        golden_dir = "/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers"

    print("HARMONIZING QUESTION TEXTS")
    print("=" * 50)

    # Process user data files
    user_dir = Path(user_dir)
    print(f"\nProcessing USER DATA files in: {user_dir}")
    _fix_questions_in_directory(user_dir, question_corrections)

    # Process golden answers files
    golden_dir = Path(golden_dir)
    print(f"\nProcessing GOLDEN ANSWERS files in: {golden_dir}")
    _fix_questions_in_directory(golden_dir, question_corrections)

    print(f"\n✅ Question text harmonization complete!")


def _collect_question_entries(data):
    """Return the question dicts of one parsed document without altering its structure."""
    collected = []
    if "stories" in data:
        for story in data["stories"]:
            direct_questions = story.get("questions", [])
            if direct_questions:
                collected.extend(direct_questions)
            elif "exercises" in story:
                # Gather into `collected`, never into the story's own list:
                # extending an existing empty "questions" list in place copied
                # the exercise questions into the story when the file was saved.
                for exercise in story["exercises"]:
                    collected.extend(exercise.get("questions", []))
    elif "texts" in data:
        for text in data["texts"]:
            if "exercises" in text:
                for exercise in text["exercises"]:
                    collected.extend(exercise.get("questions", []))
    return collected


def _fix_questions_in_directory(directory, question_corrections):
    """Apply ``question_corrections`` to every ``*.json`` file in ``directory``."""
    for json_file in directory.glob("*.json"):
        print(f"\n  Processing: {json_file.name}")

        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            changes_made = 0

            for q in _collect_question_entries(data):
                question_text = q.get("question", "")
                if not isinstance(question_text, str):
                    # A missing/null question cannot match a correction; skip it
                    # rather than abandoning the whole file.
                    continue
                original_question = question_text.strip()
                if original_question in question_corrections:
                    corrected_question = question_corrections[original_question]
                    q["question"] = corrected_question
                    print(f"    Fixed: '{original_question}' -> '{corrected_question}'")
                    changes_made += 1

            # Save if changes were made
            if changes_made > 0:
                # Create backup (byte copy of the untouched file, made only once)
                backup_file = json_file.with_suffix('.json.questions_backup')
                if not backup_file.exists():
                    backup_file.write_bytes(json_file.read_bytes())
                    print(f"    Backup created: {backup_file.name}")

                # Save corrected file
                with open(json_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, ensure_ascii=False, indent=2)
                print(f"    File updated with {changes_made} question corrections")
            else:
                print(f"    No question corrections needed")

        except Exception as e:
            # Best effort: report the file and continue with the next one.
            print(f"    Error processing {json_file.name}: {e}")

def verify_question_harmonization(user_dir=None, golden_dir=None):
    """Verify that question harmonization was successful.

    Each ``*.json`` file in both directories is searched for the corrected
    question text and for the old misspelling. The two checks are independent,
    so a file that contains both is reported (and counted) for both.

    Args:
        user_dir: Directory with the user-data files. Defaults to the
            notebook's original hard-coded location.
        golden_dir: Directory with the golden-answers files. Defaults to the
            notebook's original hard-coded location.

    Returns:
        tuple: ``(found_correct, found_incorrect)`` — the number of files
        containing the corrected text and the number still containing the typo.
    """
    if user_dir is None:
        user_dir = "/Users/Martina.Galletti/Downloads/dati-artis1/dati-json"
    if golden_dir is None:
        golden_dir = "/Users/Martina.Galletti/Downloads/dati-artis1/json-with-correct-answers"

    print("\nVERIFYING QUESTION HARMONIZATION")
    print("=" * 40)

    # Check both directories for the corrected question
    target_question = "Cosa fa la penna quando l'autrice cerca di fermarsi?"
    old_question = "Cosa fa la penna quando l'autirce cerca di fermarsi?"

    found_correct = 0
    found_incorrect = 0

    # User data first, then golden answers
    for directory in (user_dir, golden_dir):
        for json_file in Path(directory).glob("*.json"):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                # Search for the question in the serialized file content
                file_content = json.dumps(data, ensure_ascii=False)
            except Exception as e:
                print(f"Error checking {json_file.name}: {e}")
                continue

            if target_question in file_content:
                found_correct += 1
                print(f"✅ Found corrected question in: {json_file.name}")
            # Not `elif`: a file holding both spellings still has the typo.
            if old_question in file_content:
                found_incorrect += 1
                print(f"❌ Found old typo in: {json_file.name}")

    print(f"\nSUMMARY:")
    print(f"Files with correct question text: {found_correct}")
    print(f"Files with old typo: {found_incorrect}")

    if found_incorrect == 0:
        print("🎉 All question typos have been fixed!")
    else:
        print("⚠️  Some files still contain the old typo")

    return found_correct, found_incorrect

# STEP 5: fix the known question typos, then check the files on disk
step_rule = "=" * 60
print("\n" + step_rule)
print("STEP 5: HARMONIZE QUESTION TEXTS")
print(step_rule)

harmonize_question_texts()      # rewrite the question texts
verify_question_harmonization()  # confirm the typo is gone

# Closing banner: three lines, same text as three separate prints
print(
    "\n🎯 COMPLETE HARMONIZATION FINISHED!",
    "Both titles and question texts have been harmonized.",
    "Your analysis should now find the missing question!",
    sep="\n",
)


STEP 5: HARMONIZE QUESTION TEXTS
HARMONIZING QUESTION TEXTS

Processing USER DATA files in: /Users/Martina.Galletti/Downloads/dati-artis1/dati-json

  Processing: risultati-user12.json
    No question corrections needed

  Processing: risultati-user24.json
    No question corrections needed

  Processing: risultati-user1.json
    No question corrections needed

  Processing: risultati-user18.json
    No question corrections needed

  Processing: risultati-user15.json
    No question corrections needed

  Processing: risultati-user9.json
    No question corrections needed

  Processing: risultati-user8.json
    No question corrections needed

  Processing: risultati-user4.json
    No question corrections needed

  Processing: risultati-user10.json
    No question corrections needed

  Processing: risultati-user3.json
    No question corrections needed

  Processing: risultati-user2.json
    No question corrections needed

  Processing: risultati-user11.json
    No question corrections 