# Remove 'text' key from JSON files in predictions_random

This notebook removes the 'text' key from all JSON files in the predictions_random directory and its subdirectories.

In [48]:
import json
import os
from pathlib import Path

In [49]:
# Define the base directory to search
base_dir = Path('predictions_random')

# Counter for statistics
files_processed = 0
files_modified = 0
errors = []

In [50]:
# Find all JSON files recursively in predictions_random
if base_dir.exists():
    json_files = list(base_dir.rglob('*.json'))
    print(f"Searching in {base_dir}...")
    print(f"Found {len(json_files)} JSON files in predictions_random")
else:
    print(f"Directory {base_dir} does not exist!")
    json_files = []

# Show some example file paths to verify coverage
if json_files:
    print("\nExample file paths found:")
    for i, json_file in enumerate(json_files[:10]):  # Show first 10 examples
        print(f"  {json_file}")

    if len(json_files) > 10:
        print(f"  ... and {len(json_files) - 10} more files")

    # Show directory structure
    directories = set()
    for json_file in json_files:
        directories.add(json_file.parent)

    print(f"\nDirectories containing JSON files: {len(directories)}")
    print("\nDirectory structure (showing first 20):")
    for i, directory in enumerate(sorted(directories)):
        if i >= 20:
            print(f"  ... and {len(directories) - 20} more directories")
            break
        file_count = len([f for f in json_files if f.parent == directory])
        print(f"  {directory}: {file_count} files")

Searching in predictions_random...
Found 660 JSON files in predictions_random

Example file paths found:
  predictions_random/seed_43/tasd/gemma3_27b/1.0/hotels/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.0/rest16/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.0/coursera/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.0/flightabsa/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.1/hotels/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.1/rest16/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.1/coursera/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/1.1/flightabsa/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/0.3/hotels/predictions.json
  predictions_random/seed_43/tasd/gemma3_27b/0.3/rest16/predictions.json
  ... and 650 more files

Directories containing JSON files: 660

Directory structure (showing first 20):
  predictions_random/seed_0/acd/gemma3_27b/0.1/

In [51]:
# Process each JSON file
for json_file in json_files:
    files_processed += 1
    try:
        # Read the JSON file
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # Track if any changes were made
        modified = False
        
        # Handle different JSON structures
        if isinstance(data, list):
            # If it's a list of objects
            for item in data:
                if isinstance(item, dict) and 'text' in item:
                    del item['text']
                    modified = True
        elif isinstance(data, dict):
            # If it's a single object
            if 'text' in data:
                del data['text']
                modified = True
        
        # Write back if modified
        if modified:
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=4, ensure_ascii=False)
            files_modified += 1
            print(f"✓ Modified: {json_file}")
        else:
            print(f"  Skipped (no 'text' key): {json_file}")
            
    except Exception as e:
        error_msg = f"Error processing {json_file}: {str(e)}"
        errors.append(error_msg)
        print(f"✗ {error_msg}")

  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.0/hotels/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.0/rest16/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.0/coursera/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.0/flightabsa/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.1/hotels/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.1/rest16/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.1/coursera/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/1.1/flightabsa/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/0.3/hotels/predictions.json
  Skipped (no 'text' key): predictions_random/seed_43/tasd/gemma3_27b/0.3/rest16/predictions.json
  Skippe

In [52]:
# Print summary
print("\n" + "="*50)
print("SUMMARY")
print("="*50)
print(f"Files processed: {files_processed}")
print(f"Files modified: {files_modified}")
print(f"Errors: {len(errors)}")

if errors:
    print("\nErrors encountered:")
    for error in errors:
        print(f"  - {error}")


SUMMARY
Files processed: 660
Files modified: 0
Errors: 0
