# Audio Path Prefix Removal

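This notebook removes the 'BKB_' prefix from the audio paths (the `audio_path` and `AudioPathName` fields) stored in all JSON files under the `BKB_Transcriptions` directory. The JSON files are rewritten in place, so back them up before running it.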

In [None]:
import json
import os
import glob
import re

In [None]:
# Locate the BKB Transcriptions folder: first relative to the current
# working directory, then under 'file_storage' as a fallback.
base_dir = 'BKB_Transcriptions'
print(f"Base directory: {base_dir}")

if not os.path.exists(base_dir):
    # Primary location is missing, so switch to the fallback location.
    base_dir = os.path.join('file_storage', 'BKB_Transcriptions')
    print(f"Updated base directory: {base_dir}")

    # Neither location exists: warn, but leave base_dir set to the fallback
    # so the user can see which path was tried last.
    if not os.path.exists(base_dir):
        print("Warning: BKB_Transcriptions directory not found. Please adjust the path.")

In [None]:
# Collect every '.json' file below base_dir (recursively), in os.walk order
json_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(base_dir)
    for name in names
    if name.endswith('.json')
]

# Report the total and preview at most the first five paths
print(f"Found {len(json_files)} JSON files")
for position, path in enumerate(json_files[:5], start=1):
    print(f"{position}. {path}")

remaining = len(json_files) - 5
if remaining > 0:
    print(f"... and {remaining} more files")

In [None]:
def remove_bkb_prefix(audio_path):
    """
    Return audio_path with the text 'BKB_' removed.

    Every occurrence of 'BKB_' is dropped, wherever it appears in the
    string (directory components as well as the file name), not only a
    leading one. A path that does not contain 'BKB_' is returned unchanged.
    """
    # Splitting on the marker and re-joining the pieces deletes each occurrence
    pieces = audio_path.split('BKB_')
    return ''.join(pieces)

In [None]:
def _rewrite_audio_key(record, key, examples, max_examples=None):
    """
    Strip the BKB_ prefix from record[key] in place.

    Args:
        record: Candidate JSON object; anything that is not a dict is ignored.
        key: Name of the field holding the audio path.
        examples: List that receives (original, updated) tuples for changed paths.
        max_examples: If given, stop appending to examples once it holds this
            many entries. The path itself is still updated.

    Returns:
        True if the stored path actually changed, False otherwise.
    """
    if not isinstance(record, dict):
        return False

    original_path = record.get(key)
    # Missing keys and non-string values (null, numbers, ...) are left alone
    if not isinstance(original_path, str):
        return False

    updated_path = remove_bkb_prefix(original_path)
    if updated_path == original_path:
        # Nothing to strip: do not report this as a modification
        return False

    record[key] = updated_path
    if max_examples is None or len(examples) < max_examples:
        examples.append((original_path, updated_path))
    return True


def update_json_file(file_path):
    """
    Update audio paths in a JSON file to remove the BKB_ prefix.

    The following locations are checked, in this order:
      * top-level 'audio_path'
      * top-level 'AudioPathName'
      * 'AudioPathName' of each entry in the top-level 'Items' list
      * 'audio_path' of each entry in the top-level 'words' list

    The file is rewritten (UTF-8, indent=2) only when at least one path
    actually changed, so files without a BKB_ path are neither touched nor
    reported as updated.

    Args:
        file_path: Path of the JSON file to process.

    Returns:
        A tuple (modified, examples). modified is True when the file was
        rewritten. examples is a list of (original_path, updated_path)
        tuples; entries coming from 'Items' / 'words' are only added while
        the list holds fewer than three examples.
        On any error the problem is printed and (False, []) is returned.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform locale
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Only JSON objects can carry the keys we are looking for
        if not isinstance(data, dict):
            return False, []

        modified = False
        examples = []

        # Paths stored directly on the top-level object
        for key in ('audio_path', 'AudioPathName'):
            if _rewrite_audio_key(data, key, examples):
                modified = True

        # Paths stored on the entries of a nested list
        nested_layouts = (('Items', 'AudioPathName'), ('words', 'audio_path'))
        for list_key, path_key in nested_layouts:
            entries = data.get(list_key)
            if not isinstance(entries, list):
                continue
            for entry in entries:
                # Limit examples to avoid clutter
                if _rewrite_audio_key(entry, path_key, examples, max_examples=3):
                    modified = True

        if not modified:
            return False, []

        # Write back the changes
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

        return True, examples

    except Exception as e:
        # Best-effort batch job: report the failing file and carry on with the rest
        print(f"Error processing {file_path}: {str(e)}")
        return False, []

In [None]:
# Run the update over every discovered JSON file and summarise the outcome
updated_files = 0
all_examples = []
total_files = len(json_files)

for count, file_path in enumerate(json_files, start=1):
    modified, examples = update_json_file(file_path)

    if modified:
        updated_files += 1
        # Keep sample updates from at most five files for the final report
        if examples and len(all_examples) < 5:
            all_examples.append((file_path, examples))

    # Progress line after every 100th file and after the very last one
    if count % 100 == 0 or count == total_files:
        print(f"Processed {count}/{total_files} files, updated {updated_files} files")

print(f"\nCompleted processing {total_files} files")
print(f"Updated audio paths in {updated_files} files")

# Show the collected before/after pairs, grouped by file
print("\nSample updates:")
for sample_file, sample_pairs in all_examples:
    print(f"\nFile: {sample_file}")
    for before, after in sample_pairs:
        print(f"  Original: {before}")
        print(f"  Updated:  {after}")