# Audio Path Corrector

This notebook rewrites the audio paths stored in JSON files so that each one contains only the BKB filename (for example `BKB_001.wav`) instead of a full path.

In [None]:
import json
import os
import re
import glob

In [None]:
# Default location of the BKB transcriptions, relative to the working directory
base_dir = 'BKB_Transcriptions'
print(f"Base directory: {base_dir}")

# When the default location is missing, fall back to the copy under
# file_storage/ and warn if that one cannot be found either.
if not os.path.exists(base_dir):
    base_dir = os.path.join('file_storage', 'BKB_Transcriptions')
    print(f"Updated base directory: {base_dir}")

    if not os.path.exists(base_dir):
        print("Warning: BKB_Transcriptions directory not found. Please adjust the path.")

In [None]:
# Collect every *.json file below base_dir (searched recursively)
json_files = []
for folder, _subdirs, names in os.walk(base_dir):
    json_files.extend(
        os.path.join(folder, name) for name in names if name.endswith('.json')
    )

# Summarise the result: total count plus a preview of the first five paths
print(f"Found {len(json_files)} JSON files")
for position, path in enumerate(json_files[:5], start=1):
    print(f"{position}. {path}")

remaining = len(json_files) - 5
if remaining > 0:
    print(f"... and {remaining} more files")

In [None]:
def extract_bkb_filename(audio_path):
    """
    Extract just the BKB filename from the audio path.

    Args:
        audio_path: Path as stored in the JSON file. Directories may be
            separated by forward slashes or backslashes, regardless of the
            platform this code runs on.

    Returns:
        The first ``BKB_*.wav`` name found in the path. If there is none,
        the final path component when it starts with ``BKB_``. Otherwise
        ``audio_path`` unchanged. Non-string values (for example ``None``
        from a JSON ``null``) are returned as-is.
    """
    # A JSON null or number cannot contain a filename; hand it back untouched
    # instead of letting re.search raise TypeError.
    if not isinstance(audio_path, str):
        return audio_path

    # Extract the filename that starts with BKB_ and ends in .wav
    match = re.search(r'(BKB_[^/\\]*\.wav)', audio_path)
    if match:
        return match.group(1)

    # No .wav match: fall back to the last path component. Split on both
    # separators explicitly, because os.path.basename does not treat '\' as
    # a separator on POSIX and would return a Windows-style path whole.
    filename = re.split(r'[/\\]', audio_path)[-1]
    if filename.startswith('BKB_'):
        return filename

    # If still no BKB_ filename found, return the original
    return audio_path

In [None]:
def _shorten_path_field(container, key):
    """
    Replace ``container[key]`` with its BKB filename when that changes it.

    Returns an ``(old, new)`` tuple if the value was rewritten. Returns
    ``None`` when ``container`` is not a dict, the key is missing, the value
    is not a string, or the value is already in its short form.
    """
    if not isinstance(container, dict):
        return None

    old_value = container.get(key)
    if not isinstance(old_value, str):
        return None

    new_value = extract_bkb_filename(old_value)
    if new_value == old_value:
        return None

    container[key] = new_value
    return old_value, new_value


def update_json_file(file_path):
    """
    Update audio paths in a JSON file to just include the BKB_ filename.

    The following locations are rewritten when present:
      * top-level ``audio_path`` and ``AudioPathName``
      * ``AudioPathName`` of every entry in the top-level ``Items`` list
      * ``audio_path`` of every entry in the top-level ``words`` list

    The file is only written back when at least one value actually changed,
    so re-running over already corrected files leaves them untouched.

    Args:
        file_path: Path of the JSON file to update in place.

    Returns:
        A ``(modified, original, new)`` tuple. When a change was written,
        ``modified`` is True and ``original``/``new`` hold one sample path
        before and after the rewrite (a top-level field if one changed,
        otherwise the first changed list entry). When nothing was written,
        ``modified`` is False, ``original`` holds a short explanation or the
        error text, and ``new`` is the empty string.
    """
    try:
        # Read as UTF-8 explicitly: the locale default may differ and would
        # mis-decode non-ASCII text, which would then be written back.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Only a JSON object can carry the fields handled below.
        if not isinstance(data, dict):
            return False, "No paths to update", ""

        sample = None

        # Top-level fields. If both changed, the later one is reported.
        for key in ('audio_path', 'AudioPathName'):
            change = _shorten_path_field(data, key)
            if change is not None:
                sample = change

        # Lists of entries that each carry their own path field.
        for list_key, path_key in (('Items', 'AudioPathName'),
                                   ('words', 'audio_path')):
            entries = data.get(list_key)
            if not isinstance(entries, list):
                continue
            for entry in entries:
                change = _shorten_path_field(entry, path_key)
                if change is not None and sample is None:
                    sample = change  # Keep first one for sample

        if sample is None:
            return False, "No paths to update", ""

        # Serialize before opening the file for writing, so the existing
        # content is not truncated unless the new content is ready.
        serialized = json.dumps(data, indent=2)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        original_path, new_path = sample
        return True, original_path, new_path

    except Exception as e:
        # Batch boundary: report the failure and let the caller carry on
        # with the remaining files.
        print(f"Error processing {file_path}: {str(e)}")
        return False, str(e), ""

In [None]:
# Run the corrector over every discovered JSON file
updated_files = 0
sample_updates = []
total_files = len(json_files)

for count, file_path in enumerate(json_files, start=1):
    modified, original, new_path = update_json_file(file_path)
    if modified:
        updated_files += 1
        # Keep only the first five updates for the summary printed below
        if len(sample_updates) < 5:
            sample_updates.append((file_path, original, new_path))

    # Report progress every 100 files and once more after the last file
    if count % 100 == 0 or count == total_files:
        print(f"Processed {count}/{total_files} files, updated {updated_files} files")

print(f"\nCompleted processing {total_files} files")
print(f"Updated audio paths in {updated_files} files")

# Print the collected before/after examples
print("\nSample updates:")
for sample_file, before, after in sample_updates:
    print(f"\nFile: {sample_file}")
    print(f"  Original: {before}")
    print(f"  Updated:  {after}")