# Audio Path Corrector

This notebook rewrites the absolute audio paths stored in BKB transcription JSON files as relative paths, updating the files in place.

In [None]:
import json
import os
import re
import glob

In [None]:
# Root folder holding the BKB transcription JSON files. Start with the folder
# next to the notebook.
base_dir = 'BKB_Transcriptions'
print(f"Base directory: {base_dir}")

if not os.path.exists(base_dir):
    # Not in the working directory: fall back to the copy under file_storage/.
    base_dir = os.path.join('file_storage', 'BKB_Transcriptions')
    print(f"Updated base directory: {base_dir}")

    # Neither location exists; later cells will simply find no files.
    if not os.path.exists(base_dir):
        print("Warning: BKB_Transcriptions directory not found. Please adjust the path.")

In [None]:
# Recursively collect every *.json file below base_dir, in os.walk order.
json_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(base_dir)
    for name in names
    if name.endswith('.json')
]

# Summarise the result: total count plus a preview of the first five paths.
print(f"Found {len(json_files)} JSON files")
for position, path in enumerate(json_files[:5], start=1):
    print(f"{position}. {path}")

remaining = len(json_files) - 5
if remaining > 0:
    print(f"... and {remaining} more files")

In [None]:
def convert_to_relative_path(audio_path):
    """
    Strip the leading directories from an audio path so it starts at the BKB list folder.

    Three strategies are tried in order:
      1. A regex for ``BKB_<digits>lists_<digits>...`` ending in ``.wav``;
         the matched tail of the path is returned.
      2. Otherwise, the first '/'-separated component that starts with
         ``BKB_`` and contains ``lists_``; the path from that component
         onwards is returned.
      3. Otherwise, only the file name (``os.path.basename``).
    """
    # Strategy 1: the canonical folder name, e.g. BKB_21lists_44100/.../x.wav
    found = re.search(r'BKB_\d+lists_\d+.*?\.wav$', audio_path)
    if found is not None:
        return found.group(0)

    # Strategy 2: looser component-wise scan for a BKB_*lists_* folder
    segments = audio_path.split('/')
    start = next(
        (
            idx
            for idx, segment in enumerate(segments)
            if segment.startswith('BKB_') and 'lists_' in segment
        ),
        None,
    )
    if start is not None:
        return '/'.join(segments[start:])

    # Strategy 3: nothing recognisable, keep just the file name
    return os.path.basename(audio_path)

In [None]:
def update_json_file(file_path):
    """
    Rewrite the audio path fields of one JSON file as relative paths, in place.

    The file is loaded, every recognised path field is passed through
    ``convert_to_relative_path``, and the file is written back (indent=2)
    whenever a recognised field was found.

    Recognised layouts, checked in this order:
      * top-level ``audio_path``      (single path; stops here)
      * top-level ``AudioPathName``   (single path; stops here)
      * ``Items`` list whose entries carry ``AudioPathName``
      * ``words`` list whose entries carry ``audio_path``

    Returns a 3-tuple ``(modified, original, relative)``:
      * ``(True, <original path>, <relative path>)`` for a top-level path
      * ``(True, "Multiple paths updated", "to relative paths")`` when
        entries of ``Items`` and/or ``words`` were rewritten
      * ``(False, "No paths found", "")`` when no recognised field exists
      * ``(False, <error message>, "")`` when loading or processing fails;
        the error is also printed and no exception propagates
    """
    try:
        with open(file_path, 'r') as f:
            data = json.load(f)

        # Single-path layouts. 'audio_path' takes precedence over
        # 'AudioPathName', and either one short-circuits the nested layouts.
        for key in ('audio_path', 'AudioPathName'):
            if key in data:
                original_path = data[key]
                relative_path = convert_to_relative_path(original_path)
                data[key] = relative_path

                # The write-back must happen for both keys: returning
                # modified=True without saving would report an update that
                # never reached the file.
                with open(file_path, 'w') as f:
                    json.dump(data, f, indent=2)

                return True, original_path, relative_path

        # Collection layouts: (name of the list, path key inside each entry).
        modified = False
        nested_layouts = (('Items', 'AudioPathName'), ('words', 'audio_path'))
        for list_key, path_key in nested_layouts:
            if list_key in data and isinstance(data[list_key], list):
                for entry in data[list_key]:
                    if path_key in entry:
                        entry[path_key] = convert_to_relative_path(entry[path_key])
                        modified = True

        # Save once after all nested entries have been rewritten.
        if modified:
            with open(file_path, 'w') as f:
                json.dump(data, f, indent=2)

            return True, "Multiple paths updated", "to relative paths"

        return False, "No paths found", ""

    except Exception as e:
        # Per-file boundary: report the problem and return a failure tuple so
        # that one unreadable or malformed file does not abort a batch run.
        print(f"Error processing {file_path}: {str(e)}")
        return False, str(e), ""

In [None]:
# Run the path correction over every discovered JSON file, keeping a running
# count of changed files and the first few before/after pairs for display.
updated_files = 0
sample_updates = []
total_files = len(json_files)

for processed, file_path in enumerate(json_files, start=1):
    modified, original, relative = update_json_file(file_path)

    if modified:
        updated_files += 1
        # Keep at most five examples for the summary below
        if len(sample_updates) < 5:
            sample_updates.append((file_path, original, relative))

    # Report progress after every 100th file and after the final one
    if processed % 100 == 0 or processed == total_files:
        print(f"Processed {processed}/{total_files} files, updated {updated_files} files")

print(f"\nCompleted processing {total_files} files")
print(f"Updated audio paths in {updated_files} files")

# Show the collected examples
print("\nSample updates:")
for sample_file, before, after in sample_updates:
    print(f"\nFile: {sample_file}")
    print(f"  Original: {before}")
    print(f"  Updated:  {after}")