# Audio Path Standardizer

This notebook rewrites every audio path stored in the JSON files under `BKB_Transcriptions` as a bare file name of the form "BKB_EHF<digits>_<digits>.wav" (for example, "BKB_EHF2254_0101.wav").

In [None]:
import json
import os
import re
import glob

In [None]:
# Start from the folder name relative to the current working directory
base_dir = 'BKB_Transcriptions'
print(f"Base directory: {base_dir}")

# When that folder is missing, fall back to the copy under file_storage;
# warn only if the fallback location is missing as well
if not os.path.exists(base_dir):
    base_dir = os.path.join('file_storage', base_dir)
    print(f"Updated base directory: {base_dir}")
    if not os.path.exists(base_dir):
        print("Warning: BKB_Transcriptions directory not found. Please adjust the path.")

In [None]:
# Gather every *.json file anywhere beneath base_dir
json_files = []
for folder, _subfolders, names in os.walk(base_dir):
    json_files.extend(
        os.path.join(folder, name) for name in names if name.endswith('.json')
    )

print(f"Found {len(json_files)} JSON files")

# Preview at most the first five hits, then summarize whatever is left
for position, found_path in enumerate(json_files[:5], start=1):
    print(f"{position}. {found_path}")
remaining = len(json_files) - 5
if remaining > 0:
    print(f"... and {remaining} more files")

In [None]:
def standardize_audio_path(audio_path):
    """
    Standardize audio path to format "BKB_EHF2254_0101.wav"

    Resolution order:
      1. The file name itself contains ``EHF<digits>_<digits>.wav``: that
         part is kept and prefixed with ``BKB_``.
      2. Otherwise the whole path is searched for an ``EHF<digits>`` token
         followed later by four digits directly before ``.wav``; the two
         pieces are combined as ``BKB_EHF<digits>_<dddd>.wav``.
      3. Otherwise the bare file name is returned, with ``BKB_`` prepended
         unless it already starts with it.

    Args:
        audio_path: Path or file name, using ``/`` or ``\\`` separators.

    Returns:
        The standardized file name. Values that are not strings, empty
        strings, and paths without a file-name component are returned
        unchanged because there is nothing to standardize.
    """
    # Nothing to work with: leave missing / empty / non-text values untouched
    # instead of raising or producing the bogus name "BKB_"
    if not isinstance(audio_path, str) or not audio_path:
        return audio_path

    # Take the last path component, splitting on both separator styles so
    # Windows-style paths are handled regardless of the host OS
    filename = re.split(r'[\\/]', audio_path)[-1]

    # Preferred case: the file name already carries the EHF id and number.
    # The captured text always starts with "EHF", so the prefix is always added.
    match = re.search(r'(EHF\d+_\d+\.wav)', filename)
    if match:
        return f"BKB_{match.group(1)}"

    # Otherwise look for the EHF id elsewhere in the path (e.g. a directory
    # name) together with a 4-digit number right before ".wav"
    match = re.search(r'(EHF\d+).*?(\d{4})\.wav', audio_path)
    if match:
        return f"BKB_{match.group(1)}_{match.group(2)}.wav"

    # No recognizable pattern: fall back to the plain file name
    if not filename:
        # Path ended with a separator, so there is no file name to prefix
        return audio_path
    if filename.startswith('BKB_'):
        return filename
    return f"BKB_{filename}"

In [None]:
def _iter_audio_path_slots(data):
    """Yield ``(container, key)`` for every place *data* may store an audio path."""
    if not isinstance(data, dict):
        return

    # Paths stored directly on the top-level object
    for key in ('audio_path', 'AudioPathName'):
        if key in data:
            yield data, key

    # Paths stored on the entries of a nested list
    for list_key, path_key in (('Items', 'AudioPathName'), ('words', 'audio_path')):
        entries = data.get(list_key)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if isinstance(entry, dict) and path_key in entry:
                yield entry, path_key


def update_json_file(file_path):
    """
    Update audio_path in a JSON file to standard BKB format

    The following locations are checked: top-level ``audio_path`` and
    ``AudioPathName``, ``AudioPathName`` on each entry of ``Items``, and
    ``audio_path`` on each entry of ``words``. The file is rewritten
    (2-space indent) only when at least one path actually changes, so
    files that are already standardized are left byte-for-byte untouched.

    Args:
        file_path: Path of the JSON file to update in place.

    Returns:
        A ``(modified, original, new)`` tuple:
          * ``(True, <first path that changed>, <its standardized form>)``
            when the file was rewritten;
          * ``(False, "No paths found", "")`` when no non-empty string path
            was found;
          * ``(False, "Already standardized", "")`` when paths were found but
            none needed changing;
          * ``(False, <error text>, "")`` when the file could not be read,
            parsed or written (the error is also printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        found_any = False
        sample = None  # (original, standardized) of the first changed path

        for container, key in _iter_audio_path_slots(data):
            current = container[key]
            # Skip null / empty / non-text values: nothing to standardize
            if not isinstance(current, str) or not current:
                continue
            found_any = True

            standardized = standardize_audio_path(current)
            if standardized == current:
                continue

            container[key] = standardized
            if sample is None:
                sample = (current, standardized)

        if sample is None:
            reason = "Already standardized" if found_any else "No paths found"
            return False, reason, ""

        # Serialize before opening for writing so a serialization failure
        # cannot leave a truncated file behind
        serialized = json.dumps(data, indent=2)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)

        return True, sample[0], sample[1]

    except (OSError, ValueError, TypeError) as e:
        # Best effort per file: report the problem and let the caller continue
        # (ValueError covers JSON and text-decoding errors)
        print(f"Error processing {file_path}: {str(e)}")
        return False, str(e), ""

In [None]:
# Run the updater over every discovered JSON file
updated_files = 0
sample_updates = []
total_files = len(json_files)

for done, file_path in enumerate(json_files, start=1):
    modified, original, new_path = update_json_file(file_path)
    if modified:
        updated_files += 1
        # Keep only the first five rewrites for the summary printed below
        if len(sample_updates) < 5:
            sample_updates.append((file_path, original, new_path))

    # Report after each batch of 100 files and once more after the last file
    if done % 100 == 0 or done == total_files:
        print(f"Processed {done}/{total_files} files, updated {updated_files} files")

print(f"\nCompleted processing {total_files} files")
print(f"Updated audio paths in {updated_files} files")

# Print the retained examples
print("\nSample updates:")
for sample_file, before, after in sample_updates:
    print(f"\nFile: {sample_file}")
    print(f"  Original: {before}")
    print(f"  Updated:  {after}")

In [None]:
# Print the standardized form of a few representative paths for a visual check
def verify_format():
    """Run standardize_audio_path on sample inputs and print each before/after pair."""
    samples = (
        "21lists_44100/Female/EHF2251/EHF2251_0103.wav",
        "EHF2254_0101.wav",
        "BKB_EHF2254_0101.wav",
        "/Users/suriyakumar/Documents/Brain Games/BGCScience/BKB/BKB_Mono_Full_Raw/BKB_21lists_44100/Female/EHF2251/BKB_EHF2251_0107.wav",
    )

    print("Verification of standardization function:")
    for sample in samples:
        result = standardize_audio_path(sample)
        # One call emits the two labelled lines followed by a blank separator line
        print(f"  Original: {sample}", f"  Standardized: {result}", "", sep="\n")


verify_format()