# BKB Transcription Audio Path Converter

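This notebook converts absolute audio paths in BKB transcription JSON files to relative paths that start at the `BKB_<N>lists_<M>/` directory. Files that contain such paths are rewritten in place.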

In [None]:
import json
import os
import glob
from pathlib import Path
import re

In [None]:
# Root folder that is searched (recursively) for transcription JSON files.
# It is a relative path, so it is resolved against the current working directory.
base_dir = "BKB_Transcriptions"
print(f"Base directory: {base_dir}")

In [None]:
# Find all transcription JSON files under base_dir (recursively).
# os.walk() yields nothing for a missing directory instead of raising, so a
# wrong base_dir would otherwise just look like "Found 0 JSON files".
if not os.path.isdir(base_dir):
    print(f"Warning: base directory not found: {base_dir}")

json_files = []
for root, dirs, files in os.walk(base_dir):
    json_files.extend(
        os.path.join(root, name) for name in files if name.endswith('.json')
    )

# os.walk() returns entries in filesystem order; sort so that the preview
# below and the later processing order are reproducible between runs.
json_files.sort()

print(f"Found {len(json_files)} JSON files")
# Preview at most the first five paths to keep the cell output short.
for i, file in enumerate(json_files[:5]):
    print(f"{i+1}. {file}")
if len(json_files) > 5:
    print(f"... and {len(json_files) - 5} more files")

In [None]:
# Captures the dataset-relative tail of a path: everything from the
# "BKB_<digits>lists_<digits>/" directory up to the last ".wav".
_RELATIVE_AUDIO_PATH_RE = re.compile(r'(BKB_\d+lists_\d+/.*\.wav)')


def _relativize_field(container, key):
    """Replace container[key] with its relative form; return True if it changed."""
    original_path = container.get(key)
    # Missing keys and non-string values (e.g. null) are left untouched
    # instead of raising inside the regex search.
    if not isinstance(original_path, str):
        return False

    match = _RELATIVE_AUDIO_PATH_RE.search(original_path)
    if match is None:
        return False

    relative_path = match.group(1)
    # An already-relative path matches too; treating it as a change would
    # rewrite the file and count it as modified on every run.
    if relative_path == original_path:
        return False

    container[key] = relative_path
    print(f"  Modified path: {original_path} -> {relative_path}")
    return True


def convert_audio_paths(json_file_path):
    """
    Convert absolute audio paths to relative paths in a JSON file.

    The following fields are inspected: top-level ``audio_path``, top-level
    ``AudioPathName``, and ``AudioPathName`` of every object in a top-level
    ``Items`` list. A value is replaced by the part of it that starts at the
    ``BKB_<N>lists_<M>/`` directory and ends at ``.wav``. Values that do not
    contain such a part, or that are already in that form, are left as is.

    The file is rewritten in place (2-space indented JSON) only when at
    least one value actually changed.

    Args:
        json_file_path: Path of the JSON file to process.

    Returns:
        True if the file was changed and saved. False if nothing needed to
        change, or if the file could not be read, parsed or written (in
        which case an error message is printed).
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"Error processing {json_file_path}: {str(e)}")
        return False

    # Only JSON objects can carry the fields we look for.
    if not isinstance(data, dict):
        return False

    modified = False

    # Top-level single-path layouts.
    for key in ('audio_path', 'AudioPathName'):
        if _relativize_field(data, key):
            modified = True

    # Items array layout: one path per item.
    items = data.get('Items')
    if isinstance(items, list):
        for item in items:
            if isinstance(item, dict) and _relativize_field(item, 'AudioPathName'):
                modified = True

    if not modified:
        return False

    # Serialize before opening the file for writing, so the original is not
    # truncated if serialization were to fail.
    serialized = json.dumps(data, indent=2)
    try:
        with open(json_file_path, 'w', encoding='utf-8') as f:
            f.write(serialized)
    except OSError as e:
        print(f"Error processing {json_file_path}: {str(e)}")
        return False

    return True

In [None]:
# Run the conversion over every discovered JSON file and keep two tallies:
# how many files were visited and how many of them were actually rewritten.
modified_files = processed_files = 0

for json_file in json_files:
    print(f"Processing {json_file}...")
    was_rewritten = convert_audio_paths(json_file)
    modified_files += 1 if was_rewritten else 0
    # Counted only after the conversion call has returned.
    processed_files += 1

# Summary of the whole run.
print(f"\nCompleted processing {processed_files} files")
print(f"Modified {modified_files} files")

## Verification

Let's check the first few JSON files (whether or not they were modified) to verify that their audio paths are now relative:

In [None]:
# Print the audio-path fields of the first few discovered JSON files
# (at most five) so the result of the conversion can be checked by eye.
sample_count = min(5, len(json_files))
sample_paths = json_files[:sample_count]

for i, file in enumerate(sample_paths):
    print(f"\nSample {i+1}: {file}")
    try:
        with open(file, 'r') as f:
            sample_data = json.load(f)

        # Top-level layouts: report whichever of the two keys is present.
        for key in ('audio_path', 'AudioPathName'):
            if key in sample_data:
                print(f"  Audio Path: {sample_data[key]}")

        # Items layout: only the first entry is shown.
        if 'Items' in sample_data and len(sample_data['Items']) > 0:
            first_item = sample_data['Items'][0]
            print(f"  First Item Audio Path: {first_item.get('AudioPathName', 'Not found')}")

    except Exception as e:
        # Any failure while reading or inspecting a sample is reported and
        # the loop moves on to the next file.
        print(f"  Error reading file: {str(e)}")