# Imports

In [1]:
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

# Define paths
# Both paths are relative, so they are resolved against the working
# directory of the running kernel/process.
AUDIO_DIR = Path("../data/audio")
TRANSCRIPT_DIR = Path("../data/transcripts")

# Create transcripts directory if it doesn't exist
# parents=True also creates any missing parent directories; exist_ok=True
# makes re-running this cell a no-op instead of an error.
TRANSCRIPT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
def get_audio_files(
    audio_dir: Optional[Path] = None,
    transcript_dir: Optional[Path] = None,
    pattern: str = "*.m4a",
) -> List[Dict]:
    """
    Get a list of audio files and their corresponding transcript paths.

    Args:
        audio_dir: Directory to scan for audio files. Defaults to the
            module-level AUDIO_DIR.
        transcript_dir: Directory the transcript paths are built in.
            Defaults to the module-level TRANSCRIPT_DIR.
        pattern: Glob pattern selecting the audio files inside audio_dir.

    Returns:
        One dictionary per matching audio file, ordered by sorted path, with
        the keys "audio_file" (Path of the audio file), "transcript_path"
        (Path of the matching ``<stem>.json`` file inside transcript_dir) and
        "base_name" (the audio file name without its extension). The
        transcript file is not checked for existence.
    """
    # Resolve the defaults at call time (not in the signature) so the
    # module-level constants are only required when no directory is passed.
    if audio_dir is None:
        audio_dir = AUDIO_DIR
    if transcript_dir is None:
        transcript_dir = TRANSCRIPT_DIR

    # Accept plain strings as well as Path objects.
    audio_dir = Path(audio_dir)
    transcript_dir = Path(transcript_dir)

    return [
        {
            "audio_file": audio_file,
            "transcript_path": transcript_dir / f"{audio_file.stem}.json",
            "base_name": audio_file.stem,
        }
        for audio_file in sorted(audio_dir.glob(pattern))
    ]


# Build the audio/transcript listing once for the rest of the notebook
audio_files = get_audio_files()

# Show every audio file next to the transcript file name derived from it,
# followed by a 50-character separator line
for file_info in audio_files:
    print(
        f"Audio: {file_info['audio_file'].name}",
        f"Transcript: {file_info['transcript_path'].name}",
        "-" * 50,
        sep="\n",
    )

Audio: Morning thoughts 1.m4a
Transcript: Morning thoughts 1.json
--------------------------------------------------
Audio: Morning thoughts 2.m4a
Transcript: Morning thoughts 2.json
--------------------------------------------------
Audio: Morning thoughts 3.m4a
Transcript: Morning thoughts 3.json
--------------------------------------------------
Audio: Morning thoughts 4.m4a
Transcript: Morning thoughts 4.json
--------------------------------------------------
Audio: Morning thoughts 5.m4a
Transcript: Morning thoughts 5.json
--------------------------------------------------
Audio: Morning thoughts 6.m4a
Transcript: Morning thoughts 6.json
--------------------------------------------------


In [3]:
def read_transcript(transcript_path: Path) -> Dict:
    """
    Read a transcript file and return its contents.

    Args:
        transcript_path: Path of the JSON transcript file.

    Returns:
        The parsed JSON content of the file, or an empty dict if the file
        doesn't exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid
            JSON.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which can mis-decode non-ASCII transcript text.
        with open(transcript_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # Opening directly (instead of checking exists() first) avoids a race
        # with the file disappearing between the check and the open; both
        # errors mean "no such file", which callers receive as an empty dict.
        return {}


def display_transcript_content(transcript_data: Dict):
    """
    Show transcript data as a table, or report that there is none.

    Empty (falsy) input only prints a short notice. Anything else is passed
    to the pandas DataFrame constructor and the resulting frame is rendered
    with display().
    """
    if transcript_data:
        # The table layout is whatever the DataFrame constructor derives from
        # the data as-is; adjust this call to match the real transcript
        # structure (e.g. text and timestamp fields).
        display(pd.DataFrame(transcript_data))
    else:
        print("No transcript data available.")


# Example usage:
# Choose the first transcript to display.
# The guard skips this cell's work when no audio files were found, so the
# [0] lookups below cannot fail on an empty list.
if audio_files:
    # read_transcript returns {} when the transcript file does not exist yet;
    # display_transcript_content then prints its "no data" notice.
    first_transcript = read_transcript(audio_files[0]["transcript_path"])
    print(f"Displaying transcript for: {audio_files[0]['base_name']}")
    display_transcript_content(first_transcript)

Displaying transcript for: Morning thoughts 1
No transcript data available.
