# Imports

In [7]:
import os
import pandas as pd
from pathlib import Path
from typing import List, Dict
from datetime import datetime

# Where the raw recordings live and where their markdown transcripts go
# (both relative to the current working directory).
AUDIO_DIR = Path("..") / "data" / "audio"
TRANSCRIPT_DIR = Path("..") / "data" / "transcripts"

# Ensure the transcript folder exists; a pre-existing folder is fine.
TRANSCRIPT_DIR.mkdir(parents=True, exist_ok=True)

In [8]:
def get_audio_files() -> List[Dict]:
    """
    Pair every ``.m4a`` recording in AUDIO_DIR with its transcript location.

    Returns one dictionary per recording, in sorted path order, with keys:
      - "audio_file": path of the recording
      - "transcript_path": TRANSCRIPT_DIR / "<stem>.md"
      - "base_name": the recording's file name without its extension

    The transcript file is not checked for existence here.
    """
    return [
        {
            "audio_file": recording,
            "transcript_path": TRANSCRIPT_DIR / f"{recording.stem}.md",
            "base_name": recording.stem,
        }
        for recording in sorted(AUDIO_DIR.glob("*.m4a"))
    ]


# Discover the recordings once and keep the result for later use
audio_files = get_audio_files()

# Print each recording next to the transcript file it maps to
for file_info in audio_files:
    print(
        f"Audio: {file_info['audio_file'].name}",
        f"Transcript: {file_info['transcript_path'].name}",
        "-" * 50,
        sep="\n",
    )

Audio: Morning thoughts 1.m4a
Transcript: Morning thoughts 1.md
--------------------------------------------------
Audio: Morning thoughts 2.m4a
Transcript: Morning thoughts 2.md
--------------------------------------------------
Audio: Morning thoughts 3.m4a
Transcript: Morning thoughts 3.md
--------------------------------------------------
Audio: Morning thoughts 4.m4a
Transcript: Morning thoughts 4.md
--------------------------------------------------
Audio: Morning thoughts 5.m4a
Transcript: Morning thoughts 5.md
--------------------------------------------------
Audio: Morning thoughts 6.m4a
Transcript: Morning thoughts 6.md
--------------------------------------------------
Audio: Morning thoughts 7.m4a
Transcript: Morning thoughts 7.md
--------------------------------------------------


In [9]:
def parse_markdown_content(content: str) -> List[Dict]:
    """
    Split markdown text into sections of free text plus metadata.

    Each returned dictionary has two keys:
      - "text": list of stripped plain lines, in order of appearance
      - "metadata": dict of "Key: Value" pairs found in the section, plus a
        "header" entry when the section was opened by a '#' line

    Blank lines are ignored. A line starting with '#' closes the current
    section (if it holds anything) and opens a new one. Any other line that
    contains ': ' is stored as metadata, split at the first ': '.
    """

    def _new_section() -> Dict:
        return {"text": [], "metadata": {}}

    def _is_populated(section: Dict) -> bool:
        return bool(section["text"] or section["metadata"])

    parsed = []
    section = _new_section()

    for raw_line in content.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Nothing but whitespace on this line
            continue

        if raw_line.startswith('#'):
            # A header starts a new section; flush the previous one first
            if _is_populated(section):
                parsed.append(section)
                section = _new_section()
            section["metadata"]["header"] = raw_line.lstrip('#').strip()
        elif ': ' in raw_line:
            # "Key: Value" line; only the first ': ' acts as the separator
            key, _, value = raw_line.partition(': ')
            section["metadata"][key.strip()] = value.strip()
        else:
            section["text"].append(stripped)

    # Keep the trailing section unless it is completely empty
    if _is_populated(section):
        parsed.append(section)

    return parsed

def read_transcript(transcript_path: Path) -> List[Dict]:
    """
    Load a markdown transcript and parse it into structured sections.

    The file is read as UTF-8 and handed to parse_markdown_content.
    When nothing exists at transcript_path, an empty list is returned.
    """
    if not transcript_path.exists():
        return []

    raw_markdown = transcript_path.read_text(encoding='utf-8')
    return parse_markdown_content(raw_markdown)

def display_transcript_content(transcript_data: List[Dict]):
    """
    Print parsed transcript sections in a human-readable layout.

    For each section, the metadata entries (if any) are listed first, then
    the text lines (if any), followed by a dashed divider. When
    transcript_data is empty, a short notice is printed instead.
    """
    if transcript_data:
        print("\n=== Transcript Content ===\n")

        # Divider printed after every section
        divider = "\n" + "-"*50 + "\n"

        for entry in transcript_data:
            # Metadata entries, one per line
            if entry["metadata"]:
                print("Metadata:")
                for name, detail in entry["metadata"].items():
                    print(f"  {name}: {detail}")

            # Body text, one stored line per output line
            if entry["text"]:
                print("\nContent:")
                print("\n".join(entry["text"]))

            print(divider)
    else:
        print("No transcript data available.")

# Example usage:
# Show the transcript belonging to the first discovered recording, if any
if audio_files:
    first_file = audio_files[0]
    first_transcript = read_transcript(first_file["transcript_path"])
    print(f"Displaying transcript for: {first_file['base_name']}")
    display_transcript_content(first_transcript)

Displaying transcript for: Morning thoughts 1

=== Transcript Content ===


Content:
. Okay, well, I'd better get started. This will be my first recording in a series of recordings. that I'm doing a little experiment on, actually. I'm gonna see if I can take all of these recordings and then do some sort of analysis on this. The reason I'm doing it is because I think with large language models, the ability to do such an analysis often a large amount of tax. has become possible in the last year or so., maybe this was not the best part to come down there for so many crows. Okay. And I just think it would be interesting to see what sort of analysis I could make on such a corpus of text. It's a little bit awkward for me. I mean, I don't know exactly what I'm going to say. They're going to be. He's, they're going to be very free form. No particular topic. Just talking about life, the future, whatever I feel like, to be honest. Hopefully the analysis at the end will be able to pass what I wan