# In [8]:
import boto3, json, csv, gzip, io, sys
from botocore.exceptions import ClientError


# S3 location to scan and the local CSV file the export writes to.
BUCKET  = "asrelder-data"      # bucket name passed to the S3 list/get calls
PREFIX  = "whisper_batch/output/dementiabank/"  # key prefix the listing is restricted to
OUTFILE = "dementia_transcripts.csv"  # local path of the CSV that gets written

# Module-level S3 client shared by the helper functions in this file.
s3 = boto3.client("s3")

def iter_out_keys(bucket, prefix=""):
    """Yield every object key in *bucket* that ends with ``.out``.

    Keys are listed through the ``list_objects_v2`` paginator of the
    module-level ``s3`` client. When *prefix* is empty, no ``Prefix``
    filter is sent with the request.
    """
    list_args = {"Bucket": bucket, "Prefix": prefix} if prefix else {"Bucket": bucket}
    pages = s3.get_paginator("list_objects_v2").paginate(**list_args)
    for page in pages:
        # Treat a missing or empty "Contents" entry as "no objects on this page".
        yield from (
            entry["Key"]
            for entry in page.get("Contents") or ()
            if entry["Key"].endswith(".out")
        )

def read_object_text(bucket, key):
    """Download one S3 object and return its content as text.

    The object is fetched with the module-level ``s3`` client and read fully
    into memory. If the payload starts with the gzip magic bytes it is
    decompressed first. The resulting bytes are decoded as UTF-8 with a
    leading byte-order mark removed; undecodable bytes are replaced rather
    than raising.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object to read.

    Returns:
        The decoded text of the object.

    Raises:
        botocore.exceptions.ClientError: If the ``get_object`` call fails.
        OSError, EOFError: If the payload has the gzip magic bytes but is not
            a valid (or is a truncated) gzip stream.
    """
    resp = s3.get_object(Bucket=bucket, Key=key)
    body = resp["Body"].read()
    if body[:2] == b"\x1f\x8b":  # gzip magic number
        with gzip.GzipFile(fileobj=io.BytesIO(body)) as gz:
            body = gz.read()
    # Decode both paths with "utf-8-sig" so a BOM is dropped for gzipped
    # payloads too; a surviving BOM would make json.loads reject the text.
    return body.decode("utf-8-sig", errors="replace")

def parse_text_field(raw):
    """Extract the ``"text"`` value from a JSON document given as a string.

    Accepted shapes of *raw* (surrounding whitespace and a leading BOM are
    ignored):

    * a JSON object, e.g. ``{"text": "..."}``;
    * a JSON string whose content is itself a JSON object (double-encoded);
    * a JSON object naively wrapped in a pair of ``"`` or ``'`` characters
      without escaping.

    Args:
        raw: The text to parse.

    Returns:
        The value stored under ``"text"`` in the parsed object, or ``""`` when
        the key is missing or *raw* cannot be interpreted as a JSON object.
    """
    # json.loads refuses str input that starts with a BOM, so drop it up front.
    s = raw.strip().lstrip("\ufeff").lstrip()

    candidates = [s]
    # Fallback for an object wrapped in bare quotes with no escaping inside.
    if len(s) >= 2 and s[0] == s[-1] and s[0] in "\"'":
        candidates.append(s[1:-1])

    for candidate in candidates:
        try:
            obj = json.loads(candidate)
            if isinstance(obj, str):
                # Double-encoded payload: the decoded string is the real JSON.
                obj = json.loads(obj)
        except json.JSONDecodeError:
            continue
        if isinstance(obj, dict):
            return obj.get("text", "")
    return ""

# Export one CSV row per ".out" object: the S3 key and the transcript text
# parsed from it. Objects that cannot be downloaded or decompressed are
# reported on stderr and skipped so one bad object does not abort the export.
rows = 0
failed = 0
with open(OUTFILE, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "transcribed_text"])
    for key in iter_out_keys(BUCKET, PREFIX):
        # Guard only the download/decode step so that errors while writing
        # the local CSV still surface instead of being logged per key.
        try:
            raw = read_object_text(BUCKET, key)
        except (ClientError, OSError, EOFError) as e:
            # OSError: payload is not valid gzip; EOFError: truncated gzip stream.
            print(f"[ERROR] {key}: {e}", file=sys.stderr)
            failed += 1
            continue
        writer.writerow([key, parse_text_field(raw)])
        rows += 1

if failed:
    print(f"[WARN] Skipped {failed} object(s) that could not be read", file=sys.stderr)
print(f"Done. Wrote {rows} rows to {OUTFILE}")

# Output: Done. Wrote 112 rows to dementia_transcripts.csv
