# Download Thorsten-Voice TV-44kHz-Full

This notebook downloads the audio files and builds a metadata table with direct download links. Metadata is written in chunks alongside a checkpoint file, so an interrupted run can be resumed safely.

In [4]:
# Optional: uncomment to install the dependencies into the kernel's environment.
# %pip install -q --upgrade datasets huggingface_hub pandas tqdm pyarrow

In [5]:
from pathlib import Path
import json
import time

import pandas as pd
from datasets import Audio, load_dataset
from huggingface_hub import snapshot_download
from tqdm.auto import tqdm

# --- Dataset selection -------------------------------------------------------
REPO_ID = "Thorsten-Voice/TV-44kHz-Full"
CONFIG = "all"  # config name passed to load_dataset; "all" selects the full dataset
SPLIT = "train"

# --- Progress / batching -----------------------------------------------------
# Only used as the progress-bar total. A complete run of this notebook produced
# 39248 rows, so the earlier hint of 39200 made the bar overrun its total.
TOTAL_ROWS_HINT = 39248
CHUNK_SIZE = 1000  # rows per write/checkpoint batch

# --- Output locations (all relative to the current working directory) --------
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data_tv_44khz_full"
AUDIO_DIR = DATA_DIR / "audio"
METADATA_CSV = DATA_DIR / "tv_44khz_full_metadata.csv"
METADATA_PARQUET = DATA_DIR / "tv_44khz_full_metadata.parquet"
CHECKPOINT_PATH = DATA_DIR / "tv_44khz_full_checkpoint.json"

# Base for the per-file download links stored in the metadata table.
HF_BASE_URL = f"https://huggingface.co/datasets/{REPO_ID}/resolve/main"

# Create the output folders up front so later cells can write without checks.
DATA_DIR.mkdir(parents=True, exist_ok=True)
AUDIO_DIR.mkdir(parents=True, exist_ok=True)

print(f"Working dir: {BASE_DIR}")
print(f"Data dir: {DATA_DIR}")

  from .autonotebook import tqdm as notebook_tqdm


Working dir: /Volumes/SSanDisk/SpeechRec-German
Data dir: /Volumes/SSanDisk/SpeechRec-German/data_tv_44khz_full


In [6]:
# Optional: Uncomment and run if the dataset requires authentication.
# from huggingface_hub import login
# login(token="<YOUR_HF_TOKEN>")

In [7]:
def load_checkpoint():
    """Return the last row index recorded in the checkpoint file.

    Reads the JSON document at ``CHECKPOINT_PATH`` and returns its
    ``"last_index"`` entry converted to ``int``.

    Returns:
        The stored index, or ``-1`` when the file does not exist, has no
        ``"last_index"`` entry, or cannot be parsed/converted.
    """
    if not CHECKPOINT_PATH.exists():
        return -1

    with open(CHECKPOINT_PATH, "r", encoding="utf-8") as handle:
        try:
            state = json.load(handle)
            return int(state.get("last_index", -1))
        except Exception:
            # Best effort: an unreadable checkpoint counts as "no checkpoint".
            return -1

def save_checkpoint(last_index: int):
    """Persist the index of the last processed row to ``CHECKPOINT_PATH``.

    The checkpoint is a JSON object with ``"last_index"`` and an ``"updated"``
    local timestamp. It is first written to a sibling ``.tmp`` file and then
    moved over the real checkpoint, so an interruption in the middle of the
    write cannot leave a truncated checkpoint file behind.

    Args:
        last_index: Index of the last row that has been processed.
    """
    payload = json.dumps(
        {"last_index": int(last_index), "updated": time.strftime("%Y-%m-%d %H:%M:%S")},
        ensure_ascii=False,
    )
    tmp_path = CHECKPOINT_PATH.with_name(CHECKPOINT_PATH.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Rename over the destination; the previous checkpoint stays intact until
    # the new one has been written completely.
    tmp_path.replace(CHECKPOINT_PATH)

def append_chunk(chunk_rows, csv_path=None):
    """Append a batch of metadata rows to the metadata CSV.

    The header row is written only when the target file is missing or empty.
    Checking the size as well as existence matters: a zero-byte file would
    otherwise receive data rows without a header, and a later ``read_csv``
    would treat the first data row as the column names.

    Args:
        chunk_rows: List of row dicts to append. An empty (or ``None``) batch
            is a no-op.
        csv_path: Target CSV file. Defaults to ``METADATA_CSV``.
    """
    if not chunk_rows:
        return
    target = METADATA_CSV if csv_path is None else Path(csv_path)
    needs_header = not target.exists() or target.stat().st_size == 0
    pd.DataFrame(chunk_rows).to_csv(target, mode="a", header=needs_header, index=False)

def row_to_meta(idx: int, row: dict):
    """Flatten one dataset row into a metadata record.

    The hub path is read from ``row["audio"]["path"]``. Every occurrence of
    ``"hf://datasets/"`` is removed from it, and a leading ``"<REPO_ID>/"`` is
    dropped when present; the remainder is appended to ``HF_BASE_URL`` for the
    download link and to ``AUDIO_DIR`` for the local path.

    Args:
        idx: Position of the row in the dataset stream.
        row: Dataset row; must contain ``row["audio"]["path"]``. The other
            fields are read with ``row.get`` and become ``None`` when absent.

    Returns:
        A dict with the row index, the descriptive fields, the original hub
        path, the download URL and the local file path.
    """
    hub_path = row["audio"]["path"]

    without_scheme = hub_path.replace("hf://datasets/", "")
    prefix = f"{REPO_ID}/"
    if without_scheme.startswith(prefix):
        relative = without_scheme[len(prefix):]
    else:
        relative = without_scheme

    # (output column, source key) pairs, in output-column order.
    # Only the recording date is renamed on the way through.
    passthrough = (
        ("id", "id"),
        ("subset", "subset"),
        ("style", "style"),
        ("text", "text"),
        ("samplerate", "samplerate"),
        ("durationSeconds", "durationSeconds"),
        ("recording_year_month", "recording_year-month"),
        ("microphone", "microphone"),
        ("language", "language"),
        ("comment", "comment"),
    )

    meta = {"idx": idx}
    for column, source_key in passthrough:
        meta[column] = row.get(source_key)
    meta["audio_hub_path"] = hub_path
    meta["audio_url"] = f"{HF_BASE_URL}/{relative}"
    meta["audio_local_path"] = str(AUDIO_DIR / relative)
    return meta

def iter_dataset(start_from: int = 0):
    """Yield ``(index, row)`` pairs from the streamed dataset.

    The dataset is opened with ``streaming=True`` and an explicit config, and
    the ``audio`` column is cast to ``Audio(decode=False)``. Rows before
    ``start_from`` are still iterated over, but they are not yielded.

    Args:
        start_from: First row index to yield.

    Yields:
        Tuples of the row's position in the stream and the row itself.
    """
    stream = load_dataset(REPO_ID, CONFIG, split=SPLIT, streaming=True)
    stream = stream.cast_column("audio", Audio(decode=False))
    for position, record in enumerate(stream):
        if position >= start_from:
            yield position, record

In [8]:
# Resume point: one past the checkpointed index, or the number of rows already
# present in the CSV, whichever is larger.
start_from_checkpoint = load_checkpoint() + 1

start_from_csv = 0
if METADATA_CSV.exists():
    try:
        start_from_csv = len(pd.read_csv(METADATA_CSV))
    except Exception:
        # Unreadable CSV: rely on the checkpoint alone.
        start_from_csv = 0

start_from = max(start_from_checkpoint, start_from_csv, 0)
print(f"Resuming from index: {start_from}")

buffer = []
last_index = start_from - 1
progress = tqdm(
    iter_dataset(start_from=start_from),
    initial=start_from,
    total=TOTAL_ROWS_HINT,
    desc="Metadata rows",
)

for idx, row in progress:
    buffer.append(row_to_meta(idx, row))
    last_index = idx
    if len(buffer) < CHUNK_SIZE:
        continue
    # Chunk is full: write the rows first, then record the checkpoint.
    append_chunk(buffer)
    save_checkpoint(last_index)
    buffer.clear()

# Flush whatever is left in the final, partial chunk.
append_chunk(buffer)
save_checkpoint(last_index)
print(f"Done. Last index saved: {last_index}")
print(f"Metadata file: {METADATA_CSV}")

Resuming from index: 0


Metadata rows:   0%|          | 0/39200 [00:00<?, ?it/s]Some datasets params were ignored: ['homepage', 'license']. Make sure to use only valid params for the dataset builder and to have a up-to-date version of the `datasets` library.
Metadata rows: 39248it [14:55, 43.81it/s]                           

Done. Last index saved: 39247
Metadata file: /Volumes/SSanDisk/SpeechRec-German/data_tv_44khz_full/tv_44khz_full_metadata.csv





In [9]:
print("Starting audio download (resumable)...")

# Mirror the dataset repository into AUDIO_DIR.
# NOTE(review): newer huggingface_hub releases may warn about or reject
# `local_dir_use_symlinks` / `resume_download` — confirm against the installed version.
download_options = dict(
    repo_id=REPO_ID,
    repo_type="dataset",
    local_dir=AUDIO_DIR,
    local_dir_use_symlinks=False,
    resume_download=True,
    max_workers=8,
    tqdm_class=tqdm,
)
local_snapshot = snapshot_download(**download_options)

print(f"Audio stored under: {local_snapshot}")



Starting audio download (resumable)...


Fetching 29 files: 100%|██████████| 29/29 [06:15<00:00, 12.96s/it]

Audio stored under: /Volumes/SSanDisk/SpeechRec-German/data_tv_44khz_full/audio





In [10]:
# Convert the accumulated CSV metadata into a Parquet file next to it.
if not METADATA_CSV.exists():
    print("Metadata CSV not found yet.")
else:
    df = pd.read_csv(METADATA_CSV)
    df.to_parquet(METADATA_PARQUET, index=False)
    print(f"Parquet saved: {METADATA_PARQUET} ({len(df)} rows)")

Parquet saved: /Volumes/SSanDisk/SpeechRec-German/data_tv_44khz_full/tv_44khz_full_metadata.parquet (39248 rows)
