# Data Loader — TORGO Dataset

This notebook loads and inspects the **TORGO** database (speech from people with cerebral palsy or amyotrophic lateral sclerosis). It is used to prepare data for ASR training and evaluation in the ADI/O project.

In [None]:
import argparse
import json
from pathlib import Path
from collections import defaultdict

from datasets import load_dataset, Audio, DatasetDict, ClassLabel

TRAIN_RATIO = 0.8
TEST_RATIO = 0.2

In [31]:
def load_torgo(sampling_rate: int = 16000):
    print("Loading Torgo dataset...")
    dataset = load_dataset("abnerh/TORGO-database")
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
    print(f"Loaded {sum(len(split) for split in dataset.values())} total samples.")

    return dataset

dataset = load_torgo()

Loading Torgo dataset...
Loaded 16552 total samples.


In [32]:
def get_dataset_info(dataset) -> dict:
    info = {
        "total_samples": 0,
        "by_status": defaultdict(int),
        "by_gender": defaultdict(int),
        "by_status_gender": defaultdict(int),
        "durations": defaultdict(list),
    }

    for split_name in dataset:
        subset = dataset[split_name].select_columns(
            ["transcription", "speech_status", "gender", "duration"]
        )
        for sample in subset:
            info["total_samples"] += 1
            status = sample.get("speech_status", "unknown")
            gender = sample.get("gender", "unknown")
            duration = sample.get("duration", 0)

            info["by_status"][status] += 1
            info["by_gender"][gender] += 1
            info["by_status_gender"][f"{status}_{gender}"] += 1
            info["durations"][status].append(duration)

    return info

get_dataset_info(dataset)

{'total_samples': 16552,
 'by_status': defaultdict(int, {'healthy': 10978, 'dysarthria': 5574}),
 'by_gender': defaultdict(int, {'female': 6407, 'male': 10145}),
 'by_status_gender': defaultdict(int,
             {'healthy_female': 4411,
              'healthy_male': 6567,
              'dysarthria_male': 3578,
              'dysarthria_female': 1996}),
 'durations': defaultdict(list,
             {'healthy': [3.3,
               3.45,
               7.2,
               3.6,
               3.45,
               4.5,
               2.4,
               3.6,
               3.0,
               3.9,
               3.3,
               3.3,
               3.45,
               3.0,
               3.6,
               3.15,
               5.25,
               3.3,
               3.45,
               3.3,
               6.75,
               3.3,
               3.3,
               6.45,
               3.45,
               6.0,
               3.3,
               3.6,
               3.0,
            

In [38]:
def create_splits(dataset, seed: int = 42) -> DatasetDict:
    full = dataset["train"] if "train" in dataset else dataset[list(dataset.keys())[0]]

    unique_statuses = sorted(set(full["speech_status"]))
    full = full.cast_column("speech_status", ClassLabel(names=unique_statuses))

    split1 = full.train_test_split(
        test_size=TEST_RATIO,
        seed=seed,
        stratify_by_column="speech_status",
    )

    return DatasetDict({
        "train": split1["train"],
        "test": split1["test"],
    })

dataset_splits = create_splits(dataset)
print(dataset_splits)

Casting the dataset: 100%|██████████| 16552/16552 [00:04<00:00, 3600.37 examples/s]


DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'speech_status', 'gender', 'duration'],
        num_rows: 13240
    })
    val: Dataset({
        features: ['audio', 'transcription', 'speech_status', 'gender', 'duration'],
        num_rows: 1656
    })
    test: Dataset({
        features: ['audio', 'transcription', 'speech_status', 'gender', 'duration'],
        num_rows: 1656
    })
})


In [40]:
def save_dataset(dataset: DatasetDict, output_dir: Path):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    split_stats = {}
    for split_name in dataset:
        subset = dataset[split_name].select_columns(["speech_status"])
        status_counts = defaultdict(int)
        for sample in subset:
            status_counts[sample["speech_status"]] += 1
        split_stats[split_name] = {
            "total": len(dataset[split_name]),
            "by_status": dict(status_counts),
        }

    splits_path = output_dir / "splits.json"
    with open(splits_path, "w") as f:
        json.dump({
            "ratios": {"train": TRAIN_RATIO, "val": VAL_RATIO, "test": TEST_RATIO},
            "stats": split_stats,
        }, f, indent=2)
    print(f"Split info saved to {splits_path}")

    dataset_path = output_dir / "torgo_dataset"
    dataset.save_to_disk(str(dataset_path))
    print(f"Dataset saved to {dataset_path}")

    return split_stats

# Saves to audio/torgo_dataset/ (run notebook from project root)
# Or use "torgo_dataset" to save in current working directory
save_dataset(dataset_splits, Path("torgo_dataset"))

Split info saved to torgo_dataset/splits.json


Saving the dataset (3/3 shards): 100%|██████████| 13240/13240 [00:07<00:00, 1753.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1656/1656 [00:00<00:00, 3738.76 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1656/1656 [00:00<00:00, 4230.91 examples/s]

Dataset saved to torgo_dataset/torgo_dataset





{'train': {'total': 13240, 'by_status': {1: 8782, 0: 4458}},
 'val': {'total': 1656, 'by_status': {0: 558, 1: 1098}},
 'test': {'total': 1656, 'by_status': {1: 1098, 0: 558}}}