In [1]:
!pip install --upgrade --quiet sagemaker jsonlines

In [2]:
import boto3
import os, json, pandas as pd
import csv
from sagemaker.s3 import S3Uploader, S3Downloader, s3_path_join
import boto3
import time, sagemaker
from sagemaker.huggingface.model import HuggingFaceModel
# A first session is only needed to look up the default bucket when no
# explicit bucket name is configured below.
sess = sagemaker.Session()

# Bucket that holds every input and output of this notebook. Assign None here
# to fall back to the session's default bucket instead.
sagemaker_session_bucket = "asrelder-data"
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. When get_execution_role() raises ValueError,
# look up a role named "sagemaker_execution_role" through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Rebuild the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::867344443757:role/service-role/AmazonSageMaker-ExecutionRole-20250917T175306
sagemaker bucket: asrelder-data
sagemaker session region: us-east-1


In [10]:
# Location of the Common Voice clips, plus the S3 prefixes used for batch
# transform input and output.
audio_files_s3_path = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"
input_s3_path = s3_path_join("s3://", sagemaker_session_bucket, "whisper_batch/input")
output_s3_path = s3_path_join("s3://", sagemaker_session_bucket, "whisper_batch/output")

# Split the URI into its bucket (text before the first "/") and the key prefix
# (everything after it).
s3_location = audio_files_s3_path.replace("s3://", "")
bucket_name, _sep, prefix = s3_location.partition("/")

s3_client = boto3.client("s3")

print(audio_files_s3_path, bucket_name, prefix, sep="\n")
print(f"Listing audio files from {audio_files_s3_path}...")

# NOTE(review): one list_objects_v2 call returns a single page of keys (at most
# 1,000 according to the S3 API), so `response` does not cover the whole
# prefix -- confirm whether later cells need a paginator here.
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/
asrelder-data
common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/
Listing audio files from s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/...


In [None]:
# Manifest code --- Uncomment if we need Manifest


# Load validation CSV
# validation_df = pd.read_csv("common_voices_23_train_with_validated_votes.csv")
# print(f"Validation CSV has {len(validation_df)} entries")

# # Extract filenames from the 'path' column and create a set for fast lookup
# valid_filenames = set(validation_df['path'].values)
# print(f"Found {len(valid_filenames)} unique filenames in validation CSV")

# # Filter audio_files to only include validated files
# filtered_audio_files = []
# for audio in tqdm(audio_files, desc="Filtering files"):
#     if audio['filename'] in valid_filenames:
#         filtered_audio_files.append(audio)

# print(f"\nFiltered: {len(filtered_audio_files)} files matched (out of {len(audio_files)} total)")

# # Replace audio_files with filtered version
# audio_files = filtered_audio_files

In [None]:
# Manifest code --- Uncomment if we need Manifest

# CLIPS_PREFIX = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"
# df = pd.read_csv("common_voices_23_train_with_validated_votes.csv", usecols=["path"])

# names = df["path"].astype(str).apply(lambda p: os.path.basename(p.strip()))

# manifest_path = "validated_manifest.jsonl"
# with open(manifest_path, "w") as f:
#     for n in names:
#         f.write(json.dumps({"source-ref": CLIPS_PREFIX + n}) + "\n")

# manifest_s3_uri = S3Uploader.upload(manifest_path, "s3://asrelder-data/whisper_batch/input/")
# print("Manifest:", manifest_s3_uri, "Lines:", len(names))

In [13]:
# Timestamped name so that every run of this cell registers a distinct model.
model_name = f"whisper-asr-{int(time.time())}"

# Environment handed to the inference container: the Hub model to load, the
# pipeline task, and the custom handler script.
env = {
    "HF_MODEL_ID": "openai/whisper-base",
    "HF_TASK": "automatic-speech-recognition",
    # Flagged as critical by the original author; presumably tells the
    # container which script from source_dir to run -- TODO confirm.
    "SAGEMAKER_PROGRAM": "inference.py",
}

hf_model = HuggingFaceModel(
    name=model_name,
    role=role,
    env=env,
    # Custom inference code shipped with the model.
    source_dir="Untitled/code",
    entry_point="inference.py",
    # Framework versions that select the container image.
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
)

# Create a new transformer from this model; results go under the
# dementiabank output prefix.
output_s3_path = "s3://asrelder-data/whisper_batch/output/dementiabank"
batch = hf_model.transformer(
    instance_type="ml.g4dn.xlarge",
    instance_count=3,
    output_path=output_s3_path,
    accept="application/json",
    strategy="SingleRecord",
    max_concurrent_transforms=1,
    max_payload=50,
    # assemble_with="Line",  # left disabled, as in the original cell
)
print("Using model:", model_name)

Using model: whisper-asr-1760399117


In [None]:
# Build the CV manifest --- Uncomment to make manifest



# prefix = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"
# bucket = "asrelder-data"
# key_prefix = "common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"

# # list all .mp3 under the prefix (paginate-safe)
# files = []
# kwargs = dict(Bucket=bucket, Prefix=key_prefix)
# while True:
#     resp = s3.list_objects_v2(**kwargs)
#     for obj in resp.get("Contents", []):
#         if obj["Key"].endswith(".mp3"):
#             files.append(obj["Key"].replace(key_prefix, ""))
#     if resp.get("IsTruncated"):
#         kwargs["ContinuationToken"] = resp["NextContinuationToken"]
#     else:
#         break

# manifest = [{"prefix": prefix}]
# manifest.extend(files)

# # write a one-line JSON array
# local_manifest = "manifest.json"
# with jsonlines.open(local_manifest, mode="w") as w:
#     w.write(manifest)

In [4]:
# Build the Dementia manifest --- Uncomment to make manifest

# prefix = "s3://asrelder-data/dementiabank/"
# bucket = "asrelder-data"
# key_prefix = "dementiabank/"

# # Make the S3 client
# s3 = boto3.client("s3")

# # List all .mp3/.wav with a paginator
# files = []
# paginator = s3.get_paginator("list_objects_v2")
# for page in paginator.paginate(Bucket=bucket, Prefix=key_prefix):
#     for obj in page.get("Contents", []):
#         key = obj["Key"]
#         if key.endswith((".mp3", ".wav")):  # tuple, not list
#             # strip only the leading prefix
#             files.append(key[len(key_prefix):])

# manifest_as_array = [{"prefix": prefix}, *files]
# with open("dementia_manifest.json", "w") as f:
#     json.dump(manifest_as_array, f)

In [7]:
# Manifest code --- Uncomment to check and upload the manifest

# sess = sagemaker.Session()

# bucket = "asrelder-data"
# local_manifest = "dementia_manifest.json"  

# # sanity check
# if not os.path.exists(local_manifest):
#     raise FileNotFoundError(f"Missing file: {local_manifest}")

# dementia_manifest_s3 = sess.upload_data(
#     path=local_manifest,
#     bucket=bucket,
#     key_prefix="whisper_batch/input" 
# )
# print("Manifest at:", dementia_manifest_s3)

Manifest at: s3://asrelder-data/whisper_batch/input/dementia_manifest.json


In [14]:
# S3 URI of the uploaded dementiabank manifest; this is the "Manifest at:" value
# printed by the (now commented-out) upload cell, hard-coded so that cell does
# not have to be re-run.
dementia_manifest_s3 = "s3://asrelder-data/whisper_batch/input/dementia_manifest.json"

In [15]:
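# The manifest must be a JSON file: a single array whose first element is {"prefix": ...}, followed by the object keys relative to that prefix.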

# Start the batch transform job over the objects listed in the manifest, wait
# for it, then report where the results were written.
batch.transform(
    data=dementia_manifest_s3,
    # `data` points at a manifest file rather than at an S3 prefix.
    data_type="ManifestFile",
    # MIME type of the objects listed in the manifest.
    content_type="audio/mpeg",
    # The string "None" is passed through as the job's split type.
    split_type="None",
)
batch.wait()
print("Output at:", output_s3_path)

INFO:sagemaker:Creating transform job with name: whisper-asr-1760399117-2025-10-13-23-45-24-419


...................................[32mThis is an experimental beta features, which allows downloading model from the Hugging Face Hub on start up. It loads the model defined in the env var `HF_MODEL_ID`[0m
[32mFor more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
[32m#015Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]#015Fetching 13 files:   8%|▊         | 1/13 [00:00<00:01,  7.62it/s]#015Fetching 13 files:  54%|█████▍    | 7/13 [00:01<00:01,  3.50it/s]#015Fetching 13 files: 100%|██████████| 13/13 [00:01<00:00,  6.66it/s][0m
[34mThis is an experimental beta features, which allows downloading model from the Hugging Face Hub on start up. It loads the model defined in the env var `HF_MODEL_ID`[0m
[34mFor more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.
[34m#015Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]#015Fe