In [None]:
!pip install --upgrade --quiet sagemaker jsonlines

In [None]:
import boto3
import os, json, pandas as pd
import csv
from sagemaker.s3 import S3Uploader, S3Downloader, s3_path_join
import boto3
import time, sagemaker
from sagemaker.huggingface.model import HuggingFaceModel
# --- SageMaker session, bucket and execution role ---------------------------
# A plain session is created first so that its default bucket can be used as
# a fallback whenever `sagemaker_session_bucket` is left as None.
sess = sagemaker.Session()
sagemaker_session_bucket = "asrelder-data"
bucket_is_unset = sagemaker_session_bucket is None
if bucket_is_unset and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; if get_execution_role() raises ValueError,
# look the role up in IAM by its name instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session so that the chosen bucket becomes its default bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summary of the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# --- S3 locations and audio listing ------------------------------------------
# Source clips, plus the input/output prefixes used for the batch job.
audio_files_s3_path = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"

# Split "s3://<bucket>/<key prefix>" into its bucket and key-prefix parts.
bucket_name, _, prefix = audio_files_s3_path.replace("s3://", "").partition("/")

input_s3_path = s3_path_join("s3://", sagemaker_session_bucket, "whisper_batch/input")
output_s3_path = s3_path_join("s3://", sagemaker_session_bucket, "whisper_batch/output")
s3_client = boto3.client('s3')
print(audio_files_s3_path)
print(bucket_name)
print(prefix)
print(f"Listing audio files from {audio_files_s3_path}...")

# A single list_objects_v2 call returns only one page of keys, so walk every
# page with a paginator and collect the clips into `audio_files`.
# Each entry is a dict whose 'filename' is the basename of the object key;
# 'key' and 's3_uri' identify the object itself.
audio_files = []
paginator = s3_client.get_paginator("list_objects_v2")
# `response` is kept as the loop name so it is still defined after this cell
# (it then holds the last page of the listing).
for response in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
    for obj in response.get("Contents", []):  # "Contents" is absent on empty pages
        key = obj["Key"]
        if not key.endswith(".mp3"):
            continue  # keep only the .mp3 clips
        audio_files.append(
            {
                "filename": os.path.basename(key),
                "key": key,
                "s3_uri": f"s3://{bucket_name}/{key}",
            }
        )

print(f"Found {len(audio_files)} audio files")

In [None]:
# Keep only the audio files whose filename appears in the validation CSV.
#
# Expects `audio_files` to already exist: an iterable of dict-like entries
# that each carry a 'filename' key.

# Load validation CSV
validation_df = pd.read_csv("common_voices_23_train_with_validated_votes.csv")
print(f"Validation CSV has {len(validation_df)} entries")

# Extract filenames from the 'path' column and create a set for fast lookup
valid_filenames = set(validation_df['path'].values)
print(f"Found {len(valid_filenames)} unique filenames in validation CSV")

# Remember the size of the unfiltered list before `audio_files` is replaced.
total_audio_files = len(audio_files)

# Filter audio_files to only include validated files. Set membership keeps
# this a single cheap pass, so no progress bar (and no tqdm import) is needed.
filtered_audio_files = [
    audio for audio in audio_files if audio['filename'] in valid_filenames
]

print(f"\nFiltered: {len(filtered_audio_files)} files matched (out of {total_audio_files} total)")

# Replace audio_files with filtered version
audio_files = filtered_audio_files

In [None]:
# Build a JSON Lines manifest (one {"source-ref": <s3 uri>} object per line)
# from the 'path' column of the validated CSV, then upload it to S3.
CLIPS_PREFIX = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"
manifest_path = "validated_manifest.jsonl"

df = pd.read_csv("common_voices_23_train_with_validated_votes.csv", usecols=["path"])


def _clip_basename(raw_path):
    """Return the bare file name of a CSV path entry, ignoring surrounding whitespace."""
    return os.path.basename(raw_path.strip())


# Reduce every entry to its basename, so values that already contain a
# directory part (e.g. 'clips/...') are handled as well.
names = df["path"].astype(str).map(_clip_basename)

with open(manifest_path, "w") as manifest_file:
    manifest_file.writelines(
        json.dumps({"source-ref": CLIPS_PREFIX + clip_name}) + "\n"
        for clip_name in names
    )

manifest_s3_uri = S3Uploader.upload(manifest_path, "s3://asrelder-data/whisper_batch/input/")
print("Manifest:", manifest_s3_uri, "Lines:", len(names))

In [None]:
# --- Model definition and batch transformer ---------------------------------
# Session and execution role used for this step.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# Environment passed to the model: the Hugging Face model id and task, plus
# the script to run.
env = dict(
    HF_MODEL_ID="openai/whisper-base",
    HF_TASK="automatic-speech-recognition",
    SAGEMAKER_PROGRAM="inference.py",  # <- critical
)

# Timestamp suffix gives every run its own model name.
model_name = f"whisper-asr-{int(time.time())}"

hf_model = HuggingFaceModel(
    name=model_name,
    role=role,
    env=env,
    # custom inference code
    source_dir="Untitled/code",
    entry_point="inference.py",
    # framework / runtime versions
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
)

# Create a new transformer from this model
output_s3_path = "s3://asrelder-data/whisper_batch/output"
batch = hf_model.transformer(
    instance_type="ml.g4dn.xlarge",
    instance_count=4,
    output_path=output_s3_path,
    accept="application/json",
    strategy="SingleRecord",
    # assemble_with="Line",
    max_concurrent_transforms=1,
    max_payload=50,
)
print("Using model:", model_name)
print("Using model:", model_name)

In [None]:
# 1) Build the manifest (prefix + relative filenames)
#
# The manifest is one JSON array: the first element is {"prefix": <s3 uri>},
# and every following element is an object key relative to that prefix.
prefix = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"
bucket = "asrelder-data"
key_prefix = "common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/"

# Create the client here so the cell does not rely on a name from another cell.
s3 = boto3.client("s3")

# list all .mp3 under the prefix (the paginator follows continuation tokens)
files = []
for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=key_prefix):
    for obj in page.get("Contents", []):  # "Contents" is absent on empty pages
        key = obj["Key"]
        if key.endswith(".mp3"):
            # Every listed key starts with key_prefix; slice it off rather than
            # str.replace(), which would also remove later occurrences.
            files.append(key[len(key_prefix):])

manifest = [{"prefix": prefix}]
manifest.extend(files)

# write a one-line JSON array (as AWS examples do)
local_manifest = "manifest.json"
with open(local_manifest, "w") as manifest_file:
    json.dump(manifest, manifest_file)

print(f"Wrote {len(files)} entries to {local_manifest}")

In [None]:
# Upload the local manifest file to the bucket under whisper_batch/input and
# keep the value returned by upload_data for the transform step.
manifest_s3 = sess.upload_data(
    path=local_manifest,
    bucket=bucket,
    key_prefix="whisper_batch/input",
)
print("Manifest at:", manifest_s3)

In [None]:
# single_mp3 = "s3://asrelder-data/common_voice/23/cv-corpus-23.0-2025-09-05/en/clips/common_voice_en_100038.mp3"
# batch.transform(
#     data=single_mp3,
#     data_type="S3Prefix",
#     content_type="audio/mpeg",
#     split_type="None" )
# batch.wait()


# batch.transform(
#     data=manifest_s3_uri,
#     data_type="ManifestFile",
#     content_type="audio/mpeg",
#     split_type="None"
# )

# Start the transform job on the uploaded manifest, then wait on it.
transform_request = dict(
    data=manifest_s3,
    data_type="ManifestFile",    # IMPORTANT
    content_type="audio/mpeg",   # type of the objects
    split_type="None",
)
batch.transform(**transform_request)
batch.wait()
print("Output at:", output_s3_path)