# Install dependencies and prepare environment

In [2]:
!pip install -U sagemaker boto3

Collecting sagemaker
  Downloading sagemaker-2.244.2-py3-none-any.whl.metadata (17 kB)
Collecting boto3
  Downloading boto3-1.38.23-py3-none-any.whl.metadata (6.6 kB)
Collecting graphene<4,>=3 (from sagemaker)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting botocore<1.39.0,>=1.38.23 (from boto3)
  Downloading botocore-1.38.23-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.0-py3-none-any.whl.metadata (1.7 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4,>=3->sagemaker)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4,>=3->sagemaker)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Downloading sagemaker-2.244.2-py3-none-any.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.38.23-py3-none-any.

In [37]:
import boto3
import sagemaker
import time

# Low-level SageMaker client (used for model / endpoint-config / endpoint calls below)
# and a high-level SageMaker session (used for role, region and bucket lookup).
sm = boto3.Session().client("sagemaker")
sess = sagemaker.session.Session()
role = sagemaker.get_execution_role()
# Read the region through the public attribute rather than the private `_region_name`.
region = sess.boto_region_name
bucket = sess.default_bucket()
model_path_prefix = "nvidia-parakeet"

# Look up the SageMaker PyTorch inference Deep Learning Container image for this region.
image_uri = sagemaker.image_uris.retrieve(
    framework="pytorch",
    region=region,
    py_version="py312",
    image_scope="inference",
    version="2.6.0",
    instance_type="ml.g5.2xlarge",
)

print("Sagemaker execution role:", role)
print("Deployment region:", region)
print("Sagemaker image_uri:", image_uri)
print("Model upload S3 path:", f"s3://{bucket}/{model_path_prefix}/")

# Names of the SageMaker resources created (and later deleted) by this notebook.
model_name = "nvidia-parakeet-model"
endpoint_config_name = "nvidia-parakeet-endpoint-config"
endpoint_name = "nvidia-parakeet-endpoint"
print("Sagemaker model name:", model_name)
print("Sagemaker endpoint_config_name name:", endpoint_config_name)
print("Sagemaker endpoint_name name:", endpoint_name)

Sagemaker execution role: arn:aws:iam::930179054915:role/service-role/SageMaker-array-us-west-2
Deployment region: us-west-2
Sagemaker image_uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference:2.6.0-gpu-py312
Model upload S3 path: s3://sagemaker-us-west-2-930179054915/nvidia-parakeet/
Sagemaker model name: nvidia-parakeet-model
Sagemaker endpoint_config_name name: nvidia-parakeet-endpoint-config
Sagemaker endpoint_name name: nvidia-parakeet-endpoint


# Upload model inference code

In [26]:
!rm -rf model.tar.
!tar -zcvf model.tar.gz ./code --exclude='*.ipynb' --exclude='*/.ipynb_checkpoints'

./code/
./code/requirements.txt
./code/inference.py


In [27]:
# Push the packaged inference code to the session bucket under the model prefix
# and keep the returned S3 location for the model definition.
model_data = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=model_path_prefix,
)
print(f"Code tar ball uploaded to ---> {model_data}")

Code tar ball uploaded to ---> s3://sagemaker-us-west-2-930179054915/nvidia-parakeet/model.tar.gz


# Deploy SageMaker resources

In [31]:
def create_model():
    """Create the SageMaker model from the container image and uploaded archive.

    Uses the notebook-level `model_name`, `role`, `image_uri` and `model_data`
    values and prints the raw API response.
    """
    container_spec = {"Image": image_uri, "ModelDataUrl": model_data}
    response = sm.create_model(
        ModelName=model_name,
        ExecutionRoleArn=role,
        Containers=[container_spec],
    )
    print(f"Created model: {response}")

create_model()

Created model: {'ModelArn': 'arn:aws:sagemaker:us-west-2:930179054915:model/nvidia-parakeet-model', 'ResponseMetadata': {'RequestId': 'e5711207-f2c6-42f3-8541-28762a986ed4', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e5711207-f2c6-42f3-8541-28762a986ed4', 'content-type': 'application/x-amz-json-1.1', 'content-length': '83', 'date': 'Mon, 26 May 2025 08:06:53 GMT'}, 'RetryAttempts': 0}}


In [33]:
def create_endpoint_config():
    """Create the endpoint configuration with a single production variant.

    The variant serves `model_name` on one ml.g5.2xlarge instance and prints
    the raw API response.
    """
    variant = {
        "VariantName": "AllTraffic",
        "ModelName": model_name,
        "InstanceType": "ml.g5.2xlarge",
        "InitialInstanceCount": 1,
        "ContainerStartupHealthCheckTimeoutInSeconds": 300,
    }
    response = sm.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[variant],
    )
    print(f"Created Endpoint Config: {response}")

create_endpoint_config()

Created Endpoint Config: {'EndpointConfigArn': 'arn:aws:sagemaker:us-west-2:930179054915:endpoint-config/nvidia-parakeet-endpoint-config', 'ResponseMetadata': {'RequestId': 'fed8ca7f-cd39-4273-b860-9245ff304811', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'fed8ca7f-cd39-4273-b860-9245ff304811', 'content-type': 'application/x-amz-json-1.1', 'content-length': '112', 'date': 'Mon, 26 May 2025 08:07:17 GMT'}, 'RetryAttempts': 0}}


In [35]:
def create_endpoint():
    """Create the endpoint from `endpoint_config_name` and print the API response."""
    response = sm.create_endpoint(
        EndpointName=endpoint_name,
        EndpointConfigName=endpoint_config_name,
    )
    print(f"\nCreated Endpoint: {response}")

create_endpoint()


Created Endpoint: {'EndpointArn': 'arn:aws:sagemaker:us-west-2:930179054915:endpoint/nvidia-parakeet-endpoint', 'ResponseMetadata': {'RequestId': '4d573c7b-d3a5-4289-ad52-b42b2513d3d9', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '4d573c7b-d3a5-4289-ad52-b42b2513d3d9', 'content-type': 'application/x-amz-json-1.1', 'content-length': '92', 'date': 'Mon, 26 May 2025 08:18:28 GMT'}, 'RetryAttempts': 0}}
Waiting for endpoint in service


In [38]:
# Poll the endpoint until it is InService or Failed, but never wait forever:
# give up after MAX_WAIT_SECONDS so a stuck deployment does not hang the notebook.
MAX_WAIT_SECONDS = 30 * 60
POLL_INTERVAL_SECONDS = 30

print("Waiting for endpoint in service")
deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    details = sm.describe_endpoint(EndpointName=endpoint_name)
    status = details["EndpointStatus"]
    if status in ("InService", "Failed"):
        print(f"\nDone! Status: {status}")
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Endpoint {endpoint_name} still in status {status} "
            f"after {MAX_WAIT_SECONDS} seconds"
        )
    print(".", end="", flush=True)
    time.sleep(POLL_INTERVAL_SECONDS)

# Stop here on failure so later cells do not try to invoke a broken endpoint.
if status != "InService":
    reason = details.get("FailureReason", "no failure reason reported")
    raise RuntimeError(f"Endpoint {endpoint_name} deployment failed: {reason}")

Waiting for endpoint in service
..............
Done! Status: InService


# Invoke the SageMaker endpoint

In [39]:
!pip install -U kaldiio

Collecting kaldiio
  Downloading kaldiio-2.18.1-py3-none-any.whl.metadata (13 kB)
Downloading kaldiio-2.18.1-py3-none-any.whl (29 kB)
Installing collected packages: kaldiio
Successfully installed kaldiio-2.18.1


In [50]:
import json
import boto3
import kaldiio
import time
import sys

def prepare_audio_data(audio_path):
    """Read an audio file and return its samples as raw bytes.

    The file is loaded with `kaldiio.load_mat`, whose result is unpacked as
    `(sample_rate, samples)`; the samples are serialized with `tobytes()`.
    The sample rate is only reported in the log line, not returned.
    """
    rate, samples = kaldiio.load_mat(audio_path)
    payload = samples.tobytes()
    print(f"Successfully loaded audio file: {audio_path}, sample rate: {rate}")
    return payload

def invoke_endpoint(audio_bytes, endpoint_name):
    """Send raw audio bytes to a SageMaker endpoint and return the parsed JSON reply.

    Args:
        audio_bytes: Request body, sent as `application/octet-stream`.
        endpoint_name: Name of the endpoint to invoke.

    Returns:
        The response body decoded and parsed with `json.loads`.
    """
    client = boto3.client('sagemaker-runtime')
    print(f"Invoking endpoint: {endpoint_name}")

    # Assemble the request first, then issue the call.
    request = dict(
        EndpointName=endpoint_name,
        ContentType='application/octet-stream',
        Body=audio_bytes,
    )
    raw_response = client.invoke_endpoint(**request)

    # The body is a stream: read it fully, decode, then parse as JSON.
    body_text = raw_response['Body'].read().decode()
    parsed = json.loads(body_text)
    print("Endpoint invocation successful")
    return parsed

def main(audio_file="2086-149220-0033.wav"):
    """Transcribe one audio file through the deployed endpoint and print the result.

    Args:
        audio_file: Path of the audio file to send. Defaults to the sample
            file used in this notebook.

    Exits with status 1 (via `sys.exit`) when the endpoint returns an empty result.
    """
    print(f"Starting to process audio file: {audio_file}")
    audio_bytes = prepare_audio_data(audio_file)

    # Time only the endpoint round trip; perf_counter is intended for measuring
    # short durations and is unaffected by system clock adjustments.
    start_time = time.perf_counter()
    result = invoke_endpoint(audio_bytes, endpoint_name)
    end_time = time.perf_counter()
    time_used = round((end_time - start_time) * 1000)
    print(f"Inference time: {time_used}ms")

    if result:
        print("Model response:")
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print("Invocation failed, no valid response received")
        sys.exit(1)

if __name__ == "__main__":
    main()


Starting to process audio file: 2086-149220-0033.wav
Successfully loaded audio file: 2086-149220-0033.wav, sample rate: 16000
Invoking endpoint: nvidia-parakeet-endpoint
Endpoint invocation successful
Inference time: 134ms
Model response:
{
  "result": "Well, I don't wish to see it any more, observed Phebe, turning away her eyes. It is certainly very like the old portrait.",
  "transcribe_time_ms": 58.773040771484375
}


# Delete SageMaker resources

In [52]:
# Tear down the resources named in the setup cell: the endpoint first,
# then its endpoint configuration, then the model.
sm.delete_endpoint(EndpointName=endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
# Last expression in the cell, so its API response is what gets displayed as output.
sm.delete_model(ModelName=model_name)

{'ResponseMetadata': {'RequestId': '7715751b-7bfd-4cca-b0f6-e55f1312c1e0',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '7715751b-7bfd-4cca-b0f6-e55f1312c1e0',
   'content-type': 'application/x-amz-json-1.1',
   'date': 'Mon, 26 May 2025 08:38:35 GMT',
   'content-length': '0'},
  'RetryAttempts': 0}}