In [2]:

import os
import shutil
from snowflake.snowpark import Session


In [265]:

# --- Notebook configuration: fill in the placeholders before running ---
# Prefer loading credentials from environment variables or a secrets manager
# instead of saving them in the notebook.
account='' # YOUR_ACCOUNT
user='' # YOUR_USER
password='' # YOUR_PASSWORD
role='' #YOUR_ROLE
host='' # optional: explicit Snowflake host; leave empty to omit it from the connection

database='' #YOUR_DB
schema='' #YOUR_SCHEMA
stage='data_files' #YOUR_STAGE
stage_files_path='audio2text/container-files' #data path

warehouse='' #YOUR WH
image_registry='TEST_REPOSITORY' #name of the image registry that will be created
image_name='audio2text:01' # name of the image

external_access_integration='all_eai' # EAI that is used to retrieve the model
num_replicas = 1 # service number of replicas
job_name='audio2text_v2' # job name
output_table='audio2text_output_table' #table to write results

compute_pool_name='CP_GPU_01'
# The job specifications in this notebook request `nvidia.com/gpu: 1`, so the
# pool must use a GPU instance family (a CPU family cannot schedule them).
compute_pool_instance_family='GPU_NV_S'
compute_pool_instances=1


connection_parameters = {
    "account": account,
    "user": user,
    "password": password,
    "warehouse": warehouse,
    "database": database,
    "schema": schema,
    "role": role,
    "client_session_keep_alive": True,
}

# `host` was previously referenced without being defined (NameError on a
# fresh kernel). Only pass it through when the user actually provided one.
if host:
    connection_parameters["host"] = host

session = Session.builder.configs(connection_parameters).create()


In [None]:


import tempfile
import os

tempdir = tempfile.TemporaryDirectory()


os.environ['DATA_TARGET_DIR'] = tempdir.name

!wget -P $DATA_TARGET_DIR URL https://us.openslr.org/resources/12/dev-clean.tar.gz

!tar -xzf $DATA_TARGET_DIR/dev-clean.tar.gz -C $DATA_TARGET_DIR


create_data_stage_sql = f"create stage if not exists {stage} ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE') DIRECTORY = (ENABLE = TRUE);"
session.sql(create_data_stage_sql).collect()


session.file.put(f"{tempdir.name}/LibriSpeech/dev-clean/*/*/*", f"@{stage}/LibriSpeech", auto_compress=False, overwrite=True, parallel=99)

tempdir.cleanup()



In [267]:

# Create the image repository if it is not there yet.
create_image_repo_sql = f"CREATE IMAGE REPOSITORY IF NOT EXISTS {image_registry}"
session.sql(create_image_repo_sql).collect()

# Look the repository up again and take the URL from the first matching row.
get_image_repo_sql = f"show image repositories like '{image_registry}';"
matching_repos = session.sql(get_image_repo_sql).collect()
first_repo = matching_repos[0]
repository_url = first_repo['repository_url']
print(repository_url)


preprod9-aivanoutest02.awsuswest2preprod9.registry-dev.snowflakecomputing.com/aivanoudb/public/embeddings_repository


In [None]:

os.environ['SPCS_USERNAME']=user
os.environ['SPCS_PASSWORD']=password
os.environ['SPCS_IMAGE_REPO']=repository_url
os.environ['SPCS_IMAGE_NAME']=image_name

!docker login $SPCS_IMAGE_REPO -u $SPCS_USERNAME -p $SPCS_PASSWORD

!docker build --platform linux/amd64 -t $SPCS_IMAGE_REPO/$SPCS_IMAGE_NAME -f ./Dockerfile.benchmark ./

!docker push $SPCS_IMAGE_REPO/$SPCS_IMAGE_NAME


In [269]:

# Create the compute pool if it does not exist; min and max node counts are
# both set to the configured number of instances.
create_compute_pool_sql = f"""
create compute pool if not exists {compute_pool_name}
  min_nodes={compute_pool_instances}
  max_nodes={compute_pool_instances}
  instance_family={compute_pool_instance_family};
"""

create_pool_result = session.sql(create_compute_pool_sql).collect()
print(create_pool_result)



[Row(status='CP_GPU_01 already exists, statement succeeded.')]


In [334]:

job_name="NEMO_MODEL"

# run canary model
# Drop any previous service with this name so the job can be started again.
print(session.sql(f'DROP SERVICE IF EXISTS {job_name}').collect())

# The output table and the warehouse passed to the container come from the
# configuration cell instead of being hard-coded to one account's values.
create_async_job_sql = f"""
EXECUTE JOB SERVICE
IN COMPUTE POOL {compute_pool_name}
NAME = {job_name}
ASYNC = True
REPLICAS = {num_replicas}
QUERY_WAREHOUSE = {warehouse}
EXTERNAL_ACCESS_INTEGRATIONS = ({external_access_integration})
FROM SPECIFICATION $$
    spec:
      container:
      - name: main
        image: /{database}/{schema}/{image_registry}/{image_name}
        command:
        - python
        args:
        - -u
        - ./main_benchmark.py
        - --output-table={output_table}
        - --model-name=nvidia/canary-1b
        - --model-type=nemo-canary
        - --dataset-type=libri
        - --batch-size=8
        env:
            SNOWFLAKE_QUERY_WAREHOUSE: {warehouse}
        volumeMounts:
          - name: data-files
            mountPath: /data

        resources:
              requests:
                nvidia.com/gpu: 1
                memory: "10Gi"
              limits:
                nvidia.com/gpu: 1
                memory: "10Gi"
      volume:
        - name: data-files
          source: "@{stage}"
          uid: 1000
          gid: 1000

$$
"""

print(session.sql(create_async_job_sql).collect())



[Row(status='AUDIO2TEXT_V2 successfully dropped.')]
[Row(status="Started Snowpark Container Services Job 'AUDIO2TEXT_V2'.")]


In [None]:

job_name="WHISPER_MODEL"
# run whisper model
# Drop any previous service with this name so the job can be started again.
print(session.sql(f'DROP SERVICE IF EXISTS {job_name}').collect())

# The output table and the warehouse passed to the container come from the
# configuration cell instead of being hard-coded to one account's values.
create_async_job_sql = f"""
EXECUTE JOB SERVICE
IN COMPUTE POOL {compute_pool_name}
NAME = {job_name}
ASYNC = True
REPLICAS = {num_replicas}
QUERY_WAREHOUSE = {warehouse}
EXTERNAL_ACCESS_INTEGRATIONS = ({external_access_integration})
FROM SPECIFICATION $$
    spec:
      container:
      - name: main
        image: /{database}/{schema}/{image_registry}/{image_name}
        command:
        - python
        args:
        - -u
        - ./main_benchmark.py
        - --output-table={output_table}
        - --model-name=openai/whisper-tiny.en
        - --model-type=whisper
        - --dataset-type=libri
        - --batch-size=8
        env:
            SNOWFLAKE_QUERY_WAREHOUSE: {warehouse}
        volumeMounts:
          - name: data-files
            mountPath: /data

        resources:
              requests:
                nvidia.com/gpu: 1
                memory: "10Gi"
              limits:
                nvidia.com/gpu: 1
                memory: "10Gi"
      volume:
        - name: data-files
          source: "@{stage}"
          uid: 1000
          gid: 1000

$$
"""

print(session.sql(create_async_job_sql).collect())



In [None]:

# Describe the service named by the most recently assigned `job_name`.
service_description = session.sql(f'DESC SERVICE {job_name}').collect()
print(service_description)



In [None]:

# Fetch the logs of the `main` container (instance 0) for the current job and
# print them one line at a time.
logs = session.sql(f"CALL SYSTEM$GET_SERVICE_LOGS('{job_name}', 0, 'main')").collect()
raw_log_text = logs[0][0]
print(*raw_log_text.split('\n'), sep='\n')


In [None]:

# Event table that stores the service logs.
# NOTE(review): assumed to live in the configured database/schema under the
# name `snowservice_logs` — adjust if your account's event table differs.
event_table = f"{database}.{schema}.snowservice_logs"

# Filter on the job started above rather than a hard-coded, stale service
# name. The name is upper-cased to match the form the original filter used.
sql_logs_query = f"""
select VALUE from {event_table}
where true
and RESOURCE_ATTRIBUTES:"snow.service.name" = '{job_name.upper()}'
and RESOURCE_ATTRIBUTES:"snow.service.container.instance" = '0'
order by timestamp desc
limit 200;
"""

logs = session.sql(sql_logs_query).collect()

logs

