# Processing Job

In this notebook we will see how to run a pre-processing job for the Speech Commands datasets. The processing job takes the data from a specific S3 bucket, applies the specified pre-processing, and then saves the pre-processed data back to the S3 bucket.

In [2]:
import boto3
import sagemaker
from sagemaker import get_execution_role

In [3]:
# AWS region configured for the current boto3 session.
region = boto3.Session().region_name

# IAM execution role that the SageMaker jobs started below will run under.
role = get_execution_role()


# Pre-processing Script

In [5]:
%%writefile utils/preprocessing.py

import argparse
import os
import warnings

import pandas as pd
import pathlib
import numpy as np
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import models
from IPython import display


# Set the seed value for experiment reproducibility.
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
AUTOTUNE = tf.data.AUTOTUNE

# Root directory that holds the input data; every sub-directory is named
# after the spoken command whose audio clips it contains.
input_data_paths = "/opt/ml/processing/input/data"
# The position of a command in this array becomes its integer label id, so
# the listing is sorted to keep the ids stable across runs (directory listing
# order is otherwise filesystem-dependent). `dtype=str` keeps the comparison
# below element-wise even when the listing is empty.
commands = np.array(
    sorted(tf.io.gfile.listdir(str(input_data_paths))), dtype=str)
commands = commands[commands != 'README.md']
print('Commands:', commands)

def decode_audio(audio_binary):
  """Decode the raw bytes of a WAV file into a waveform tensor.

  Args:
    audio_binary: scalar string tensor with the contents of a WAV file.

  Returns:
    The decoded `float32` samples, normalized to the [-1.0, 1.0] range, with
    the trailing `channels` axis removed. The sample rate reported by the
    decoder is discarded.
  """
  decoded = tf.audio.decode_wav(contents=audio_binary)
  # Element 0 is the audio tensor; element 1 (the sample rate) is not needed.
  samples = decoded[0]
  # All the data is single channel (mono), so the last axis has size 1.
  return tf.squeeze(samples, axis=-1)

def get_label(file_path):
  """Return the name of the directory that directly contains `file_path`.

  Args:
    file_path: string tensor holding a path such as `.../<label>/<file>`.

  Returns:
    String tensor with the second-to-last path component.
  """
  path_components = tf.strings.split(file_path, sep=os.path.sep)
  # Index instead of tuple-unpacking so this also works inside a
  # TensorFlow graph.
  parent_dir = path_components[-2]
  return parent_dir

def get_waveform_and_label(file_path):
  """Load one audio file and pair its waveform with its label.

  Args:
    file_path: string tensor with the path of a WAV file.

  Returns:
    Tuple `(waveform, label)`: the decoded waveform from `decode_audio` and
    the label string from `get_label`.
  """
  waveform = decode_audio(tf.io.read_file(file_path))
  return waveform, get_label(file_path)

def get_spectrogram(waveform, input_len=16000, frame_length=255,
                    frame_step=325, fft_length=78):
  """Convert a waveform into a fixed-size magnitude spectrogram.

  The waveform is truncated or zero-padded to exactly `input_len` samples,
  transformed with a short-time Fourier transform, and returned as magnitudes
  with a trailing `channels` axis.

  Args:
    waveform: 1-D tensor of audio samples.
    input_len: number of samples every clip is truncated/padded to.
    frame_length: STFT window length, in samples.
    frame_step: STFT hop between consecutive windows, in samples.
    fft_length: size of the FFT applied to each window.

  Returns:
    `float32` tensor shaped `(frames, frequency_bins, 1)`.

  NOTE(review): with the defaults, `frame_step` exceeds `frame_length` (so
  consecutive windows do not overlap) and `fft_length` is smaller than
  `frame_length`. The values are kept unchanged from the original code;
  confirm they are intentional.
  """
  # Truncate clips longer than `input_len`.
  waveform = waveform[:input_len]
  # Zero-padding for an audio waveform with fewer than `input_len` samples.
  # The same `input_len` is used for both the truncation and the padding so
  # the two can never disagree.
  zero_padding = tf.zeros(
      [input_len] - tf.shape(waveform),
      dtype=tf.float32)
  # Cast the waveform tensors' dtype to float32.
  waveform = tf.cast(waveform, dtype=tf.float32)
  # Concatenate the waveform with `zero_padding`, which ensures all audio
  # clips are of the same length.
  equal_length = tf.concat([waveform, zero_padding], 0)
  # Convert the waveform to a spectrogram via a STFT.
  spectrogram = tf.signal.stft(
      equal_length, frame_length=frame_length, frame_step=frame_step,
      fft_length=fft_length)
  # Obtain the magnitude of the STFT.
  spectrogram = tf.abs(spectrogram)
  # Add a `channels` dimension, so that the spectrogram can be used
  # as image-like input data with convolution layers (which expect
  # shape (`batch_size`, `height`, `width`, `channels`).
  spectrogram = spectrogram[..., tf.newaxis]
  return spectrogram

def get_spectrogram_and_label_id(audio, label):
  """Turn a `(waveform, label)` pair into `(spectrogram, label_id)`.

  Args:
    audio: waveform tensor, passed to `get_spectrogram`.
    label: string tensor with the command name.

  Returns:
    Tuple of the spectrogram and the index of `label` within the module-level
    `commands` array.
  """
  # Boolean mask over `commands`, true where the entry equals `label`.
  # NOTE(review): a label that is absent from `commands` gives an all-False
  # mask, in which case the returned id is not meaningful.
  is_match = label == commands
  return get_spectrogram(audio), tf.argmax(is_match)

def preprocess_dataset(files):
  """Build a dataset of `(spectrogram, label_id)` pairs from file paths.

  Args:
    files: collection of audio file paths accepted by
      `tf.data.Dataset.from_tensor_slices`.

  Returns:
    A `tf.data.Dataset` that loads each file with `get_waveform_and_label`
    and then converts it with `get_spectrogram_and_label_id`.
  """
  return (
      tf.data.Dataset.from_tensor_slices(files)
      .map(get_waveform_and_label, num_parallel_calls=AUTOTUNE)
      .map(get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
  )

def compute_split_sizes(num_samples, train_ratio):
    """Return `(n_train, n_val, n_test)` sizes that add up to `num_samples`.

    The training split gets `int(train_ratio * num_samples)` samples. The
    remainder is divided between validation and test, with the test split
    receiving the extra sample when the remainder is odd, so no sample is
    dropped.

    Args:
        num_samples: total number of samples to split.
        train_ratio: fraction of the samples used for training, in [0, 1].

    Returns:
        Tuple of three non-negative ints.

    Raises:
        ValueError: if `train_ratio` is outside [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(
            "--train-ratio must be between 0 and 1, got {}".format(train_ratio))
    n_train = int(train_ratio * num_samples)
    n_val = (num_samples - n_train) // 2
    n_test = num_samples - n_train - n_val
    return n_train, n_val, n_test


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--train-ratio", type=float, default=0.8)
    args, _ = parser.parse_known_args()

    print("Received arguments {}".format(args))

    # Sort before shuffling: glob order is not guaranteed, and the seeded
    # shuffle is only reproducible when it starts from the same ordering.
    filenames = sorted(tf.io.gfile.glob(str(input_data_paths) + '/*/*'))
    num_samples = len(filenames)
    if num_samples == 0:
        raise RuntimeError(
            "No input files found under {}".format(input_data_paths))
    filenames = tf.random.shuffle(filenames)
    print('Number of total examples:', num_samples)
    print('Example file tensor:', filenames[0])

    # Splitting the data set depending on the training ratio. Explicit
    # start/end offsets are used for every split: a negative-index slice such
    # as `filenames[-n:]` would return the whole dataset when `n` is 0.
    n_train, n_val, n_test = compute_split_sizes(num_samples, args.train_ratio)

    train_files = filenames[:n_train]
    val_files = filenames[n_train:n_train + n_val]
    test_files = filenames[n_train + n_val:]

    print('Training set size', len(train_files))
    print('Validation set size', len(val_files))
    print('Test set size', len(test_files))

    # Converting audio to spectrograms
    train_ds = preprocess_dataset(train_files)
    val_ds = preprocess_dataset(val_files)
    test_ds = preprocess_dataset(test_files)

    train_features_output_path = "/opt/ml/processing/train"
    val_features_output_path = "/opt/ml/processing/val"
    test_features_output_path = "/opt/ml/processing/test"
    labels_output_path = "/opt/ml/processing/commands/commands"

    print("Saving train spectrogram to {}".format(train_features_output_path))
    tf.data.experimental.save(train_ds, train_features_output_path)

    print("Saving val spectrogram to {}".format(val_features_output_path))
    tf.data.experimental.save(val_ds, val_features_output_path)

    print("Saving test spectrogram to {}".format(test_features_output_path))
    tf.data.experimental.save(test_ds, test_features_output_path)

    print("Saving labels to {}".format(labels_output_path))
    # `np.save` does not create missing parent directories.
    os.makedirs(os.path.dirname(labels_output_path), exist_ok=True)
    # `np.save` appends the ".npy" extension to the given path.
    np.save(labels_output_path, commands)

    # Sanity check: read the labels back and compare them element-wise with
    # the in-memory array.
    reloaded_commands = np.load(labels_output_path + ".npy")
    print(commands == reloaded_commands)

    
    
    
    
    




Overwriting utils/preprocessing.py


# Running processing job for mini-data-set
# Before running the job, let's see where the data that will be processed is located

In [4]:
# Count the objects stored under the "yes" and "no" prefixes of the mini
# dataset: list each prefix recursively and keep only the summary line.
! aws s3 ls s3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/mini-speech-commands/yes/ --recursive --summarize | grep "Total Objects:"
! aws s3 ls s3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/mini-speech-commands/no/ --recursive --summarize | grep "Total Objects:"

Total Objects: 1000
Total Objects: 1000


### Now we will run a pre-processing job that applies the previous script to the mini Speech Commands dataset

In [6]:
from sagemaker.processing import ScriptProcessor

# Container image the processing script runs in (TensorFlow 2.6.2, CPU,
# Python 3.8). NOTE: the URI is pinned to the us-east-1 registry.
tensorflow_image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.6.2-cpu-py38-ubuntu20.04"

# Processor that executes a script with `python3` on a single ml.m5.xlarge
# instance, using the execution role obtained earlier in the notebook.
script_processor = ScriptProcessor(
    role=role,
    image_uri=tensorflow_image_uri,
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

In [7]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# S3 location of the mini dataset and the root directory used inside the
# processing container.
mini_dataset_s3_prefix = "s3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/mini-speech-commands"
container_root = "/opt/ml/processing"

# One input per command: each S3 prefix is copied into a same-named
# sub-directory of the container's input folder.
mini_inputs = [
    ProcessingInput(
        source=f"{mini_dataset_s3_prefix}/{command}/",
        destination=f"{container_root}/input/data/{command}/",
    )
    for command in ("yes", "no")
]

# One output per data split, plus one for the saved label names.
mini_outputs = [
    ProcessingOutput(
        output_name=f"{split}_data",
        source=f"{container_root}/{split}",
        destination=f"{mini_dataset_s3_prefix}/pre-processed/{split}/",
    )
    for split in ("train", "val", "test")
]
mini_outputs.append(
    ProcessingOutput(
        output_name="commands",
        source=f"{container_root}/commands",
        destination=f"{mini_dataset_s3_prefix}/pre-processed/commands",
    )
)

script_processor.run(
    code="utils/preprocessing.py",
    inputs=mini_inputs,
    outputs=mini_outputs,
    arguments=["--train-ratio", "0.8"],
)

# Show the description of the most recent job started by this processor.
script_processor_job_description = script_processor.jobs[-1].describe()
print(script_processor_job_description)


Job Name:  tensorflow-training-2022-06-02-22-48-19-611
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/mini-speech-commands/yes/', 'LocalPath': '/opt/ml/processing/input/data/yes/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/mini-speech-commands/no/', 'LocalPath': '/opt/ml/processing/input/data/no/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-062044820001/tensorflow-training-2022-06-02-22-48-19-611/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataD

### Now we will run a pre-processing job that applies the previous script to the original Speech Commands dataset

In [8]:
# Count the objects stored under the "yes" and "no" prefixes of the original
# dataset: list each prefix recursively and keep only the summary line.
! aws s3 ls s3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/speech-commands/yes/ --recursive --summarize | grep "Total Objects:"
! aws s3 ls s3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/speech-commands/no/ --recursive --summarize | grep "Total Objects:"

Total Objects: 4044
Total Objects: 3940


### Now we will run the pre-processing job to apply the previous script to the data in S3

In [9]:
from sagemaker.processing import ScriptProcessor

# Container image the processing script runs in (TensorFlow 2.6.2, CPU,
# Python 3.8). NOTE: the URI is pinned to the us-east-1 registry.
tensorflow_image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/tensorflow-training:2.6.2-cpu-py38-ubuntu20.04"

# Separate processor instance for the original (full-size) dataset, with the
# same single ml.m5.xlarge configuration.
script_processor_original_ds = ScriptProcessor(
    role=role,
    image_uri=tensorflow_image_uri,
    command=["python3"],
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

In [10]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# S3 location of the original dataset and the root directory used inside the
# processing container.
full_dataset_s3_prefix = "s3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/speech-commands"
container_root = "/opt/ml/processing"

# One input per command: each S3 prefix is copied into a same-named
# sub-directory of the container's input folder.
full_inputs = [
    ProcessingInput(
        source=f"{full_dataset_s3_prefix}/{command}/",
        destination=f"{container_root}/input/data/{command}/",
    )
    for command in ("yes", "no")
]

# One output per data split, plus one for the saved label names.
full_outputs = [
    ProcessingOutput(
        output_name=f"{split}_data",
        source=f"{container_root}/{split}",
        destination=f"{full_dataset_s3_prefix}/pre-processed/{split}/",
    )
    for split in ("train", "val", "test")
]
full_outputs.append(
    ProcessingOutput(
        output_name="commands",
        source=f"{container_root}/commands",
        destination=f"{full_dataset_s3_prefix}/pre-processed/commands",
    )
)

script_processor_original_ds.run(
    code="utils/preprocessing.py",
    inputs=full_inputs,
    outputs=full_outputs,
    arguments=["--train-ratio", "0.8"],
)

# Show the description of the most recent job started by this processor.
script_processor_job_description = script_processor_original_ds.jobs[-1].describe()
print(script_processor_job_description)


Job Name:  tensorflow-training-2022-06-02-22-53-36-432
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/speech-commands/yes/', 'LocalPath': '/opt/ml/processing/input/data/yes/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-studio-062044820001-7qbctb3w94p/Datasets/speech-commands/no/', 'LocalPath': '/opt/ml/processing/input/data/no/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-062044820001/tensorflow-training-2022-06-02-22-53-36-432/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributio