## OpenSMILE data preprocessing job
### Bulding ML pIpelines useing sagemaker processing SDK
###  Orchestrating ML preprocessing jobs from sagemaker
### .

In [40]:
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import time
import datetime
import boto3
import sagemaker
from sagemaker import get_execution_role

from sagemaker.processing import ScriptProcessor, ProcessingInput

Populating the interactive namespace from numpy and matplotlib


In [19]:
# Coustomized docker container hosted in aws ecr 
openface_repository_uri = '166087373671.dkr.ecr.ap-south-1.amazonaws.com/vkonda18/opensmile:latest'

In [20]:
#input and output data location
bucket = 'openface-preproces'
input_prefix = 'input_data'
preprocessed_prefix = 'processed_data'

In [21]:
region  = boto3.session.Session().region_name

role = get_execution_role()

cmd = ["python3","-v"]

sklearn_processor = ScriptProcessor( base_job_name='opensmile-preprocessor',
                                     image_uri= openface_repository_uri,
                                     command=cmd,
                                     role=role,
                                     instance_type='ml.m5.large',
                                     max_runtime_in_seconds=600,
                                     instance_count=1,
                                     env={'mode': 'python3'})

In [31]:
# running processing job
sklearn_processor.run(code='preprocess.py',
                     arguments=['s3_input_bucket', bucket,
                               's3_input_key_prefix', input_prefix,
                               's3_output_bucket', bucket,
                               's3_output_key_prefix', preprocessed_prefix],
                    logs=False)


Job Name:  opensmile-preprocessor-2020-02-15-15-48-59-724
Inputs:  [{'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-ap-south-1-166087373671/opensmile-preprocessor-2020-02-15-15-48-59-724/input/code/preprocess.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  []
.....................................!

In [32]:
client = boto3.client('sagemaker')
response = client.describe_processing_job(
    ProcessingJobName='opensmile-preprocessor-2020-02-15-15-48-59-724'
)
response

{'ProcessingInputs': [{'InputName': 'code',
   'S3Input': {'S3Uri': 's3://sagemaker-ap-south-1-166087373671/opensmile-preprocessor-2020-02-15-15-48-59-724/input/code/preprocess.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}}],
 'ProcessingJobName': 'opensmile-preprocessor-2020-02-15-15-48-59-724',
 'ProcessingResources': {'ClusterConfig': {'InstanceCount': 1,
   'InstanceType': 'ml.m5.large',
   'VolumeSizeInGB': 30}},
 'StoppingCondition': {'MaxRuntimeInSeconds': 600},
 'AppSpecification': {'ImageUri': '166087373671.dkr.ecr.ap-south-1.amazonaws.com/vkonda18/opensmile:latest',
  'ContainerEntrypoint': ['python3',
   '-v',
   '/opt/ml/processing/input/code/preprocess.py'],
  'ContainerArguments': ['s3_input_bucket',
   'openface-preproces',
   's3_input_key_prefix',
   'input_data',
   's3_output_bucket',
   'openface-preproces',
   's3_outpu

In [46]:
response['ProcessingEndTime']

datetime.datetime(2020, 2, 15, 15, 52, tzinfo=tzlocal())

In [47]:
response['ProcessingStartTime']

datetime.datetime(2020, 2, 15, 15, 50, 8, tzinfo=tzlocal())

In [39]:
BUCKET_NAME = 'opensmile-preprocess' # replace with your bucket name
KEY = 'processed_data/media-inter.energy.csv' # replace with your object key

s3 = boto3.resource('s3')

s3.Bucket(BUCKET_NAME).download_file(KEY, '/home/ec2-user/SageMaker/media-inter.energy.csv')

In [41]:
opensmile_1 = pd.read_csv('opensmile.energy.csv')
opensmile_2 = pd.read_csv('media-inter.energy.csv')

In [45]:
opensmile_1.count()

frameIndex;frameTime;pcm_LOGenergy    202
dtype: int64

In [44]:
opensmile_2.head(10)

Unnamed: 0,frameIndex;frameTime;pcm_LOGenergy
0,0;0.000000;-1.318377e+01
1,1;0.010000;-1.302121e+01
2,2;0.020000;-1.312088e+01
3,3;0.030000;-1.367092e+01
4,4;0.040000;-1.393220e+01
5,5;0.050000;-1.384230e+01
6,6;0.060000;-1.396289e+01
7,7;0.070000;-1.410195e+01
8,8;0.080000;-1.372311e+01
9,9;0.090000;-1.334264e+01


In [None]:
client = boto3.client('sagemaker')
response = client.stop_processing_job(
    ProcessingJobName='openface-preprocessor-2020-02-14-14-44-35-346')

response

In [None]:
"""SMILExtract 
-C /root/opensmile-2.3.0/config/demo/demo1_energy.conf 
-I /root/opensmile-2.3.0/example-audio/opensmile.wav
-O ./speech01.energy.csv"""

#s3://opensmile-preprocess/input_data/media-interpretation.wav
#s3://opensmile-preprocess/input_data/opensmile.wav
#s3://opensmile-preprocess/input_data/config

In [29]:
%%writefile preprocess.py
#!/usr/bin/env python3

print('Script started processing')
import boto3
import os
import sys
import subprocess

stderr = 0

def main():
    print('Script started processing main')
    
    end_sync = 'aws s3 cp /tmp/processed_data s3://opensmile-preprocess/processed_data --recursive' 
    
    #'aws s3 cp s3://opensmile-preprocess/input_data /tmp/input_data --recursive'

    start_sync = 'aws s3 cp s3://opensmile-preprocess/input_data /tmp/input_data --recursive'
    
    subprocess.check_call(start_sync,shell = True)

    cmd = "{} -C {} -I {} -O {} ".format('SMILExtract','/tmp/input_data/config/demo/demo1_energy.conf' ,
                                         '/tmp/input_data/opensmile.wav','/tmp/processed_data/opensmile.energy.csv')

    #cmd = "SMILExtract -C /root/opensmile-2.3.0/config/demo/demo1_energy.conf -I /root/opensmile-2.3.0/example-audio/opensmile.wav -O /tmp/processed_data/opensmile.energy.csv"
    
    subprocess.check_call(cmd,shell = True)  

    cmd = "{} -C {} -I {} -O {} ".format('SMILExtract','/tmp/input_data/config/demo/demo1_energy.conf' ,
                                         '/tmp/input_data/media-interpretation.wav','/tmp/processed_data/media-inter.energy.csv')

    subprocess.check_call(cmd, shell = True)

    subprocess.check_call(end_sync,shell = True) 
    
    print('Script processing done')
    
if __name__ == "__main__":
    
    main()

Overwriting preprocess.py
