In [1]:
import time

import boto3

In [3]:
# Shared Athena client used by the helper functions defined further down.
athena = boto3.client(
    service_name='athena',
    region_name='us-east-1',
)

In [4]:
# Name of the S3 bucket used as the output location for Athena query results.
athena_results_bucket = "cookbook-athena-results"

# Create that bucket with the AWS CLI; IPython substitutes the Python
# variable inside the braces before running the shell command.
!aws s3 mb s3://{athena_results_bucket}

make_bucket: cookbook-athena-results


In [5]:
# Inputs for the first Athena run: the database context, the S3 URI that
# receives the result file, and the SQL text itself.
database = "cookbook_athena_db"
results_bucket = f"s3://{athena_results_bucket}"
query = "SELECT label, value FROM cookbook_athena_db.athena_table;"

In [14]:
def execute_athena_query(query, database, results_bucket):
    """Start an Athena query execution and return its ID.

    Uses the notebook-level ``athena`` client. This only starts the
    execution; nothing in this function waits for the query to finish.

    Args:
        query: SQL text sent as ``QueryString``.
        database: Database name placed in ``QueryExecutionContext``.
        results_bucket: Location sent as ``OutputLocation`` inside
            ``ResultConfiguration``.

    Returns:
        The ``QueryExecutionId`` entry of the ``start_query_execution``
        response.
    """
    execution_context = {'Database': database}
    result_configuration = {'OutputLocation': results_bucket}

    started = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext=execution_context,
        ResultConfiguration=result_configuration,
    )
    return started['QueryExecutionId']

In [15]:
def get_output_path(execution_id, client=None, poll_interval=1.0, timeout=300.0):
    """Wait for an Athena query to finish and return its result location.

    The output location is reported by ``get_query_execution`` as soon as
    the query is started, but the object only exists once the query has
    succeeded. Returning the path early makes a following download race
    with Athena, so this function polls the execution status first.

    Args:
        execution_id: The ``QueryExecutionId`` of a started query.
        client: Object exposing ``get_query_execution``; defaults to the
            notebook-level ``athena`` client.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum seconds to wait; ``None`` waits indefinitely.

    Returns:
        The ``OutputLocation`` from the execution's ``ResultConfiguration``.

    Raises:
        RuntimeError: If the query ends in the FAILED or CANCELLED state.
        TimeoutError: If the query is still not finished after ``timeout``
            seconds.
    """
    athena_client = athena if client is None else client
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        query_details = athena_client.get_query_execution(
            QueryExecutionId=execution_id
        )
        execution = query_details['QueryExecution']
        status = execution.get('Status', {})
        state = status.get('State')

        if state == 'SUCCEEDED':
            return execution['ResultConfiguration']['OutputLocation']

        # Terminal failure states: there will never be a result file.
        if state in ('FAILED', 'CANCELLED'):
            reason = status.get('StateChangeReason', 'no reason reported')
            raise RuntimeError(
                f"Athena query {execution_id} ended in state {state}: {reason}"
            )

        # Check the deadline before sleeping so timeout=0 means "check once".
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Athena query {execution_id} did not finish within "
                f"{timeout} seconds (last state: {state})"
            )

        time.sleep(poll_interval)

In [17]:
# Submit the plain SELECT query, then look up the S3 location of its result file.
execution_id = execute_athena_query(query, database, results_bucket)
output_path = get_output_path(execution_id)
# Bare last expression so the notebook displays the path.
output_path

's3://cookbook-athena-results/64957fbb-b873-48ec-91aa-7377343da412.csv'

In [10]:
# Local scratch directory for downloaded results; -p makes this a no-op if it already exists.
!mkdir -p tmp

In [11]:
# Copy the query result object from S3 to tmp/output.csv (IPython substitutes output_path).
!aws s3 cp {output_path} tmp/output.csv

Completed 16.8 KiB/16.8 KiB (360.0 KiB/s) with 1 file(s) remainingdownload: s3://cookbook-athena-results/97d9da6f-6426-46a1-b775-c96580fd29f4.csv to tmp/output.csv


In [12]:
import pandas as pd

# Parse the downloaded CSV; the bare expression makes the notebook render the frame.
pd.read_csv("tmp/output.csv")

Unnamed: 0,label,value
0,GCWKINKQYM,6
1,LTBDZXYYZB,78
2,UIRHTTIVJQ,10
3,QFIKGEMAYH,4
4,OSUQRHPDQX,3
...,...,...
995,QCWDWGUFQL,7
996,SCOWIGYIHW,10
997,ZUFCRCVYMD,4
998,FIWVSQWNJS,9


In [18]:
# Second query: declares detect_anomaly(value INT) -> DOUBLE as an external
# function bound to the SageMaker target 'sagemaker-cookbook-rcf'
# (NOTE(review): per Athena's ML syntax this should be an endpoint name; confirm
# it is deployed), then calls it for every row to add an anomaly_score column.
query = """
USING EXTERNAL FUNCTION detect_anomaly(value INT)
    RETURNS DOUBLE
    SAGEMAKER 'sagemaker-cookbook-rcf'
SELECT label, value, detect_anomaly(value) AS anomaly_score
    FROM cookbook_athena_db.athena_table
"""

In [19]:
# Re-run the same submit-and-locate steps, now with the external-function query;
# this overwrites execution_id and output_path from the earlier run.
execution_id = execute_athena_query(query, database, results_bucket)
output_path = get_output_path(execution_id)
# Bare last expression so the notebook displays the new path.
output_path

's3://cookbook-athena-results/d457328e-b456-4d11-a012-6ea26a22ceb9.csv'

In [20]:
# Download the new result object, overwriting the earlier tmp/output.csv.
!aws s3 cp {output_path} tmp/output.csv

Completed 37.0 KiB/37.0 KiB (777.9 KiB/s) with 1 file(s) remainingdownload: s3://cookbook-athena-results/d457328e-b456-4d11-a012-6ea26a22ceb9.csv to tmp/output.csv


In [22]:
# Load the scored results (label, value, anomaly_score) into a DataFrame for analysis.
df = pd.read_csv("tmp/output.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,label,value,anomaly_score
0,TWQNHWXFHX,3,0.931371
1,DAVLHEUSFA,10,1.311903
2,DGOPPHCDLB,10,1.311903
3,THNNUOYJVZ,6,0.828076
4,FVHAEGAHGQ,10,1.311903
...,...,...,...
995,WRMIRAXDUP,10,1.311903
996,QWNYXWMTNZ,5,0.848408
997,RRBZBPZEOW,3,0.931371
998,WUYBRZQEXF,76,2.603616


In [25]:
# Count the rows whose anomaly score is above 2 by summing the boolean mask.
int((df["anomaly_score"] > 2).sum())

47