In [4]:
# This notebook documents how summary statistics are computed for a dataset read from an S3 bucket.

# The pandas dependency is provided by an AWS-provided Lambda layer.

In [5]:
import io
import json
import math
from urllib.parse import unquote_plus

import boto3
import pandas as pd

# Bucket configuration: datasets are read from one bucket, summaries are written to another.
DATASET_BUCKET = "dataflow-development-bucket"
SUMMARY_STATISTICS_BUCKET = "dataflow-summary-statistics-bucket"

# No access keys are passed here; boto3 resolves credentials from its environment.
# NOTE(review): on Lambda these presumably come from the function's execution role,
# not from being signed in to the management console — confirm the role grants S3 access.
# For local testing, make sure the AWS CLI is configured with access keys first.
s3 = boto3.client("s3")

def _summary_key_for(dataset_key):
    """Return the summary object key ('summary/<name>.json') for a dataset key."""
    # Use the last path segment so keys without a prefix (no '/') or with nested
    # prefixes resolve to the actual file name.
    file_name = dataset_key.rsplit('/', 1)[-1]
    if file_name.endswith('.csv'):
        file_name = file_name[:-len('.csv')]
    return 'summary/' + file_name + '.json'


def _json_safe(value):
    """Convert a numeric scalar to a float, mapping NaN/NA/infinity to None."""
    if pd.isna(value):
        return None
    number = float(value)
    return number if math.isfinite(number) else None


def _summarize_dataframe(df):
    """Build the per-column statistics dictionary for a dataframe.

    Numeric columns get the statistics produced by ``DataFrame.describe``; every
    other column gets average length, word, capital-letter and symbol counts of
    its non-null values rendered as text.
    """
    stats = {}

    # Compute numeric summary statistics first.
    numeric = df.select_dtypes(include=['number'])
    # describe() raises ValueError on a frame without columns (text-only datasets).
    if len(numeric.columns) > 0:
        described = numeric.describe()
        for col in described.columns:
            stats[col] = {name: _json_safe(value) for name, value in described[col].items()}

    for col in df.columns:
        if col in stats:
            continue
        # Non-numeric columns are not always strings (e.g. booleans), and the .str
        # accessor rejects those; measure the textual form of the non-null values.
        text = df[col].dropna().astype(str)
        stats[col] = {
            'avg_length': _json_safe(text.str.len().mean()),  # Average length
            'avg_words': _json_safe(text.str.split(' ').str.len().mean()),  # Average words
            'avg_capitals': _json_safe(text.str.count(pat='[A-Z]').mean()),  # Average capitals
            'avg_symbols': _json_safe(text.str.count(pat=r'[^a-zA-Z0-9\s]').mean())  # Average symbols
        }

    return stats


def lambda_handler(event, context):
    """Compute summary statistics for an uploaded CSV and store them as JSON in S3.

    Args:
        event: S3 notification event; the dataset key is read from
            ``event["Records"][0]['s3']['object']['key']``.
        context: Lambda context object (unused).

    Returns:
        None. The JSON summary is written to ``SUMMARY_STATISTICS_BUCKET`` under
        ``summary/<dataset file name>.json``. Statistics that are undefined
        (NaN, e.g. the standard deviation of a single row) are stored as ``null``
        so the document is valid JSON.

    Raises:
        KeyError, IndexError: if the event does not have the expected S3 shape.
        UnicodeDecodeError: if the object is not UTF-8 encoded.
    """
    # S3 URL-encodes object keys in event notifications (a space arrives as '+'),
    # so decode before using the key against the bucket.
    dataset_key = unquote_plus(event["Records"][0]['s3']['object']['key'])
    summary_key = _summary_key_for(dataset_key)

    # Read the object stream and decode it into text so pandas can parse it as CSV.
    raw_csv = s3.get_object(Bucket=DATASET_BUCKET, Key=dataset_key)['Body'].read().decode('utf-8')
    df = pd.read_csv(io.StringIO(raw_csv))

    # allow_nan=False guards against emitting the non-standard NaN/Infinity tokens.
    json_data = json.dumps(_summarize_dataframe(df), allow_nan=False)

    # Upload to the S3 summary statistics bucket.
    upload_response = s3.put_object(Key=summary_key, Bucket=SUMMARY_STATISTICS_BUCKET, Body=json_data)

    if upload_response['ResponseMetadata']['HTTPStatusCode'] == 200:
        print('Summary statistics successfully computed and uploaded')


In [6]:
# Captured S3 'ObjectCreated:Put' notification, kept as a reference for the shape of the
# event the handler receives when a dataset is uploaded to the bucket.
# NOTE(review): the payload embeds principal IDs and a source IP address — consider
# redacting them before sharing this notebook.
lambda_event = {
    'Records': [
        {
            'eventVersion': '2.1',
            'eventSource': 'aws:s3',
            'awsRegion': 'us-west-1',
            'eventTime': '2023-09-02T03:13:17.071Z',
            'eventName': 'ObjectCreated:Put',
            'userIdentity': {'principalId': 'AWS:AIDAT5UFTAUENG3VC6BG4'},
            'requestParameters': {'sourceIPAddress': '73.241.55.230'},
            'responseElements': {
                'x-amz-request-id': '23M5Q21SR7TSB9M3',
                'x-amz-id-2': 'gQEtKKIEBihslEQIYBhIQ0ASFFErCiI+9LaItdrdUxeDfPYmbdO/eD8N3/z2QM/Dn1rXPsxgznCn5hrmdehUjOhB0zpP9fCG',
            },
            's3': {
                's3SchemaVersion': '1.0',
                'configurationId': 'd0807870-565d-460c-b4f8-b7fd2a6e151a',
                'bucket': {
                    'name': 'dataflow-development-bucket',
                    'ownerIdentity': {'principalId': 'A4GK0NDNF3RJ3'},
                    'arn': 'arn:aws:s3:::dataflow-development-bucket',
                },
                'object': {
                    'key': 'datasets/test.csv',
                    'size': 89,
                    'eTag': '6e1905bb12538fc6d234be99c40e1193',
                    'sequencer': '0064F2A84D0CB3501C',
                },
            },
        }
    ]
}
lambda_event

{'Records': [{'eventVersion': '2.1',
   'eventSource': 'aws:s3',
   'awsRegion': 'us-west-1',
   'eventTime': '2023-09-02T03:13:17.071Z',
   'eventName': 'ObjectCreated:Put',
   'userIdentity': {'principalId': 'AWS:AIDAT5UFTAUENG3VC6BG4'},
   'requestParameters': {'sourceIPAddress': '73.241.55.230'},
   'responseElements': {'x-amz-request-id': '23M5Q21SR7TSB9M3',
    'x-amz-id-2': 'gQEtKKIEBihslEQIYBhIQ0ASFFErCiI+9LaItdrdUxeDfPYmbdO/eD8N3/z2QM/Dn1rXPsxgznCn5hrmdehUjOhB0zpP9fCG'},
   's3': {'s3SchemaVersion': '1.0',
    'configurationId': 'd0807870-565d-460c-b4f8-b7fd2a6e151a',
    'bucket': {'name': 'dataflow-development-bucket',
     'ownerIdentity': {'principalId': 'A4GK0NDNF3RJ3'},
     'arn': 'arn:aws:s3:::dataflow-development-bucket'},
    'object': {'key': 'datasets/test.csv',
     'size': 89,
     'eTag': '6e1905bb12538fc6d234be99c40e1193',
     'sequencer': '0064F2A84D0CB3501C'}}}]}

In [7]:
# Simulate a Lambda trigger by calling the handler directly with the sample event.
# The handler talks to S3 through boto3, so AWS credentials must be available
# (without them the call fails with NoCredentialsError).
lambda_handler(event=lambda_event, context="")

NoCredentialsError: Unable to locate credentials