In [None]:
import boto3
import time


# S3 URI prefix where Athena writes query result files.
# NOTE: despite the variable name, this is a full "s3://bucket/prefix" URI,
# not a bare bucket name.
default_s3_bucket_name = "s3://tim-test-mlops/query_results"

# Resolve the region from the default boto3 configuration, then pin a
# session to that region so every client created from it agrees.
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Athena client used to submit and poll the query below.
athena_client = boto_session.client("athena", region_name=region)

In [15]:
# Aggregate total payments per customer id from the Athena table "2024".
query = 'select id, sum(payment) from "2024" group by id;'

# Submit the query; Athena runs it asynchronously and writes the result
# file under the configured S3 output location.
query_response = athena_client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={"Database": "tim-test-db"},
    ResultConfiguration={
        "OutputLocation": default_s3_bucket_name,
    },
)

print(query)
print(query_response)

# Poll the execution status every 5 seconds until Athena reports a
# terminal state (simple example, no timeout).
query_execution_id = query_response['QueryExecutionId']
while True:
    query_status = athena_client.get_query_execution(QueryExecutionId=query_execution_id)
    status_details = query_status['QueryExecution']['Status']
    query_execution_status = status_details['State']
    if query_execution_status in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
        print(f"Query Execution Status: {query_execution_status}")
        if query_execution_status == 'FAILED':
            # Reuse the response we already have instead of issuing a second
            # API call, and tolerate a missing StateChangeReason field.
            error_message = status_details.get('StateChangeReason', 'no reason reported')
            print(f"Query Failed: {error_message}")
        break
    print("Query is still running...")
    time.sleep(5)

select id, sum(payment) from "2024" group by id;
{'QueryExecutionId': '3545220d-bc6a-4095-9de1-1bbe952b45f2', 'ResponseMetadata': {'RequestId': '9b10d929-4e4f-441a-a1d2-a0c3bbec900a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Mon, 01 Apr 2024 21:19:26 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '59', 'connection': 'keep-alive', 'x-amzn-requestid': '9b10d929-4e4f-441a-a1d2-a0c3bbec900a'}, 'RetryAttempts': 0}}
Query is still running...
Query Execution Status: SUCCEEDED


In [16]:
import pandas as pd

# Where Athena wrote the result file, and where to store a local copy.
S3_BUCKET_NAME = "tim-test-mlops"
S3_OUTPUT_DIRECTORY = "query_results"
temp_file_location: str = "./athena_query_results.csv"

s3_client = boto3.client("s3", region_name=region)

# The result object is keyed as <output directory>/<query execution id>.csv.
result_object_key = f"{S3_OUTPUT_DIRECTORY}/{query_response['QueryExecutionId']}.csv"

# Copy the result file to local disk, then load it into a DataFrame.
s3_client.download_file(
    Bucket=S3_BUCKET_NAME,
    Key=result_object_key,
    Filename=temp_file_location,
)
df = pd.read_csv(temp_file_location)

In [17]:
import sagemaker
from sagemaker.session import Session


# Re-resolve the region and rebuild the boto3 session (same values as the
# first cell; repeated here so this cell does not depend on it).
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

# Control-plane client (feature group management) and runtime client
# (record reads/writes) for SageMaker Feature Store.
sagemaker_client = boto_session.client("sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    "sagemaker-featurestore-runtime",
    region_name=region,
)

# SageMaker SDK session wired to the clients created above.
feature_store_session = Session(
    sagemaker_featurestore_runtime_client=featurestore_runtime,
    sagemaker_client=sagemaker_client,
    boto_session=boto_session,
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [27]:
# Read the online feature from DynamoDB.
dynamodb = boto3.client('dynamodb')

# List the tables visible to this client and report whether '2024' is one of
# them. Any error is printed rather than raised, so the cell keeps running.
# NOTE(review): the existence check looks for '2024' but the read below
# targets 'customer' -- confirm which table the check was meant for.
try:
    response = dynamodb.list_tables()
    table_names = response.get('TableNames', [])
    print("Available tables:", table_names)

    # Check if the specific table exists
    table_found = "2024" in table_names
    print("Table '2024' exists." if table_found else "Table '2024' does not exist.")

except Exception as e:
    print("An error occurred:", str(e))

# Fetch the item whose string key 'id' equals '1' from the 'customer' table.
customer_key = {'id': {'S': '1'}}
record = dynamodb.get_item(TableName="customer", Key=customer_key)

Available tables: ['customer']
Table '2024' does not exist.


In [28]:
# Pull the 'children' attribute out of the DynamoDB item. The value sits
# under the 'N' (number) type tag and is kept as returned by the service.
customer_item = record['Item']
children = customer_item['children']['N']
children

'2'

In [29]:
!pip install pandas



In [30]:
# Offline feature from athena
import pandas as pd
import json
# Load the locally saved Athena result file (relative to the working directory).
churn_csv_path = "athena_query_results.csv"
churn = pd.read_csv(churn_csv_path)

In [31]:
# Take the customer id from the first row of the query results.
# NOTE(review): `id` shadows the Python builtin; the name is kept because a
# later cell reads it.
id_values = churn['id'].values
id = id_values[0]
print(id)

1


In [32]:
# Take the first row's value from the "_col1" column.
# Presumably "_col1" is the unaliased sum(payment) column of the Athena
# query -- verify against the CSV header.
payment_values = churn["_col1"].values
payments = payment_values[0]
payments

2

In [33]:
from sagemaker.session import Session
# Default SageMaker SDK session; used when building the FeatureGroup handle.
sagemaker_session = sagemaker.Session()

# Region-pinned boto3 session (same construction as earlier cells).
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)

In [41]:
# Write to the feature store
from sagemaker.feature_store.feature_group import FeatureGroup

# Handle to the existing 'customer' feature group.
user_feature_group = FeatureGroup(
    sagemaker_session=sagemaker_session,
    name="customer",
)

# Single record combining the online (DynamoDB) and offline (Athena) values,
# stamped with a fixed event time.
feature_columns = ['id', 'children', 'payments', 'time']
feature_row = [str(id), int(children), payments, '2023-07-01T14:59:55.711Z']
data = [feature_row]
df = pd.DataFrame(data, columns=feature_columns)
df

Unnamed: 0,id,children,payments,time
0,1,2,2,2023-07-01T14:59:55.711Z


In [42]:
# List the feature groups visible to this account/region and print their names.
# Note: this rebinds `sagemaker_client` to a client built from the default
# boto3 configuration.
sagemaker_client = boto3.client('sagemaker')
response = sagemaker_client.list_feature_groups()
feature_groups = response['FeatureGroupSummaries']
feature_group_names = [summary['FeatureGroupName'] for summary in feature_groups]
print(feature_group_names)

['customer']


In [43]:
# Ingest the one-row frame built above into the 'customer' feature group,
# using 3 worker threads and blocking until ingestion finishes.
user_feature_group.ingest(
    data_frame=df,
    wait=True,
    max_workers=3,
)

IngestionManagerPandas(feature_group_name='customer', feature_definitions={'children': {'FeatureName': 'children', 'FeatureType': 'String'}, 'payments': {'FeatureName': 'payments', 'FeatureType': 'Integral'}, 'id': {'FeatureName': 'id', 'FeatureType': 'Integral'}, 'time': {'FeatureName': 'time', 'FeatureType': 'String'}}, sagemaker_fs_runtime_client_config=<botocore.config.Config object at 0x7f895942a8f0>, sagemaker_session=<sagemaker.session.Session object at 0x7f8959d62800>, max_workers=3, max_processes=1, profile_name=None, _async_result=<multiprocess.pool.MapResult object at 0x7f8958cb0790>, _processing_pool=<pool ProcessPool(ncpus=1)>, _failed_indices=[])

In [44]:
# Read the record back from the feature store.
# Note: `record` is rebound here; it previously held the DynamoDB response.
record_identifier_value = "1"

record = featurestore_runtime.get_record(
    RecordIdentifierValueAsString=record_identifier_value,
    FeatureGroupName="customer",
)
record

{'ResponseMetadata': {'RequestId': '462f2d3a-b8b1-4d66-b5a3-7eb232196604',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '462f2d3a-b8b1-4d66-b5a3-7eb232196604',
   'content-type': 'application/json',
   'content-length': '330',
   'date': 'Tue, 02 Apr 2024 00:45:10 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'children', 'ValueAsString': '2'},
  {'FeatureName': 'payments', 'ValueAsString': '2'},
  {'FeatureName': 'id', 'ValueAsString': '1'},
  {'FeatureName': 'time', 'ValueAsString': '2023-07-01T14:59:55.711Z'}]}

In [45]:
# Look the feature up by name instead of by position, so the result does not
# depend on the order in which the service returns the features. Position 1
# held 'payments' in the recorded response.
next(
    feature['ValueAsString']
    for feature in record['Record']
    if feature['FeatureName'] == 'payments'
)

'2'