# Querying data from the offline store of SageMaker Feature Store and uploading it to Amazon S3

<img align="left" width="130" src="https://raw.githubusercontent.com/PacktPublishing/Amazon-SageMaker-Cookbook/master/Extra/cover-small-padded.png"/>

This notebook contains the code to help readers work through one of the recipes in the book [Machine Learning with Amazon SageMaker Cookbook: 80 proven recipes for data scientists and developers to perform ML experiments and deployments](https://www.amazon.com/Machine-Learning-Amazon-SageMaker-Cookbook/dp/1800567030).

### How to do it...

In [None]:
import boto3
import sagemaker

from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

In [None]:
# One boto3 session built from the default configuration; its resolved region
# is reused for every client created below.
session = boto3.Session()
region = session.region_name

# Control-plane client (feature group management) and the Feature Store
# runtime client, both pinned to the same region.
client = session.client('sagemaker', region_name=region)
runtime = session.client('sagemaker-featurestore-runtime', region_name=region)

# SageMaker SDK session that wraps the boto3 session and both clients, so
# Feature Store calls made through the SDK go through them.
feature_store_session = Session(
    boto_session=session,
    sagemaker_client=client,
    sagemaker_featurestore_runtime_client=runtime,
)

In [None]:
# Restore `feature_group_name` from IPython's %store persistence
# (presumably saved by an earlier recipe's notebook — confirm it has been run).
%store -r feature_group_name

# Handle to the feature group with that name, bound to the SageMaker session
# configured earlier in this notebook.
feature_group = FeatureGroup(
    name=feature_group_name, 
    sagemaker_session=feature_store_session
)

In [None]:
# Display the feature group's description as the cell output.
feature_group.describe()

In [None]:
# Table name reported by the feature group's Athena query helper; it is
# interpolated into the SQL statements further down.
table = feature_group.athena_query().table_name
# Show the table name as the cell output.
table

In [None]:
# Fetch the feature group's description and drill down to the S3 URI
# configured for its offline store.
describe_response = feature_group.describe()
offline_config = describe_response['OfflineStoreConfig']
s3_uri = offline_config['S3StorageConfig']['S3Uri']
# Show the URI as the cell output.
s3_uri

In [None]:
# List every object under the offline store's S3 URI (recursively) with the
# AWS CLI.
!aws s3 ls {s3_uri} --recursive

In [None]:
# Restore the bucket name and key prefix from IPython's %store persistence
# (presumably saved by an earlier recipe's notebook — confirm it has been run).
%store -r s3_bucket_name
%store -r prefix

In [None]:
# S3 location under the restored bucket/prefix where Athena query results
# are written (passed to `query.run` as `output_location`).
base = f's3://{s3_bucket_name}/{prefix}'
output_location = f'{base}/query_results/'

print(output_location)

In [None]:
def query_data(query_string):
    """Run an Athena query for the feature group and return its results.

    The SQL text is echoed first. A new query object is then obtained from the
    notebook-level ``feature_group``, executed with its results written under
    the notebook-level ``output_location``, and waited on until ``wait()``
    returns.

    Args:
        query_string: SQL statement to execute.

    Returns:
        Whatever the query object's ``as_dataframe()`` yields (a pandas
        DataFrame according to the SageMaker SDK).
    """
    print(f"QUERY: {query_string}\n")

    athena = feature_group.athena_query()
    run_kwargs = {
        'query_string': query_string,
        'output_location': output_location,
    }
    athena.run(**run_kwargs)
    athena.wait()

    results_df = athena.as_dataframe()
    return results_df

In [None]:
from time import sleep

# Pause for a fixed 5 minutes (300 seconds) before querying, to give the
# offline store time to be ready.
# NOTE(review): this is an unconditional wait, not a readiness check — the
# cell always blocks for the full 5 minutes, and nothing here verifies that
# the data has actually arrived once it returns.
sleep(5 * 60)

In [None]:
# Training split: the first 600 rows when ordered by ascending `index`.
# The column list puts `approved` first (presumably the label column — confirm).
query = f"""SELECT approved, sex, math, science, technology, random1, random2 FROM "{table}" ORDER BY index ASC LIMIT 600"""

training_df = query_data(query)
# Display the resulting frame as the cell output.
training_df

In [None]:
# Validation split: up to 200 rows with `index` greater than 600, in
# ascending `index` order.
# NOTE(review): the training query bounds its split with LIMIT 600 rather than
# an index predicate, so whether the two splits are contiguous and disjoint
# depends on the actual index values (e.g. a 0-based index would leave the row
# with index 600 in neither split) — confirm against the ingested data.
query = f"""SELECT approved, sex, math, science, technology, random1, random2 FROM "{table}" WHERE index > 600 ORDER BY index ASC LIMIT 200"""

validation_df = query_data(query)
# Display the resulting frame as the cell output.
validation_df

In [None]:
# Test split: up to 200 rows with `index` greater than 800, in ascending
# `index` order.
# NOTE(review): this assumes the validation rows (index > 600, LIMIT 200) end
# at index 800; gaps or repeated index values in the table would make the
# splits overlap or skip rows — confirm against the ingested data.
query = f"""SELECT approved, sex, math, science, technology, random1, random2 FROM "{table}" WHERE index > 800 ORDER BY index ASC LIMIT 200 """

test_df = query_data(query)
# Display the resulting frame as the cell output.
test_df

In [None]:
# Create the local `tmp` directory for the CSV exports; `-p` makes this a
# no-op when the directory already exists.
!mkdir -p tmp

In [None]:
# Write each split to a local CSV that keeps the column-name header row and
# leaves out the DataFrame's row index. Files are written in the order
# training, validation, test.
_headered_outputs = {
    'tmp/training_data.csv': training_df,
    'tmp/validation_data.csv': validation_df,
    'tmp/test_data.csv': test_df,
}
for _csv_path, _split_df in _headered_outputs.items():
    _split_df.to_csv(_csv_path, header=True, index=False)

In [None]:
# S3 destinations (under the restored bucket/prefix, in the `input/` folder)
# for the three CSV files that include a header row.
path = f"s3://{s3_bucket_name}/{prefix}"
training_data_path = f"{path}/input/training_data.csv"
validation_data_path = f"{path}/input/validation_data.csv"
test_data_path = f"{path}/input/test_data.csv"

In [None]:
# Upload the local CSV files (with headers) to their S3 destinations using
# the AWS CLI.
!aws s3 cp tmp/training_data.csv {training_data_path}
!aws s3 cp tmp/validation_data.csv {validation_data_path}
!aws s3 cp tmp/test_data.csv {test_data_path}

In [None]:
# Write each split a second time as a local CSV with neither a header row
# nor the DataFrame's row index. Files are written in the order training,
# validation, test.
_headerless_outputs = {
    'tmp/training_data_no_header.csv': training_df,
    'tmp/validation_data_no_header.csv': validation_df,
    'tmp/test_data_no_header.csv': test_df,
}
for _csv_path, _split_df in _headerless_outputs.items():
    _split_df.to_csv(_csv_path, header=False, index=False)

In [None]:
# S3 destinations for the header-less CSV files; `path` is the
# s3://bucket/prefix base defined in an earlier cell.
training_data_path_nh = f"{path}/input/training_data_no_header.csv"
validation_data_path_nh = f"{path}/input/validation_data_no_header.csv"
test_data_path_nh = f"{path}/input/test_data_no_header.csv"

In [None]:
# Upload the header-less CSV files to their S3 destinations using the AWS CLI.
!aws s3 cp tmp/training_data_no_header.csv {training_data_path_nh}
!aws s3 cp tmp/validation_data_no_header.csv {validation_data_path_nh}
!aws s3 cp tmp/test_data_no_header.csv {test_data_path_nh}

In [None]:
# Persist all six S3 paths with IPython's %store so that other notebooks can
# restore them with `%store -r`.
%store training_data_path
%store validation_data_path
%store test_data_path
%store training_data_path_nh
%store validation_data_path_nh
%store test_data_path_nh