# Re-work `preprocess.py`

In [None]:
import boto3
from botocore.exceptions import ClientError

def get_featurestore_params(feature_group_name):
    """Look up where a feature group's offline store is catalogued.

    Args:
        feature_group_name: Name of the SageMaker feature group to describe.

    Returns:
        A ``(database, table_name)`` tuple read from the
        ``OfflineStoreConfig.DataCatalogConfig`` section of the
        ``describe_feature_group`` response.

    Raises:
        Exception: Carrying the service's error message when the
            ``describe_feature_group`` call fails with a ``ClientError``.
    """
    sm = boto3.client('sagemaker')
    try:
        # Describe the group the caller asked for, not a fixed name.
        response = sm.describe_feature_group(
            FeatureGroupName=feature_group_name
        )
    except ClientError as e:
        # Raise instead of printing: returning None here would make the
        # caller's tuple-unpacking fail with an unrelated TypeError.
        raise Exception(e.response['Error']['Message']) from e
    catalog = response['OfflineStoreConfig']['DataCatalogConfig']
    return catalog['Database'], catalog['TableName']

In [None]:
# A module-level client is needed here: the only `sm` created earlier is a
# local variable inside get_featurestore_params.
sm = boto3.client('sagemaker')

# Search for feature groups whose name contains the model name.
response = sm.list_feature_groups(
    NameContains='abalone'
)
summaries = response['FeatureGroupSummaries']
if not summaries:
    # Bind the name even when nothing matches so later cells see None
    # rather than hitting a NameError.
    featuregroup_name = None
    print("Not Found")
else:
    # Use the first match returned by the service.
    featuregroup_name = summaries[0]['FeatureGroupName']

In [None]:
# Display the raw list_feature_groups response for inspection.
response

In [None]:
# Resolve the data catalog database and table name of the feature group found above.
database, table = get_featurestore_params(featuregroup_name)

In [None]:
# Display the data catalog database name returned by get_featurestore_params.
database

In [None]:
import subprocess
import sys

# Install the pinned versions with the pip of the interpreter running this
# notebook, so the packages land in the active environment.
pip_exit_code = subprocess.call(
    [sys.executable, "-m", "pip", "install", "pyarrow==2", "awswrangler==2.7.0"]
)
if pip_exit_code != 0:
    # Report rather than raise: the packages may already be installed, in
    # which case the imports in later cells still succeed.
    print(f"pip install exited with status {pip_exit_code}")

In [None]:
%%time
import awswrangler as wr

table_cols = [
    'rings',
    'length',
    'diameter',
    'height',
    'whole_weight',
    'shucked_weight',
    'viscera_weight',
    'shell_weight',
    'sex_F',
    'sex_I',
    'sex_M'
]
query_string = f'SELECT {",".join(header)} FROM "{table}"'
#wr.athena.read_sql_query(f'SELECT "{",".join(header)}" FROM "{table}"', database=database, ctas_approach=False)
featurestore_df = wr.athena.read_sql_query(query_string, database=database, ctas_approach=False)

In [None]:
# Display the DataFrame returned by the Athena query.
featurestore_df

In [1]:
import argparse
import os
import requests
import tempfile
import numpy as np
import pandas as pd
import boto3
from botocore.exceptions import ClientError
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Module-level SageMaker client shared by confirm_featurestore and
# get_featurestore_params below.
sm = boto3.client('sagemaker')

# The input CSV has no header row, so the column layout is declared here.
label_column = 'rings'
feature_columns_names = [
    'sex', 'length', 'diameter', 'height',
    'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight',
]

# 'sex' is read as a string; every other feature column is read as float64.
feature_columns_dtype = {
    name: (str if name == 'sex' else np.float64)
    for name in feature_columns_names
}
label_column_dtype = {label_column: np.float64}


def confirm_featurestore(model_name):
    """Return the name of the first feature group matching ``model_name``.

    Asks SageMaker for feature groups whose name contains ``model_name`` and
    returns the ``FeatureGroupName`` of the first summary in the response,
    or ``None`` when the response lists no feature groups.
    """
    summaries = sm.list_feature_groups(NameContains=model_name)['FeatureGroupSummaries']
    if summaries != []:
        return summaries[0]['FeatureGroupName']
    return None


def get_featurestore_params(feature_group_name):
    """Look up where a feature group's offline store is catalogued.

    Args:
        feature_group_name: Name of the SageMaker feature group to describe.

    Returns:
        A ``(database, table_name)`` tuple read from the
        ``OfflineStoreConfig.DataCatalogConfig`` section of the
        ``describe_feature_group`` response.

    Raises:
        Exception: Carrying the service's error message when the
            ``describe_feature_group`` call fails with a ``ClientError``.
    """
    try:
        # Only the service call can raise ClientError, so only it is guarded.
        response = sm.describe_feature_group(
            FeatureGroupName=feature_group_name
        )
    except ClientError as e:
        # Chain explicitly so the original botocore error stays attached.
        raise Exception(e.response['Error']['Message']) from e
    catalog = response['OfflineStoreConfig']['DataCatalogConfig']
    return catalog['Database'], catalog['TableName']

def merge_two_dicts(x, y):
    """Return a new dict holding the entries of ``x`` overlaid with ``y``.

    ``x`` is shallow-copied first, so neither input is modified; where both
    contain the same key, the value from ``y`` wins.
    """
    merged = x.copy()
    merged.update(y)
    return merged


# Directory used for the input CSV and the generated split files.
# NOTE(review): later cells append '/<file>' to this value, so paths come out
# as './/abalone.csv' etc. — presumably harmless, but confirm on the target platform.
base_dir = './'



In [2]:
# Load the headerless CSV, naming the columns and fixing their dtypes up front.
column_dtypes = merge_two_dicts(feature_columns_dtype, label_column_dtype)
df = pd.read_csv(
    f'{base_dir}/abalone.csv',
    header=None,
    names=feature_columns_names + [label_column],
    dtype=column_dtypes,
)

# Numeric features: fill gaps with the median, then standardize.
numeric_features = [col for col in feature_columns_names if col != 'sex']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical feature: fill gaps with the literal 'missing', then one-hot encode.
categorical_features = ['sex']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Separate the label, transform the features, then put the label back as the
# first column of the combined array.
y = df.pop(label_column)
X_pre = preprocess.fit_transform(df)
y_pre = y.to_numpy().reshape(-1, 1)
X = np.concatenate((y_pre, X_pre), axis=1)

# Shuffle the rows in place before the data is split.
np.random.shuffle(X)

In [3]:
# NOTE(review): the model name is hard-coded for notebook use; the earlier
# draft read it from os.environ['MODEL_NAME'] — confirm which one the final
# script should use.
featurestore_name = confirm_featurestore('abalone')
if featurestore_name is not None:
    import subprocess
    import sys

    # Install the pinned versions with the running interpreter's pip.
    pip_exit_code = subprocess.call(
        [sys.executable, "-m", "pip", "install", "pyarrow==2", "awswrangler==2.7.0"]
    )
    if pip_exit_code != 0:
        # Report rather than raise: the packages may already be installed.
        print(f"pip install exited with status {pip_exit_code}")
    database, table = get_featurestore_params(featurestore_name)

    import awswrangler as wr

    # Column order shared by the Athena query and the locally built array.
    table_cols = [
        'rings',
        'length',
        'diameter',
        'height',
        'whole_weight',
        'shucked_weight',
        'viscera_weight',
        'shell_weight',
        'sex_F',
        'sex_I',
        'sex_M'
    ]

    query_string = f'SELECT {",".join(table_cols)} FROM "{table}"'
    featurestore_df = wr.athena.read_sql_query(query_string, database=database, ctas_approach=False)
    # The SELECT fixes the column order, so align the names positionally;
    # Athena may report them in a different case (e.g. 'sex_f'), which would
    # make the name-based concat below produce extra, NaN-filled columns.
    featurestore_df.columns = table_cols
    raw_df = pd.DataFrame(X, columns=table_cols)
    X = pd.concat([raw_df, featurestore_df]).to_numpy()
    # Shuffle again: the feature-store rows were appended at the end, and the
    # later sequential split would otherwise put them almost entirely in the
    # validation/testing tail.
    np.random.shuffle(X)

In [4]:
# Inspect the combined array: label in column 0, preprocessed features after it.
X

array([[ 7.        , -0.57455813, -0.33137077, ...,  0.        ,
         0.        ,  1.        ],
       [11.        ,  0.6329849 ,  0.5252424 , ...,  0.        ,
         0.        ,  1.        ],
       [ 5.        , -1.44898585, -1.49031801, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 6.        , -1.21593601, -1.60478947, ...,  0.        ,
         1.        ,  0.        ],
       [ 6.        , -1.9763402 , -0.82727874, ...,  0.        ,
         1.        ,  0.        ],
       [14.        ,  0.13897866,  0.40168949, ...,  1.        ,
         0.        ,  0.        ]])

In [5]:
# Cut the rows sequentially into 80% training, 15% validation, 5% testing.
n_rows = len(X)
train_end = int(.8 * n_rows)
validation_end = int(.95 * n_rows)
training, validation, testing = np.split(X, [train_end, validation_end])

# Column names of the combined array, label first.
header = [
    'rings', 'length', 'diameter', 'height',
    'whole_weight', 'shucked_weight', 'viscera_weight', 'shell_weight',
    'sex_F', 'sex_I', 'sex_M',
]

In [6]:
# Write the training, validation and testing splits for the SageMaker
# training job as CSV files without header or index columns.
for split_name, split_rows in (
    ('training', training),
    ('validation', validation),
    ('testing', testing),
):
    pd.DataFrame(split_rows).to_csv(f'{base_dir}/{split_name}.csv', header=False, index=False)