In [2]:
# Step 1: Setup the SageMaker Feature Store Environment
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup
import pandas as pd
import numpy as np
import time



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
region = boto3.Session().region_name
boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(service_name="sagemaker-featurestore-runtime", region_name=region)
sagemaker_session = sagemaker.Session(boto_session=boto_session, sagemaker_client=sagemaker_client)
role = get_execution_role()



In [4]:
# Step 2: Load datasets
housing_data = pd.read_csv("housing.csv")
gmaps_data = pd.read_csv("housing_gmaps_data_raw.csv")



In [5]:
# Step 3: Data Imputation
gmaps_data['neighborhood-political'].fillna(method='ffill', inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  gmaps_data['neighborhood-political'].fillna(method='ffill', inplace=True)
  gmaps_data['neighborhood-political'].fillna(method='ffill', inplace=True)


In [6]:
# Step 4: Merge datasets
housing_data = housing_data.merge(gmaps_data[['latitude', 'longitude', 'neighborhood-political']], 
                                  on=['latitude', 'longitude'], how='left')
housing_data['neighborhood'] = housing_data['neighborhood-political']



In [7]:
# Step 5: Feature Engineering
housing_data['event_time'] = pd.to_datetime('now').strftime('%Y-%m-%dT%H:%M:%SZ')
housing_data['1h_ocean'] = np.where(housing_data['ocean_proximity'] == '<1H OCEAN', 1, 0)
housing_data['inland'] = np.where(housing_data['ocean_proximity'] == 'INLAND', 1, 0)
housing_data['island'] = np.where(housing_data['ocean_proximity'] == 'ISLAND', 1, 0)
housing_data['near_bay'] = np.where(housing_data['ocean_proximity'] == 'NEAR BAY', 1, 0)
housing_data['near_ocean'] = np.where(housing_data['ocean_proximity'] == 'NEAR OCEAN', 1, 0)

housing_data['median_house_value'] = np.where(housing_data['median_house_value'] > 500000, 500000, housing_data['median_house_value'])

housing_data['median_house_age_group'] = (housing_data['housing_median_age'] // 10) * 10
housing_data['total_households'] = housing_data['households'].round().astype(int)
housing_data['bedrooms_per_household'] = housing_data['total_bedrooms'] / housing_data['households']
housing_data['bedrooms_per_household'].fillna(housing_data['bedrooms_per_household'].mean(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  housing_data['bedrooms_per_household'].fillna(housing_data['bedrooms_per_household'].mean(), inplace=True)


In [11]:
from sagemaker.feature_store.feature_definition import FeatureDefinition, IntegralFeatureDefinition, FractionalFeatureDefinition, StringFeatureDefinition

# Define Feature Definitions with correct feature type classes
feature_definitions = [
    StringFeatureDefinition(feature_name="neighborhood"),
    StringFeatureDefinition(feature_name="event_time"),
    IntegralFeatureDefinition(feature_name="1h_ocean"),
    IntegralFeatureDefinition(feature_name="inland"),
    IntegralFeatureDefinition(feature_name="island"),
    IntegralFeatureDefinition(feature_name="near_bay"),
    IntegralFeatureDefinition(feature_name="near_ocean"),
    FractionalFeatureDefinition(feature_name="median_house_value"),
    IntegralFeatureDefinition(feature_name="median_house_age_group"),
    IntegralFeatureDefinition(feature_name="total_households"),
    FractionalFeatureDefinition(feature_name="bedrooms_per_household")
]

# Create Feature Group with corrected Feature Definitions
neighborhood_feature_group = FeatureGroup(
    name="neighborhood-feature-group",
    feature_definitions=feature_definitions,
    sagemaker_session=sagemaker_session
)

neighborhood_feature_group.create(
    s3_uri=f"s3://sagemaker-us-east-1-940482417425/homework/neighborhood_feature_store",
    record_identifier_name='neighborhood',
    event_time_feature_name='event_time',
    role_arn=role,
    enable_online_store=True
)


ClientError: An error occurred (ValidationException) when calling the CreateFeatureGroup operation: The execution role ARN is invalid. Please ensure that the role exists and that its trust relationship policy allows the action 'sts:AssumeRole' for the service principal 'sagemaker.amazonaws.com'.

In [None]:
# Step 7: Ingest Data into Feature Group
# Ingest data into feature group
neighborhood_feature_group.ingest(data_frame=housing_data, max_workers=3, wait=True)

# Confirm ingestion
featurestore_runtime.get_record(
    FeatureGroupName="neighborhood-feature-group",
    RecordIdentifierValueAsString="Fisherman’s Wharf"
)


In [None]:
# Define queries to extract feature values for neighborhoods

query_results = []

# List of neighborhoods to query
neighborhoods = ["Brooktree", "Fisherman’s Wharf", "Los Osos"]

for neighborhood in neighborhoods:
    result = featurestore_runtime.get_record(
        FeatureGroupName="neighborhood-feature-group",
        RecordIdentifierValueAsString=neighborhood
    )
    query_results.append(result)

# Display results
for result in query_results:
    print(result)
