# Notebook 05: SageMaker Feature Store

In [1]:
import os
import sys
import time
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum


sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/fatimatatanda/Library/Application Support/sagemaker/config.yaml


In [2]:
# Add project root to path so that the `config` package below can be imported.
# NOTE(review): ".." is resolved against the kernel's working directory, so this
# presumably assumes the notebook is run from its own folder — confirm.
sys.path.append("..")

# Project-level settings used by the cells below: the S3 bucket, the mapping of
# S3 key prefixes, the AWS region and the Feature Group name.
from config.config import (
    BUCKET_NAME,
    S3_PREFIX,
    AWS_REGION,
    FEATURE_GROUP_NAME
)

In [None]:
# Initialize SageMaker session
# One boto3 session pinned to the project region; every AWS client in this cell
# is derived from it so they all share the same region and credentials.
boto_session = boto3.Session(region_name=AWS_REGION)
sagemaker_session = Session(boto_session=boto_session)

# IAM role name used as the fallback when the SDK cannot resolve an execution
# role on its own. It can be overridden with the SAGEMAKER_EXECUTION_ROLE_NAME
# environment variable; the default is the name this notebook has always used.
LOCAL_EXECUTION_ROLE_NAME = os.environ.get(
    'SAGEMAKER_EXECUTION_ROLE_NAME', 'sagemaker-execution-role'
)

# Get SageMaker execution role
try:
    # Reuse the session created above instead of letting the SDK build a
    # second, implicit default session.
    role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)
except ValueError:
    # The SDK could not derive a role from the current identity (the captured
    # run shows this happening for a local, non-SageMaker identity), so look
    # the role up by name in IAM using the same boto3 session.
    iam_client = boto_session.client('iam')
    role = iam_client.get_role(RoleName=LOCAL_EXECUTION_ROLE_NAME)['Role']['Arn']

print(f"Region: {AWS_REGION}")
print(f"Role: {role}")
print(f"Feature Group Name: {FEATURE_GROUP_NAME}")

Couldn't call 'get_role' to get Role ARN from role name fatimat-admin to get Role path.


Region: us-east-1
Role: arn:aws:iam::306617143793:role/sagemaker-execution-role
Feature Group Name: nfci-feature-group


## Load Training Features

In [4]:
# Load training data from S3
# The URI is assembled from project config: the bucket name plus the 'train'
# entry of the S3 prefix mapping.
train_path = f"s3://{BUCKET_NAME}/{S3_PREFIX['train']}/features.parquet"
print(f"Loading from: {train_path}")

# The parquet file is read directly from the s3:// URI.
# NOTE(review): presumably requires an S3 filesystem backend for pandas to be
# installed in the kernel environment — confirm.
df_train = pd.read_parquet(train_path)
print(f"Shape: {df_train.shape}")
# Last expression of the cell: rendered as the preview table.
df_train.head()

Loading from: s3://nfci-forecasting-306617143793/data/splits/train/features.parquet
Shape: (6800, 80)


Unnamed: 0,state_fips,date,UNRATE,PAYEMS,CIVPART,EMRATIO,U6RATE,AWHMAN,AHETPI,CPIAUCSL,...,UNRATE_RMEAN12,UNRATE_RSTD12,NFCI_RMEAN3,NFCI_RSTD3,NFCI_RMEAN6,NFCI_RSTD6,NFCI_RMEAN12,NFCI_RSTD12,NFCI_HORIZON6,NFCI_HORIZON12
0,1,2005-01-01,5.3,132781,65.8,62.4,9.2,40.7,15.9,191.6,...,5.3,0.0,-0.70536,0.0,-0.70536,0.0,-0.70536,0.0,-0.61572,-0.60428
1,1,2005-02-01,5.4,133033,65.9,62.4,9.2,40.7,15.93,192.4,...,5.3,0.0,-0.70536,0.0,-0.70536,0.0,-0.70536,0.0,-0.60043,-0.63435
2,1,2005-03-01,5.2,133152,65.9,62.4,9.1,40.4,15.97,193.1,...,5.35,0.070711,-0.70906,0.005233,-0.70906,0.005233,-0.70906,0.005233,-0.56827,-0.62978
3,1,2005-04-01,5.2,133519,66.1,62.7,9.0,40.4,16.01,193.7,...,5.3,0.1,-0.695253,0.024198,-0.695253,0.024198,-0.695253,0.024198,-0.58529,-0.62065
4,1,2005-05-01,5.1,133689,66.1,62.8,8.9,40.4,16.03,193.6,...,5.275,0.095743,-0.66398,0.050709,-0.674325,0.046286,-0.674325,0.046286,-0.58353,-0.58318


## Prepare Data for Feature Store

In [7]:
def prepare_for_feature_store(df):
    """
    Return a copy of ``df`` reshaped for Feature Store ingestion.

    The copy has lowercase column names, a ``record_id`` column of the form
    ``<state_fips>_<YYYYMMDD>``, an ``event_time`` column formatted as
    ``YYYY-MM-DDTHH:MM:SSZ``, ``date`` rendered as a ``YYYY-MM-DD`` string and
    ``state_fips`` cast to string. ``record_id`` and ``event_time`` are moved
    to the front; the remaining columns keep their relative order.

    Parameters:
        df: DataFrame containing ``state_fips`` and ``date`` columns (any
            letter case). ``date`` may hold datetimes or date-like strings.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        KeyError: if ``state_fips`` or ``date`` is missing after lowercasing.
    """
    df = df.copy()

    # Standardize column names to lowercase
    df.columns = df.columns.str.lower()

    missing = [name for name in ('state_fips', 'date') if name not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Coerce once up front so the `.dt` accessor below also works when the
    # dates arrive as strings/objects rather than datetime64.
    dates = pd.to_datetime(df['date'])
    state_fips = df['state_fips'].astype(str)

    # Create unique record ID
    df['record_id'] = state_fips + '_' + dates.dt.strftime('%Y%m%d')

    # Create event time
    df['event_time'] = dates.dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    # Convert date to string
    df['date'] = dates.dt.strftime('%Y-%m-%d')

    # Ensure state_fips is string
    df['state_fips'] = state_fips

    # Reorder columns: identifier and event time first, everything else after
    leading = ['record_id', 'event_time']
    remaining = [c for c in df.columns if c not in leading]
    return df[leading + remaining]

In [9]:
# Prepare training data
# Produces a reshaped copy; `df_train` itself is left as loaded.
df_features = prepare_for_feature_store(df_train)

# Quick sanity prints: overall shape, the leading columns, and one example of
# each of the two derived columns.
print(f"Shape: {df_features.shape}")
print(f"\nFirst few columns: {df_features.columns[:10].tolist()}")
print(f"\nSample record_id: {df_features['record_id'].iloc[0]}")
print(f"Sample event_time: {df_features['event_time'].iloc[0]}")

Shape: (6800, 82)

First few columns: ['record_id', 'event_time', 'state_fips', 'date', 'unrate', 'payems', 'civpart', 'emratio', 'u6rate', 'awhman']

Sample record_id: 1_20050101
Sample event_time: 2005-01-01T00:00:00Z


In [10]:
# Preview the prepared data
# Bare last expression so the notebook renders the first rows as a table.
df_features.head()

Unnamed: 0,record_id,event_time,state_fips,date,unrate,payems,civpart,emratio,u6rate,awhman,...,unrate_rmean12,unrate_rstd12,nfci_rmean3,nfci_rstd3,nfci_rmean6,nfci_rstd6,nfci_rmean12,nfci_rstd12,nfci_horizon6,nfci_horizon12
0,1_20050101,2005-01-01T00:00:00Z,1,2005-01-01,5.3,132781,65.8,62.4,9.2,40.7,...,5.3,0.0,-0.70536,0.0,-0.70536,0.0,-0.70536,0.0,-0.61572,-0.60428
1,1_20050201,2005-02-01T00:00:00Z,1,2005-02-01,5.4,133033,65.9,62.4,9.2,40.7,...,5.3,0.0,-0.70536,0.0,-0.70536,0.0,-0.70536,0.0,-0.60043,-0.63435
2,1_20050301,2005-03-01T00:00:00Z,1,2005-03-01,5.2,133152,65.9,62.4,9.1,40.4,...,5.35,0.070711,-0.70906,0.005233,-0.70906,0.005233,-0.70906,0.005233,-0.56827,-0.62978
3,1_20050401,2005-04-01T00:00:00Z,1,2005-04-01,5.2,133519,66.1,62.7,9.0,40.4,...,5.3,0.1,-0.695253,0.024198,-0.695253,0.024198,-0.695253,0.024198,-0.58529,-0.62065
4,1_20050501,2005-05-01T00:00:00Z,1,2005-05-01,5.1,133689,66.1,62.8,8.9,40.4,...,5.275,0.095743,-0.66398,0.050709,-0.674325,0.046286,-0.674325,0.046286,-0.58353,-0.58318


## Feature Definitions

In [11]:
def get_feature_definitions(df):
    """
    Create Feature Definitions from DataFrame dtypes.

    Each column is mapped to one feature type:
      * integer dtypes (including pandas nullable ``Int*``/``UInt*``) -> INTEGRAL
      * float dtypes (including pandas nullable ``Float*``)          -> FRACTIONAL
      * everything else (object, string, bool, datetime, ...)        -> STRING

    Parameters:
        df: DataFrame whose columns should become features.

    Returns:
        List of FeatureDefinition objects, one per column, in column order.
    """
    feature_definitions = []

    # Dtype predicates are used instead of substring checks on the dtype name:
    # name matching misses the capitalised nullable dtypes ("Int64", "Float64")
    # and wrongly matches names that merely contain "int" (e.g. "interval[...]").
    for col, dtype in df.dtypes.items():
        if pd.api.types.is_integer_dtype(dtype):
            feature_type = FeatureTypeEnum.INTEGRAL
        elif pd.api.types.is_float_dtype(dtype):
            feature_type = FeatureTypeEnum.FRACTIONAL
        else:
            # default to string for text and any other/unknown types
            feature_type = FeatureTypeEnum.STRING

        feature_definitions.append(
            FeatureDefinition(feature_name=col, feature_type=feature_type)
        )

    return feature_definitions

In [12]:
# Get feature definitions
# One definition per column of the prepared frame, in column order.
feature_definitions = get_feature_definitions(df_features)

print(f"Total features: {len(feature_definitions)}")
print(f"\nFirst 10 feature definitions:")
# Print name and type for the first ten definitions only, to keep the output short.
for fd in feature_definitions[:10]:
    print(f"  {fd.feature_name}: {fd.feature_type.value}")

Total features: 82

First 10 feature definitions:
  record_id: String
  event_time: String
  state_fips: String
  date: String
  unrate: Fractional
  payems: Integral
  civpart: Fractional
  emratio: Fractional
  u6rate: Fractional
  awhman: Fractional


## Create Feature Group

In [13]:
# Create Feature Group object
# This builds the client-side handle bound to the session configured earlier;
# the service-side create() call is issued in a later cell.
feature_group = FeatureGroup(
    name=FEATURE_GROUP_NAME,
    sagemaker_session=sagemaker_session
)

print(f"Feature Group: {feature_group.name}")

Feature Group: nfci-feature-group


In [15]:
# Check if Feature Group already exists
def feature_group_exists(feature_group_name):
    """
    Report whether a Feature Group with the given name can be described.

    Parameters:
        feature_group_name: Name of the Feature Group to look up.

    Returns:
        True when the describe call succeeds, False when the service reports
        the resource as not found. Any other error propagates to the caller.
    """
    client = boto3.client('sagemaker', region_name=AWS_REGION)
    try:
        client.describe_feature_group(FeatureGroupName=feature_group_name)
    except client.exceptions.ResourceNotFound:
        # A missing group is an expected outcome, not an error.
        return False
    else:
        return True

# Informational check only: report the current state without changing anything.
if feature_group_exists(FEATURE_GROUP_NAME):
    print(f"Feature Group '{FEATURE_GROUP_NAME}' already exists")
else:
    print(f"Feature Group '{FEATURE_GROUP_NAME}' does not exist")

Feature Group 'nfci-feature-group' does not exist


In [16]:
# S3 location for offline store
# Built from the project bucket and the 'features' entry of the prefix mapping;
# passed as `s3_uri` when the Feature Group is created.
offline_store_s3_uri = f"s3://{BUCKET_NAME}/{S3_PREFIX['features']}"
print(f"Offline store location: {offline_store_s3_uri}")

Offline store location: s3://nfci-forecasting-306617143793/features


In [17]:
# Create the Feature Group (if it doesn't exist)
# The existence check makes the cell safe to re-run: create() is only issued
# when the describe call reports the group as missing.
if not feature_group_exists(FEATURE_GROUP_NAME):
    # Let the SDK derive the feature definitions from the prepared frame.
    feature_group.load_feature_definitions(data_frame=df_features)
    
    # 'record_id' and 'event_time' are the two columns added by
    # prepare_for_feature_store().
    feature_group.create(
        s3_uri=offline_store_s3_uri,
        record_identifier_name='record_id',
        event_time_feature_name='event_time',
        role_arn=role,
        enable_online_store=False,  # Offline only for batch processing
        description='NFCI forecasting features derived from FRED and Census data'
    )
    # The message says "initiated": the status is polled in a separate cell.
    print(f"Feature Group '{FEATURE_GROUP_NAME}' creation initiated")
else:
    print(f"Feature Group '{FEATURE_GROUP_NAME}' already exists")

Feature Group 'nfci-feature-group' creation initiated


In [18]:
# Wait for Feature Group to be created
def wait_for_feature_group(feature_group, timeout_minutes=5, poll_interval_seconds=10):
    """
    Wait for Feature Group to be in 'Created' status.

    Polls ``feature_group.describe()`` for as long as the reported
    ``FeatureGroupStatus`` is 'Creating', or until the timeout elapses.

    Parameters:
        feature_group: Object exposing ``describe()`` that returns a mapping
            with a 'FeatureGroupStatus' key.
        timeout_minutes: Maximum time to keep polling, in minutes.
        poll_interval_seconds: Pause between two describe calls, in seconds.

    Returns:
        True if the last observed status is 'Created'; False otherwise
        (timeout while still 'Creating', or any other status).
    """
    description = feature_group.describe()
    status = description.get('FeatureGroupStatus')
    print(f"Status: {status}")

    timeout_seconds = timeout_minutes * 60
    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    start_time = time.monotonic()

    while status == 'Creating':
        elapsed = time.monotonic() - start_time
        if elapsed > timeout_seconds:
            print(f"Timeout after {timeout_minutes} minutes")
            break

        # '\r' keeps the progress message on a single, self-overwriting line.
        print(f"  Waiting... ({int(elapsed)}s)", end='\r')
        time.sleep(poll_interval_seconds)
        description = feature_group.describe()
        status = description.get('FeatureGroupStatus')

    print(f"\nFinal status: {status}")

    # Surface the service-provided reason, when there is one, so a failed
    # creation is not reported as a bare status string.
    failure_reason = description.get('FailureReason')
    if status != 'Created' and failure_reason:
        print(f"Failure reason: {failure_reason}")

    return status == 'Created'

# Poll with the default timeout; the boolean result is shown as the cell output.
wait_for_feature_group(feature_group)

Status: Creating
  Waiting... (0s)
Final status: Created


True

## Ingest Features into Feature Store
- Load the prepared features (`df_features`) into the Feature Group.

In [19]:
# Ingest features into Feature Group
print(f"Ingesting {len(df_features):,} records...")

# The whole prepared frame is handed to the SDK in a single call.
# NOTE(review): `max_workers=3` presumably sets the number of parallel
# ingestion workers and `wait=True` presumably blocks until they finish —
# confirm against the SDK documentation.
# NOTE(review): nothing in this cell guards against running it twice; check
# whether a re-run writes the same records again.
feature_group.ingest(
    data_frame=df_features,
    max_workers=3,
    wait=True
)

print(f"Ingestion complete")

Ingesting 6,800 records...
Ingestion complete


## Verify Ingestion

In [20]:
# Query Feature Store using Athena
# Query handle obtained from the Feature Group; reused by the count query below.
feature_store_query = feature_group.athena_query()

# Get the table name
# The name comes from the query handle rather than being hard-coded (the
# captured run shows it carrying a numeric suffix).
table_name = feature_store_query.table_name
print(f"Athena table: {table_name}")

Athena table: nfci_feature_group_1769986015


### Run a simple query

In [21]:
# Run a simple count query
# The table name is double-quoted inside the SQL text.
query_string = f"""
SELECT COUNT(*) as record_count
FROM "{table_name}"
"""

print(f"Running query: {query_string}")

# Query results are written under the 'athena_results' prefix of the project bucket.
feature_store_query.run(
    query_string=query_string,
    output_location=f"s3://{BUCKET_NAME}/{S3_PREFIX['athena_results']}/"
)

# NOTE(review): wait() presumably blocks until the query has finished — confirm.
feature_store_query.wait()
query_result = feature_store_query.as_dataframe()
# The result is read from its first row, column 'record_count' (the alias used in the query).
print(f"\nRecords in Feature Store: {query_result['record_count'].iloc[0]:,}")

Running query: 
SELECT COUNT(*) as record_count
FROM "nfci_feature_group_1769986015"


Records in Feature Store: 6,800


## Retrieve Features for Training

In [22]:
def get_training_data_from_feature_store(feature_group, output_s3_uri, feature_columns=None):
    """
    Fetch rows of a Feature Group through its Athena query interface.

    Parameters:
        feature_group: FeatureGroup object to read from
        output_s3_uri: S3 location where the query results are written
        feature_columns: Column names to select; None or an empty list
            selects every column

    Returns:
        DataFrame holding the query result
    """
    athena = feature_group.athena_query()
    source_table = athena.table_name

    # A missing or empty column list falls back to selecting everything.
    selection = ', '.join(feature_columns) if feature_columns else '*'
    sql = f'SELECT {selection} FROM "{source_table}"'

    # Submit the query, wait on it, then hand back the result as a DataFrame.
    athena.run(query_string=sql, output_location=output_s3_uri)
    athena.wait()
    return athena.as_dataframe()

In [23]:
# Retrieve all features
print("Retrieving features from Feature Store...")

# No `feature_columns` argument, so the helper selects every column; results
# go to the same 'athena_results' prefix used by the count query.
df_from_fs = get_training_data_from_feature_store(
    feature_group=feature_group,
    output_s3_uri=f"s3://{BUCKET_NAME}/{S3_PREFIX['athena_results']}/"
)

# In the captured run the result has three more columns than the ingested
# frame (write_time, api_invocation_time, is_deleted).
print(f"Retrieved {len(df_from_fs):,} records with {len(df_from_fs.columns)} columns")
df_from_fs.head()

Retrieving features from Feature Store...
Retrieved 6,800 records with 85 columns


Unnamed: 0,record_id,event_time,state_fips,date,unrate,payems,civpart,emratio,u6rate,awhman,...,nfci_rstd3,nfci_rmean6,nfci_rstd6,nfci_rmean12,nfci_rstd12,nfci_horizon6,nfci_horizon12,write_time,api_invocation_time,is_deleted
0,15_20051001,2005-10-01T00:00:00Z,15,2005-10-01,5.0,134643,66.1,62.8,8.7,41.0,...,0.02422,-0.600612,0.016893,-0.632159,0.050636,-0.62065,-0.62929,2026-02-01 22:53:46.940,2026-02-01 22:49:55.000,False
1,38_20051001,2005-10-01T00:00:00Z,38,2005-10-01,5.0,134643,66.1,62.8,8.7,41.0,...,0.02422,-0.600612,0.016893,-0.632159,0.050636,-0.62065,-0.62929,2026-02-01 22:53:46.940,2026-02-01 22:50:38.000,False
2,41_20070401,2007-04-01T00:00:00Z,41,2007-04-01,4.5,137845,65.9,63.0,8.2,41.3,...,0.039061,-0.641245,0.027079,-0.612699,0.041862,-0.00181,0.64595,2026-02-01 22:53:46.610,2026-02-01 22:49:05.000,False
3,5_20070501,2007-05-01T00:00:00Z,5,2007-05-01,4.4,137993,66.0,63.0,8.2,41.1,...,0.039129,-0.634512,0.034619,-0.610052,0.042315,0.47973,0.50079,2026-02-01 22:53:46.935,2026-02-01 22:49:08.000,False
4,38_20070501,2007-05-01T00:00:00Z,38,2007-05-01,4.4,137993,66.0,63.0,8.2,41.1,...,0.039129,-0.634512,0.034619,-0.610052,0.042315,0.47973,0.50079,2026-02-01 22:53:46.935,2026-02-01 22:50:39.000,False
