### Notebook 05: Feature Store

1.	Load production-ready engineered features from S3
2.	Prepare Feature Store–compatible schema (record identifier and event time)
3.	Create or recreate SageMaker Feature Group (offline + online store)
4.	Ingest production feature data into the Feature Store
5.	Validate successful ingestion via Feature Group status and sample records

In [21]:
import os
from dotenv import load_dotenv

# Pull variables from a .env file (when one is found) into the process
# environment before the role ARN is read.
load_dotenv()

ROLE_ARN = os.environ.get("SAGEMAKER_ROLE_ARN")

# Fail fast: an unset or empty ARN would otherwise only surface later,
# when the role is handed to IAM / Feature Group creation.
if ROLE_ARN is None or ROLE_ARN == "":
    raise RuntimeError(
        "SAGEMAKER_ROLE_ARN not found.\n"
        "Ensure it is set in .env or environment variables."
    )

print("Using ROLE_ARN:", ROLE_ARN)

Using ROLE_ARN: arn:aws:iam::222634372778:role/service-role/AmazonSageMaker-ExecutionRole-20250603T080776


In [None]:
import sys
import time

import boto3
import pandas as pd
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup

# project imports
# The parent directory must be on sys.path *before* `config` is imported;
# otherwise a fresh kernel (Restart & Run All) can fail with
# ModuleNotFoundError when `config` is only reachable through "..".
# The membership check keeps re-runs of this cell from growing sys.path.
if ".." not in sys.path:
    sys.path.append("..")

from config.config import (
    BUCKET_NAME,
    AWS_REGION,
    S3_PREFIX,
    RAW_DATA_FILENAME,
    FEATURE_GROUP_NAME,
    SAGEMAKER_ROLE_NAME,
)

# One boto3 session pinned to the configured region; every client below is
# derived from it so they all share the same region.
boto_session = boto3.Session(region_name=AWS_REGION)
sagemaker_client = boto_session.client("sagemaker")
featurestore_runtime = boto_session.client("sagemaker-featurestore-runtime")
iam_client = boto_session.client("iam")

# SageMaker SDK session wired to the clients above; passed to FeatureGroup
# objects in later cells.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

print("Bucket:", BUCKET_NAME)
print("Region:", AWS_REGION)

Bucket: nfci-forecasting-222634372778
Region: us-east-1


In [5]:
# Read the production feature split straight from S3 and make sure the
# `date` column is a proper datetime before anything downstream uses `.dt`.
prod_path = f"s3://{BUCKET_NAME}/{S3_PREFIX['production']}/features.parquet"
print("Loading:", prod_path)

df_prod = pd.read_parquet(prod_path).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

print(df_prod.shape)
df_prod.head()

Loading: s3://nfci-forecasting-222634372778/data/splits/production/features.parquet
(2400, 80)


Unnamed: 0,state_fips,date,UNRATE,PAYEMS,CIVPART,EMRATIO,U6RATE,AWHMAN,AHETPI,CPIAUCSL,...,UNRATE_RMEAN12,UNRATE_RSTD12,NFCI_RMEAN3,NFCI_RSTD3,NFCI_RMEAN6,NFCI_RSTD6,NFCI_RMEAN12,NFCI_RSTD12,NFCI_HORIZON6,NFCI_HORIZON12
0,1,2020-01-01,3.6,152031,63.3,61.1,7.0,41.3,23.91,259.127,...,3.675,0.13568,-0.555937,0.011606,-0.54595,0.025566,-0.560604,0.03451,-0.47596,-0.61856
1,1,2020-02-01,3.5,152292,63.3,61.1,7.0,41.5,24.03,259.25,...,3.641667,0.090034,-0.580157,0.03438,-0.553283,0.038521,-0.570093,0.033448,-0.50853,-0.62623
2,1,2020-03-01,4.4,150895,62.6,59.8,8.8,41.2,24.13,258.076,...,3.616667,0.083485,-0.4882,0.172328,-0.514615,0.114413,-0.546187,0.086446,-0.49183,-0.64756
3,1,2020-04-01,14.8,130424,60.1,51.2,22.9,38.4,25.14,256.032,...,3.666667,0.238683,-0.210283,0.455457,-0.38311,0.34478,-0.47241,0.251798,-0.5052,-0.68131
4,1,2020-05-01,13.2,133040,60.8,52.7,21.1,39.4,24.98,255.802,...,4.591667,3.223623,0.052137,0.304028,-0.26401,0.396717,-0.407293,0.307116,-0.5669,-0.69283


In [6]:
# Derive the Feature Store frame from the production split:
#   * record_id  -> "<state_fips>_<YYYY-MM-DD>", used as the record identifier
#   * event_time -> ISO-8601 string built from `date`
#   * every column whose name contains "HORIZON" (the targets, e.g.
#     NFCI_HORIZON6 / NFCI_HORIZON12) is dropped
# The raw `date` column is kept alongside event_time.
drop_cols = [col for col in df_prod.columns if "HORIZON" in col]

df_fs = df_prod.assign(
    record_id=lambda frame: (
        frame["state_fips"].astype(str) + "_" + frame["date"].dt.strftime("%Y-%m-%d")
    ),
    event_time=lambda frame: frame["date"].dt.strftime("%Y-%m-%dT%H:%M:%SZ"),
).drop(columns=drop_cols)

print("After FS prep:", df_fs.shape)
df_fs.head()

After FS prep: (2400, 80)


Unnamed: 0,state_fips,date,UNRATE,PAYEMS,CIVPART,EMRATIO,U6RATE,AWHMAN,AHETPI,CPIAUCSL,...,UNRATE_RMEAN12,UNRATE_RSTD12,NFCI_RMEAN3,NFCI_RSTD3,NFCI_RMEAN6,NFCI_RSTD6,NFCI_RMEAN12,NFCI_RSTD12,record_id,event_time
0,1,2020-01-01,3.6,152031,63.3,61.1,7.0,41.3,23.91,259.127,...,3.675,0.13568,-0.555937,0.011606,-0.54595,0.025566,-0.560604,0.03451,1_2020-01-01,2020-01-01T00:00:00Z
1,1,2020-02-01,3.5,152292,63.3,61.1,7.0,41.5,24.03,259.25,...,3.641667,0.090034,-0.580157,0.03438,-0.553283,0.038521,-0.570093,0.033448,1_2020-02-01,2020-02-01T00:00:00Z
2,1,2020-03-01,4.4,150895,62.6,59.8,8.8,41.2,24.13,258.076,...,3.616667,0.083485,-0.4882,0.172328,-0.514615,0.114413,-0.546187,0.086446,1_2020-03-01,2020-03-01T00:00:00Z
3,1,2020-04-01,14.8,130424,60.1,51.2,22.9,38.4,25.14,256.032,...,3.666667,0.238683,-0.210283,0.455457,-0.38311,0.34478,-0.47241,0.251798,1_2020-04-01,2020-04-01T00:00:00Z
4,1,2020-05-01,13.2,133040,60.8,52.7,21.1,39.4,24.98,255.802,...,4.591667,3.223623,0.052137,0.304028,-0.26401,0.396717,-0.407293,0.307116,1_2020-05-01,2020-05-01T00:00:00Z


In [7]:
# Cast datatypes safely (Feature Store is picky)
def cast_for_feature_store(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with dtypes normalised for Feature Store.

    Conversions, applied in this order:

    * ``object`` columns  -> pandas ``"string"`` dtype
    * ``bool`` columns    -> ``int64``
    * datetime columns (naive or UTC-aware) -> ``"string"``

    All other columns are left as they are, and the input frame is not
    modified.
    """
    converted = df.copy()

    # Bulk conversions: (dtypes to select, dtype to cast them to).
    bulk_casts = (
        (["object"], "string"),
        (["bool"], "int64"),
    )
    for selector, target_dtype in bulk_casts:
        matching = converted.select_dtypes(include=selector).columns
        if len(matching) == 0:
            continue
        converted[matching] = converted[matching].astype(target_dtype)

    # Datetimes are stringified one column at a time; the frame is expected
    # to carry a separate string event-time column already.
    datetime_cols = converted.select_dtypes(
        include=["datetime64[ns]", "datetime64[ns, UTC]"]
    ).columns
    for col in datetime_cols:
        converted[col] = converted[col].astype("string")

    return converted

# Normalise dtypes on the Feature Store frame, then preview the dtypes of
# the first 20 columns.
df_fs = df_fs.pipe(cast_for_feature_store)
df_fs.dtypes.iloc[:20]

state_fips             int64
date          string[python]
UNRATE               float64
PAYEMS                 int64
CIVPART              float64
EMRATIO              float64
U6RATE               float64
AWHMAN               float64
AHETPI               float64
CPIAUCSL             float64
CPILFESL             float64
PCEPI                float64
PCEPILFE             float64
PPIFDG               float64
INDPRO               float64
RRSFS                float64
DGORDER              float64
UMCSENT              float64
HOUST                float64
PERMIT               float64
dtype: object

In [None]:
# Attach an inline policy to the SageMaker execution role so that it has the S3 read/write permissions Feature Store needs for the offline store

import json
import boto3
import sys

sys.path.append("..")
from config.config import BUCKET_NAME, AWS_REGION, S3_PREFIX


# put_role_policy takes the bare role name, i.e. the last "/"-separated
# segment of the ARN (any path such as "service-role/" is dropped).
ROLE_NAME = ROLE_ARN.rsplit("/", 1)[-1]
POLICY_NAME = "nfci-featurestore-s3-inline"

# Object-level access is scoped to the offline-store prefix only.
offline_prefix = S3_PREFIX["features"]  # "features"
bucket_arn = f"arn:aws:s3:::{BUCKET_NAME}"
objects_arn = f"{bucket_arn}/{offline_prefix}/*"

# Bucket-level permissions (a prefix cannot be applied to these actions).
bucket_statement = {
    "Sid": "AllowBucketChecksForFeatureStore",
    "Effect": "Allow",
    "Action": [
        "s3:GetBucketAcl",
        "s3:GetBucketLocation",
        "s3:ListBucket"
    ],
    "Resource": bucket_arn
}

# Object-level permissions, limited to objects under the offline prefix.
object_statement = {
    "Sid": "AllowOfflineStoreWrites",
    "Effect": "Allow",
    "Action": [
        "s3:PutObject",
        "s3:GetObject",
        "s3:DeleteObject"
    ],
    "Resource": objects_arn
}

policy_doc = {
    "Version": "2012-10-17",
    "Statement": [bucket_statement, object_statement]
}

# Create (or overwrite) the inline policy named POLICY_NAME on the role.
iam = boto3.client("iam", region_name=AWS_REGION)
iam.put_role_policy(
    RoleName=ROLE_NAME,
    PolicyName=POLICY_NAME,
    PolicyDocument=json.dumps(policy_doc)
)

print(" Attached inline policy:", POLICY_NAME)
print("Bucket:", BUCKET_NAME)
print("Objects scope:", objects_arn)

 Attached inline policy: nfci-featurestore-s3-inline
Bucket: nfci-forecasting-222634372778
Objects scope: arn:aws:s3:::nfci-forecasting-222634372778/features/*


#### Verify that the S3 permissions are in place

In [11]:
sys.path.append("..")
from config.config import BUCKET_NAME, AWS_REGION

# Smoke-test bucket access: each call is followed by a confirmation print,
# so a failure stops the cell before the matching message is shown.
# NOTE(review): this client uses the notebook's own credentials, which may
# differ from ROLE_ARN - confirm that is the identity you mean to test.
s3 = boto3.client("s3", region_name=AWS_REGION)

print("Testing: head_bucket...")
s3.head_bucket(Bucket=BUCKET_NAME)
print(" bucket exists")

print("Testing: get_bucket_acl...")
acl = s3.get_bucket_acl(Bucket=BUCKET_NAME)
owner_id = acl["Owner"].get("ID", "ok")  # fall back to "ok" when no ID is returned
print(" GetBucketAcl works. Owner:", owner_id)

Testing: head_bucket...
 bucket exists
Testing: get_bucket_acl...
 GetBucketAcl works. Owner: 1a33686cfeaff4eb4ae3303289431f03ee8e498cd8e5d4dd654d488724d6e9fa


In [None]:
# Feature Group create-if-not-exists, else delete-and-recreate
import sys
import time
import boto3
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from botocore.exceptions import ClientError

sys.path.append("..")
from config.config import (
    AWS_REGION,
    BUCKET_NAME,
    S3_PREFIX,
    FEATURE_GROUP_NAME,
)


# ---- build sessions ----
# A region-pinned boto3 session, the two SageMaker clients derived from it,
# and the SageMaker SDK session that ties them together.
boto_session = boto3.Session(region_name=AWS_REGION)
sagemaker_client, featurestore_runtime = (
    boto_session.client(service_name, region_name=AWS_REGION)
    for service_name in ("sagemaker", "sagemaker-featurestore-runtime")
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)

# ---- offline store s3 uri ----
# Built from the bucket name and the "features" prefix in the project config.
offline_store_s3_uri = f"s3://{BUCKET_NAME}/{S3_PREFIX['features']}"
print("Offline store uri:", offline_store_s3_uri)

# Names of the two mandatory Feature Group columns in df_fs.
record_identifier_feature_name, event_time_feature_name = "record_id", "event_time"

# ---- helper: wait until FG is deleted ----
def wait_until_deleted(fg_name: str, sm_client, poll_sec: int = 15, timeout_sec: int = 20 * 60):
    """Block until the feature group ``fg_name`` no longer exists.

    Polls ``sm_client.describe_feature_group`` every ``poll_sec`` seconds and
    returns once the call raises ``sm_client.exceptions.ResourceNotFound``.

    Args:
        fg_name: Name of the feature group being deleted.
        sm_client: SageMaker client exposing ``describe_feature_group`` and
            ``exceptions.ResourceNotFound``.
        poll_sec: Seconds to sleep between polls.
        timeout_sec: Maximum seconds to keep polling.

    Raises:
        RuntimeError: The group reports status ``DeleteFailed``; polling on
            would only burn the whole timeout before failing anyway.
        TimeoutError: The group still exists after ``timeout_sec`` seconds.
    """
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout_sec
    while True:
        try:
            description = sm_client.describe_feature_group(FeatureGroupName=fg_name) or {}
        except sm_client.exceptions.ResourceNotFound:
            print(" Feature group fully deleted.")
            return

        # A failed deletion is terminal, so surface it right away with the
        # service-provided reason instead of waiting for the timeout.
        if description.get("FeatureGroupStatus") == "DeleteFailed":
            reason = description.get("FailureReason", "no failure reason reported")
            raise RuntimeError(f"Feature group deletion failed for {fg_name}: {reason}")

        if time.monotonic() > deadline:
            raise TimeoutError(f"Timed out waiting for feature group deletion: {fg_name}")
        print("Waiting for deletion to complete...")
        time.sleep(poll_sec)

# ---- helper: wait until created ----
def wait_until_created(fg_obj: FeatureGroup, poll_sec: int = 15, timeout_sec: int = 30 * 60):
    """Block until ``fg_obj`` leaves the ``Creating`` status.

    Calls ``fg_obj.describe()`` every ``poll_sec`` seconds while the reported
    ``FeatureGroupStatus`` is ``"Creating"``.

    Args:
        fg_obj: Feature group whose creation has been requested.
        poll_sec: Seconds to sleep between polls.
        timeout_sec: Maximum seconds to keep polling.

    Raises:
        TimeoutError: The status is still ``Creating`` after ``timeout_sec``.
        RuntimeError: The final status is anything other than ``Created``;
            the message includes ``FailureReason`` when the describe
            response carries one.
    """
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + timeout_sec
    description = fg_obj.describe()
    status = description.get("FeatureGroupStatus")
    while status == "Creating":
        if time.monotonic() > deadline:
            raise TimeoutError(f"Timed out waiting for feature group creation: {fg_obj.name}")
        print("Waiting... status=Creating")
        time.sleep(poll_sec)
        description = fg_obj.describe()
        status = description.get("FeatureGroupStatus")

    if status != "Created":
        # Keep the service-provided reason so a failed create is diagnosable
        # from the traceback alone.
        reason = description.get("FailureReason")
        detail = f" (reason: {reason})" if reason else ""
        raise RuntimeError(f"Feature group ended in unexpected status: {status}{detail}")

    print(" Final status:", status)

# ---- delete if exists ----
try:
    # describe raises ResourceNotFound when the group is absent, which
    # routes straight to the "does not exist" branch below.
    sagemaker_client.describe_feature_group(FeatureGroupName=FEATURE_GROUP_NAME)
    print(f"Feature group '{FEATURE_GROUP_NAME}' already exists → deleting it first...")

    fg_existing = FeatureGroup(name=FEATURE_GROUP_NAME, sagemaker_session=feature_store_session)
    fg_existing.delete()

    # Creation must not start until the old group is really gone.
    wait_until_deleted(FEATURE_GROUP_NAME, sagemaker_client)
except sagemaker_client.exceptions.ResourceNotFound:
    print(f"Feature group '{FEATURE_GROUP_NAME}' does not exist → will create a new one.")
# Any other AWS error (permissions, throttling, etc.) propagates unchanged.

# ---- (re)create feature group ----
fg = FeatureGroup(name=FEATURE_GROUP_NAME, sagemaker_session=feature_store_session)

# Schema is inferred from df_fs, which must already contain the record_id
# and event_time columns.
fg.load_feature_definitions(data_frame=df_fs)

# Offline store at the configured S3 URI, plus an online store.
create_kwargs = dict(
    s3_uri=offline_store_s3_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=ROLE_ARN,
    enable_online_store=True,
)
fg.create(**create_kwargs)

print("Feature group create called:", FEATURE_GROUP_NAME)

# Block until the group leaves "Creating"; raises if it does not end up "Created".
wait_until_created(fg)

Offline store uri: s3://nfci-forecasting-222634372778/features
Feature group 'nfci-feature-group' already exists → deleting it first...
Waiting for deletion to complete...
 Feature group fully deleted.
Feature group create called: nfci-feature-group
Waiting... status=Creating
Waiting... status=Creating
 Final status: Created


In [16]:
from sagemaker.feature_store.feature_group import FeatureGroup

fg = FeatureGroup(name=FEATURE_GROUP_NAME, sagemaker_session=feature_store_session)
desc = fg.describe()

# Print a short summary: each entry is (label, key path into the describe response).
for label, key_path in (
    ("Status:", ("FeatureGroupStatus",)),
    ("Online store:", ("OnlineStoreConfig", "EnableOnlineStore")),
    ("Offline S3:", ("OfflineStoreConfig", "S3StorageConfig", "ResolvedOutputS3Uri")),
):
    value = desc
    for key in key_path:
        value = value[key]
    print(label, value)

Status: Created
Online store: True
Offline S3: s3://nfci-forecasting-222634372778/features/222634372778/sagemaker/us-east-1/offline-store/nfci-feature-group-1770010610/data


In [17]:
# Push every row of df_fs into the feature group using 4 worker threads.
# wait=True is passed, so the call is expected to return only once ingestion
# has finished - the message below relies on that.
fg.ingest(df_fs, max_workers=4, wait=True)
print(" Ingestion complete")

 Ingestion complete


In [18]:
# Read one ingested record back through the runtime client as a spot check,
# using the identifier of the first row of df_fs.
sample_id = str(df_fs["record_id"].iat[0])

sample_record = featurestore_runtime.get_record(
    FeatureGroupName=FEATURE_GROUP_NAME,
    RecordIdentifierValueAsString=sample_id,
)
sample_record

{'ResponseMetadata': {'RequestId': '4ee78aaf-6c8c-4270-afdb-e1243221ea81',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4ee78aaf-6c8c-4270-afdb-e1243221ea81',
   'content-type': 'application/json',
   'content-length': '6607',
   'date': 'Mon, 02 Feb 2026 05:39:25 GMT'},
  'RetryAttempts': 0},
 'Record': [{'FeatureName': 'state_fips', 'ValueAsString': '1'},
  {'FeatureName': 'date', 'ValueAsString': '2020-01-01'},
  {'FeatureName': 'UNRATE', 'ValueAsString': '3.6'},
  {'FeatureName': 'PAYEMS', 'ValueAsString': '152031'},
  {'FeatureName': 'CIVPART', 'ValueAsString': '63.3'},
  {'FeatureName': 'EMRATIO', 'ValueAsString': '61.1'},
  {'FeatureName': 'U6RATE', 'ValueAsString': '7.0'},
  {'FeatureName': 'AWHMAN', 'ValueAsString': '41.3'},
  {'FeatureName': 'AHETPI', 'ValueAsString': '23.91'},
  {'FeatureName': 'CPIAUCSL', 'ValueAsString': '259.127'},
  {'FeatureName': 'CPILFESL', 'ValueAsString': '266.716'},
  {'FeatureName': 'PCEPI', 'ValueAsString': '104.49'},
  {'Featu