# Notebook 01: S3 and Data Lake Setup

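- Create an S3 bucket for the data lake, enable versioning on it, create the folder (prefix) layout, upload the raw dataset, and verify that the uploaded file can be read back from S3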


In [33]:
# import libraries
import os
import sys

import boto3
import pandas as pd
from boto3.exceptions import S3UploadFailedError
from botocore.exceptions import ClientError

In [34]:
# load config to path so we can import it
sys.path.append("..")
from config.config import (
    BUCKET_NAME,
    AWS_REGION,
    S3_PREFIX,
    RAW_DATA_FILENAME,
    get_s3_uri,
)

# Echo the loaded settings so the reader can confirm which bucket/region
# the rest of the notebook will operate on.
print(
    f"Bucket Name: {BUCKET_NAME}",
    f"Region: {AWS_REGION}",
    sep="\n",
)

Bucket Name: nfci-forecasting-306617143793
Region: us-east-1


- Create the bucket (skipped if it already exists)

In [35]:
# Create S3 client
s3_client = boto3.client("s3", region_name=AWS_REGION)

# create S3 resource object
s3_resource = boto3.resource("s3", region_name=AWS_REGION)

# Create the S3 bucket only if it does not exist yet.
# head_bucket raises ClientError for *any* failure (missing bucket, access
# denied, wrong region, ...), so inspect the error code and only fall through
# to creation when the bucket is genuinely missing.
try:
    s3_client.head_bucket(Bucket=BUCKET_NAME)
    print(f"Bucket '{BUCKET_NAME}' already exists.")
except ClientError as err:
    error_code = str(err.response.get("Error", {}).get("Code", ""))
    if error_code not in ("404", "NoSuchBucket", "NotFound"):
        # e.g. 403: the bucket exists but is owned by someone else or we lack
        # permission. Creating it would fail with a misleading error, so
        # surface the original one instead.
        raise

    create_kwargs = {"Bucket": BUCKET_NAME}
    # us-east-1 is the default location and must NOT be passed as a
    # LocationConstraint; every other region requires it explicitly.
    if AWS_REGION != "us-east-1":
        create_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": AWS_REGION
        }
    s3_client.create_bucket(**create_kwargs)
    print(f"Bucket '{BUCKET_NAME}' created.")

Bucket 'nfci-forecasting-306617143793' already exists.


- Enable bucket versioning

In [36]:
# Switch the bucket's versioning status to "Enabled".
versioning_config = {"Status": "Enabled"}
s3_client.put_bucket_versioning(
    Bucket=BUCKET_NAME,
    VersioningConfiguration=versioning_config,
)

print(f"✓ Versioning enabled for '{BUCKET_NAME}'")

✓ Versioning enabled for 'nfci-forecasting-306617143793'


### Create folder structure for data organization

In [37]:
# Write one zero-byte object per configured prefix; a key ending in "/"
# serves as the folder placeholder.
for prefix_name, prefix_path in S3_PREFIX.items():
    # Normalise the key so it always ends with exactly the trailing slash
    # that marks it as a folder.
    folder_key = prefix_path
    if not folder_key.endswith("/"):
        folder_key = f"{folder_key}/"

    s3_client.put_object(Bucket=BUCKET_NAME, Key=folder_key, Body=b"")
    print(f"  ✓ Created: {folder_key}")

  ✓ Created: data/raw/
  ✓ Created: data/cleaned/
  ✓ Created: data/splits/
  ✓ Created: data/splits/train/
  ✓ Created: data/splits/validation/
  ✓ Created: data/splits/test/
  ✓ Created: data/splits/production/
  ✓ Created: features/
  ✓ Created: models/artifacts/
  ✓ Created: models/baselines/
  ✓ Created: forecasts/nfci_predictions/
  ✓ Created: monitoring/data_quality/
  ✓ Created: monitoring/model_quality/
  ✓ Created: athena-results/


## Upload raw data

In [38]:
LOCAL_DATA_PATH = "../data/state_month_full.csv"

# Confirm the raw CSV is present locally before attempting the upload,
# and report its size in MB.
if not os.path.exists(LOCAL_DATA_PATH):
    print(f"File not found: {LOCAL_DATA_PATH}")
    print("update LOCAL_DATA_PATH to point to data file")
else:
    size_in_bytes = os.path.getsize(LOCAL_DATA_PATH)
    file_size_mb = size_in_bytes / (1024 * 1024)
    print(f"Found data file: {LOCAL_DATA_PATH}")
    print(f"  Size: {file_size_mb:.2f} MB")

Found data file: ../data/state_month_full.csv
  Size: 3.45 MB


In [39]:
def upload_file_to_s3(local_path, bucket, s3_key):
    """Upload a local file to an S3 bucket and report the outcome.

    Args:
        local_path: Path of the file on the local filesystem.
        bucket: Name of the destination S3 bucket.
        s3_key: Object key (path inside the bucket) to write to.

    Returns:
        True if the upload succeeded, False if S3 rejected or failed it.
    """
    try:
        s3_client.upload_file(local_path, bucket, s3_key)
    except (ClientError, S3UploadFailedError) as e:
        # boto3's managed transfer wraps service errors in
        # S3UploadFailedError, which is not a ClientError subclass, so both
        # must be caught for the False path to be reachable.
        print(f"Upload failed: {e}")
        return False

    print(f"Uploaded: s3://{bucket}/{s3_key}")
    return True

In [40]:
# Join prefix and filename with exactly one "/" regardless of whether the
# configured prefix already carries a trailing slash (the folder-creation
# cell accepts both forms, so this one should too).
raw_prefix = S3_PREFIX["raw"].rstrip("/")
raw_data_s3_key = f"{raw_prefix}/{RAW_DATA_FILENAME}"

print(f"Uploading to: s3://{BUCKET_NAME}/{raw_data_s3_key}")

# Upload the file; the helper's True/False result is shown as the cell output
upload_file_to_s3(LOCAL_DATA_PATH, BUCKET_NAME, raw_data_s3_key)

Uploading to: s3://nfci-forecasting-306617143793/data/raw/state_month_full.csv
Uploaded: s3://nfci-forecasting-306617143793/data/raw/state_month_full.csv


True

- Verify that the uploaded data can be read back from S3

In [42]:
# Read directly from S3 (pandas is imported in the notebook's import cell;
# reading s3:// URIs with pandas requires the optional s3fs dependency)
s3_uri = get_s3_uri("raw", RAW_DATA_FILENAME)
print(f"Reading from: {s3_uri}\n")

# Read just the first 5 rows: enough to confirm the object is readable and
# the header parsed, without downloading the whole file
df_raw_data = pd.read_csv(s3_uri, nrows=5)
print(f"Shape: {df_raw_data.shape}")
print(f"Columns: {list(df_raw_data.columns)}\n")
df_raw_data.head()

Reading from: s3://nfci-forecasting-306617143793/data/raw/state_month_full.csv

Shape: (5, 42)
Columns: ['state_fips', 'date', 'UNRATE', 'PAYEMS', 'CIVPART', 'EMRATIO', 'U6RATE', 'AWHMAN', 'AHETPI', 'CPIAUCSL', 'CPILFESL', 'PCEPI', 'PCEPILFE', 'PPIFDG', 'INDPRO', 'RRSFS', 'DGORDER', 'UMCSENT', 'HOUST', 'PERMIT', 'CSUSHPINSA', 'MSPUS', 'FEDFUNDS', 'DGS3MO', 'DGS2', 'DGS10', 'MORTGAGE30US', 'BAA', 'AAA', 'BAMLH0A0HYM2', 'NFCI', 'M2SL', 'WALCL', 'SPREAD_10Y_2Y', 'SPREAD_10Y_3M', 'CPI_YOY', 'state_name', 'B19013_001E', 'B01003_001E', 'B25077_001E', 'B25064_001E', 'B17001_002E']



Unnamed: 0,state_fips,date,UNRATE,PAYEMS,CIVPART,EMRATIO,U6RATE,AWHMAN,AHETPI,CPIAUCSL,...,WALCL,SPREAD_10Y_2Y,SPREAD_10Y_3M,CPI_YOY,state_name,B19013_001E,B01003_001E,B25077_001E,B25064_001E,B17001_002E
0,1,2005-01-01,5.3,132781,65.8,62.4,9.2,40.7,15.9,191.6,...,807262.0,0.85,1.63,,,42081.0,4712651.0,117600.0,644.0,786544.0
1,1,2005-02-01,5.4,133033,65.9,62.4,9.2,40.7,15.93,192.4,...,804576.0,0.77,1.6,,,42081.0,4712651.0,117600.0,644.0,786544.0
2,1,2005-03-01,5.2,133152,65.9,62.4,9.1,40.4,15.97,193.1,...,807551.0,0.7,1.71,,,42081.0,4712651.0,117600.0,644.0,786544.0
3,1,2005-04-01,5.2,133519,66.1,62.7,9.0,40.4,16.01,193.7,...,809797.0,0.55,1.31,,,42081.0,4712651.0,117600.0,644.0,786544.0
4,1,2005-05-01,5.1,133689,66.1,62.8,8.9,40.4,16.03,193.6,...,810160.0,0.4,1.01,,,42081.0,4712651.0,117600.0,644.0,786544.0
