# Syncing BLS API to S3
This script issues API requests to the BLS endpoint and stores the JSON responses in S3. 
The filenames are not hardcoded; they are built from the series IDs, and the script skips the upload when nothing has changed.

**[Foundation Version (first ingest to S3)](https://github.com/ScottySchmidt/AWS_DataEngineer_API/blob/main/01-ingest-apis-to-s3.ipynb)**
This version laid the foundation for the sync version.

---
### What it does
- Pulls data from the BLS API  
- Makes a filename from the series IDs  
- Compares the new data to what’s already in S3  
- Uploads only if the file has changed  

In [1]:
import boto3
import requests
import hashlib
import json
import os
from kaggle_secrets import UserSecretsClient

# Read every credential / config value from Kaggle secrets so that nothing
# sensitive is hardcoded in the notebook. The secrets are fetched in the
# order listed and unpacked into the matching module-level names.
secrets = UserSecretsClient()
(
    API_KEY,
    AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY,
    AWS_REGION,
    BUCKET_NAME,
    SERIES_IDS,  # still a single comma-separated string at this point
) = (
    secrets.get_secret(secret_name)
    for secret_name in (
        "BLS_API_KEY",
        "AWS_ACCESS_KEY_ID",
        "AWS_SECRET_ACCESS_KEY",
        "AWS_REGION",
        "BUCKET_NAME",
        "SERIES_IDS",
    )
)

# Create the S3 client from an explicit session built with the keys above.
session = boto3.Session(
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
s3 = session.client("s3")

# Connectivity check: list the bucket and print only a key count, so no
# credentials appear in the cell output. A failure is reported, not raised.
try:
    response = s3.list_objects_v2(Bucket=BUCKET_NAME)
except Exception as err:
    print("S3 connection failed: ", err)
else:
    num_files = response.get('KeyCount', 0)
    print("S3 connection successful. Bucket contains: ", num_files)

# Turn the comma-separated secret into a clean list of series IDs (blank
# entries dropped). The same list drives the BLS request and the file name.
SERIES_IDS = list(filter(None, map(str.strip, SERIES_IDS.split(","))))

# Sorting the IDs makes the name independent of the order they were listed
# in, so the same set of series always maps to the same file.
filename = "bls_" + "-".join(sorted(SERIES_IDS)) + ".json"
s3_key = "bls/api/" + filename

print("filename:", filename)
print("s3 key:", s3_key)

# Required request headers for BLS
headers = {
    "Content-Type": "application/json",
    "User-Agent": os.getenv("USER_AGENT", "ScottSchmidt/1.0 (email)"),
}

# API request parameters: every series ID in one call, plus the API key.
payload = {
    "seriesid": SERIES_IDS,
    "registrationkey": API_KEY,
}

# Send request to BLS API
resp = requests.post(
    "https://api.bls.gov/publicAPI/v2/timeseries/data/",
    data=json.dumps(payload),
    headers=headers,
    timeout=60,
)

# Transport-level check: anything other than HTTP 200 is fatal.
if resp.status_code != 200:
    raise RuntimeError(f"BLS error {resp.status_code}: {resp.text[:200]}")
data = resp.json()

# Application-level check: the JSON body carries its own "status" field
# ("REQUEST_SUCCEEDED" on success). Stop here on any other value so that an
# error payload is never saved locally or uploaded over good data.
bls_status = data.get("status") if isinstance(data, dict) else None
if bls_status != "REQUEST_SUCCEEDED":
    bls_message = data.get("message") if isinstance(data, dict) else data
    raise RuntimeError(f"BLS request not successful ({bls_status}): {bls_message}")
print("got data")

# Serialize once so the local copy and the S3 object hold the same text.
serialized = json.dumps(data, indent=2)

# Keep a local copy named after the series IDs.
with open(filename, "w") as local_file:
    local_file.write(serialized)
print("saved:", filename)

# Push the same JSON to the bucket under the derived key.
s3.put_object(Body=serialized, Key=s3_key, Bucket=BUCKET_NAME)
print("uploaded:", s3_key)

S3 connection successful. Bucket contains:  45
filename: bls_CEU0000000001-CUUR0000SA0-SUUR0000SA0.json
s3 key: bls/api/bls_CEU0000000001-CUUR0000SA0-SUUR0000SA0.json
got data
saved: bls_CEU0000000001-CUUR0000SA0-SUUR0000SA0.json
uploaded: bls/api/bls_CEU0000000001-CUUR0000SA0-SUUR0000SA0.json


In [2]:
# Fetch every entry of SERIES_IDS with its own request and store each result
# as a separate local file and S3 object. `current_files` records the file
# names written in this run.
current_files = []

for group in SERIES_IDS:
    # An entry is split on commas again, so it may be a single ID or a
    # comma-separated group of IDs that belong in one file.
    ids = [s.strip() for s in group.split(",") if s.strip()]
    filename = f"bls_{'-'.join(sorted(ids))}.json"
    s3_key   = f"bls/api/{filename}"

    # make request (same endpoint and headers as the combined request)
    payload = {"seriesid": ids, "registrationkey": API_KEY}
    resp = requests.post("https://api.bls.gov/publicAPI/v2/timeseries/data/",
                         data=json.dumps(payload), headers=headers, timeout=60)

    # Fail fast instead of saving/uploading an error response: check the HTTP
    # status first, then the "status" field inside the JSON body
    # ("REQUEST_SUCCEEDED" on success).
    if resp.status_code != 200:
        raise RuntimeError(f"BLS error {resp.status_code}: {resp.text[:200]}")
    data = resp.json()
    bls_status = data.get("status") if isinstance(data, dict) else None
    if bls_status != "REQUEST_SUCCEEDED":
        bls_message = data.get("message") if isinstance(data, dict) else data
        raise RuntimeError(
            f"BLS request not successful for {ids} ({bls_status}): {bls_message}"
        )

    # save & upload: serialize once so both copies are the same text
    body = json.dumps(data, indent=2)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(body)
    s3.put_object(Bucket=BUCKET_NAME, Key=s3_key, Body=body)

    # keep track of what we just wrote
    current_files.append(filename)

print("files this run:", current_files)

files this run: ['bls_CUUR0000SA0.json', 'bls_SUUR0000SA0.json', 'bls_CEU0000000001.json']


In [3]:
# Sanity check S3 file: fetch the object stored under the current `s3_key`,
# then print its size and the first 300 bytes of its content.
obj = s3.get_object(Bucket=BUCKET_NAME, Key=s3_key)
print("bytes:", obj["ContentLength"])

# Read only the head of the stream and always close it afterwards, since the
# rest of the object is never consumed.
stream = obj["Body"]
try:
    head = stream.read(300)
finally:
    stream.close()

# A 300-byte cut can land in the middle of a multi-byte UTF-8 character, so
# decode leniently rather than letting the preview raise UnicodeDecodeError.
print("preview:", head.decode("utf-8", errors="replace"))

bytes: 6806
preview: {
  "status": "REQUEST_SUCCEEDED",
  "responseTime": 133,
  "message": [],
  "Results": {
    "series": [
      {
        "seriesID": "CEU0000000001",
        "data": [
          {
            "year": "2025",
            "period": "M07",
            "periodName": "July",
            "latest": "true"


In [4]:
# CHECK FOR CHANGES:
import hashlib, botocore

# Hash the data
def md5(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    encoded = s.encode("utf-8")
    return hashlib.md5(encoded).hexdigest()

# Serialize exactly as it would be uploaded, so the hash is comparable with
# what S3 stored for a previous upload of the same text.
# NOTE(review): the stored JSON includes BLS's `responseTime` field, which
# presumably differs between requests; if so, a freshly fetched response would
# never hash equal to the stored one. Confirm before relying on this check.
body = json.dumps(data, indent=2)

# Look up the ETag of the object currently stored under this key.
# NOTE(review): an ETag equals the content MD5 only for single-part uploads
# without SSE-KMS encryption; confirm that holds for this bucket.
try:
    etag = s3.head_object(Bucket=BUCKET_NAME, Key=s3_key)["ETag"].strip('"')
except botocore.exceptions.ClientError as err:
    # Only "object does not exist" means "nothing stored yet". Any other
    # failure (access denied, throttling, ...) is re-raised rather than
    # being treated as a reason to upload.
    error_code = err.response.get("Error", {}).get("Code")
    if error_code not in ("404", "NoSuchKey", "NotFound"):
        raise
    etag = None

# Data is the same:
if etag == md5(body):
    print("no changes; skipped")
else:
    s3.put_object(Bucket=BUCKET_NAME, Key=s3_key, Body=body)
    print("uploaded:", s3_key)

no changes; skipped


In [5]:
# cleanup: remove files in S3 that no longer exist upstream
print("Checking For Deletes...")

# list current source file names (the files written by this run)
src_files = set(current_files)

BLS_PREFIX = os.getenv("BLS_PREFIX", "bls/api/")

# List keys in S3 under the prefix. A single list_objects_v2 call returns at
# most 1000 keys, so page through the whole listing.
s3_files = set()
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=BUCKET_NAME, Prefix=BLS_PREFIX):
    for entry in page.get("Contents", []):
        name = entry["Key"][len(BLS_PREFIX):]
        # Only direct children of the prefix are candidates. Skip the prefix
        # placeholder itself (empty name) and nested keys, whose base name
        # would not map back to the right key when re-joined with the prefix.
        if name and "/" not in name:
            s3_files.add(name)

# find extras in S3 not in source
to_delete = s3_files - src_files

count = 0
if not src_files:
    # With no source files recorded, every object under the prefix would look
    # stale; refuse to empty the prefix on what is most likely a failed run.
    print("No source files recorded this run; skipping deletes.")
else:
    for name in sorted(to_delete):
        key = f"{BLS_PREFIX}{name}"
        print("delete:", key)
        s3.delete_object(Bucket=BUCKET_NAME, Key=key)
        count += 1
print("Done. Deleted this many files: ", count)

Checking For Deletes...
delete: bls/api/bls_CEU0000000001-CUUR0000SA0-SUUR0000SA0.json
Done. Deleted this many files:  1


##  Preview Synced BLS Data
Preview the BLS JSON fetched in this run (the same content that was synced to S3) as a DataFrame for analysis. Only the first series of the most recent response is shown.

In [6]:
# Show the fetched data as a DataFrame.
import pandas as pd

# Only the first series of the most recent response is previewed.
series = data["Results"]["series"][0]
sid = series["seriesID"]

# One row per observation, with the series ID appended as the last column.
df = pd.DataFrame(series["data"]).assign(seriesID=sid)

print(df.shape)
df.head(32)   # display up to the first 32 rows

(31, 7)


Unnamed: 0,year,period,periodName,latest,value,footnotes,seriesID
0,2025,M07,July,True,159227,"[{'code': 'P', 'text': 'preliminary'}]",CEU0000000001
1,2025,M06,June,,160293,"[{'code': 'P', 'text': 'preliminary'}]",CEU0000000001
2,2025,M05,May,,159930,[{}],CEU0000000001
3,2025,M04,April,,159227,[{}],CEU0000000001
4,2025,M03,March,,158402,[{}],CEU0000000001
5,2025,M02,February,,157944,[{}],CEU0000000001
6,2025,M01,January,,157095,[{}],CEU0000000001
7,2024,M12,December,,159923,[{}],CEU0000000001
8,2024,M11,November,,159882,[{}],CEU0000000001
9,2024,M10,October,,159352,[{}],CEU0000000001
