## Pull FRED data

In [None]:
import fredapi as fred
import pandas as pd
import os
import requests
import json
import time

# Read the key once. Falls back to a placeholder string when FRED_API_KEY is
# unset or empty, so the notebook still loads without credentials.
API_KEY = os.getenv("FRED_API_KEY") or "YOUR_FRED_API_KEY"
BASE = "https://api.stlouisfed.org"

# Initialise the fredapi client with the same key the raw REST helpers use,
# so both access paths always agree on the credential.
cli = fred.Fred(api_key=API_KEY)


In [6]:
# List the categories directly under the FRED root (id 0) and show their ids and names.
# NOTE(review): walk_categories is not defined in the visible cells — confirm it is
# still available before re-running this cell.
prime_df = walk_categories(root_id=0, max_depth=0)  # depth 0 => direct children
print(prime_df[["id","name"]])

      id                                     name
0  32991                Money, Banking, & Finance
1     10  Population, Employment, & Labor Markets
2  32992                        National Accounts
3      1           Production & Business Activity
4  32455                                   Prices
5  32263                       International Data
6   3008                       U.S. Regional Data
7  33060                            Academic Data


In [19]:
# pip install pandas requests python-dateutil
import os, time, requests, pandas as pd
from dateutil.relativedelta import relativedelta
# Observation window requested from the API; also passed as realtime_start when
# listing category series in build_category_panel.
START, END = "1980-01-01", "2019-12-31"  # data range of interest
# NOTE(review): these two constants are not referenced by any visible code — confirm they are still needed.
DEF_MIN_REALTIME, DEF_MAX_REALTIME='1776-07-04', '9999-12-31'  # default FRED realtime range
# Regex alternation of terms; a series whose title matches any of them
# (case-insensitively, see meta_passes_title_filters) is dropped.
# NOTE(review): the terms are unanchored substrings, so e.g. "lag" also matches
# "flag" and "lead" matches "leading" — confirm that breadth is intended.
TITLE_EXCLUDE_RX = (
    "forecast|projection|expectation|outlook|estimate|nowcast|"
    "discontinued|lag|lead|advance|future"
)

def _get(path, **params):
    """Call a FRED REST endpoint and return the decoded JSON body.

    Args:
        path: Endpoint path appended to ``BASE`` (e.g. ``"/fred/series"``).
        **params: Query parameters. ``api_key`` and ``file_type="json"`` are
            supplied as defaults; a caller-provided value for either wins.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: The final response had an error status.
        requests.ConnectionError, requests.Timeout: The network failure
            persisted through every attempt.

    Transient failures (connection errors, timeouts, HTTP 429 and 5xx) are
    retried with exponential backoff so that a brief outage does not abort a
    long crawl; other error statuses are raised immediately.
    """
    # Defaults first, caller values last, so callers can still override them.
    query = {"api_key": API_KEY, "file_type": "json", **params}
    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        last_try = attempt == max_attempts
        try:
            r = requests.get(f"{BASE}{path}", params=query, timeout=30)
        except (requests.ConnectionError, requests.Timeout):
            if last_try:
                raise
        else:
            transient = r.status_code == 429 or r.status_code >= 500
            if last_try or not transient:
                r.raise_for_status()
                return r.json()
        time.sleep(2 ** attempt)  # back off 2s, 4s, 8s between attempts

# ---------- Category traversal ----------
def get_category_children(category_id: int):
    """Return the child-category records of ``category_id``.

    Queries ``/fred/category/children`` and hands back the value stored under
    ``"categories"``; an empty list is returned when that key is missing.
    """
    payload = _get("/fred/category/children", category_id=category_id)
    if "categories" in payload:
        return payload["categories"]
    return []

def get_category_series(category_id: int, realtime_start='1980-01-01', limit=1000, sleep=1.0):
    """Return the metadata records of every series filed directly under a category.

    Pages through ``/fred/category/series`` ``limit`` rows at a time, ordered
    by series id, until a page shorter than ``limit`` signals the end.

    Args:
        category_id: FRED category to list.
        realtime_start: Forwarded to the API as ``realtime_start``.
        limit: Page size requested from the API.
        sleep: Seconds to pause between consecutive page requests.

    Returns:
        A list of series-metadata dicts in the order the API returned them.
    """
    out, offset = [], 0
    while True:
        js = _get("/fred/category/series", category_id=category_id,
                  limit=limit, offset=offset, order_by="series_id",
                  realtime_start=realtime_start)
        ser = js.get("seriess", [])
        out.extend(ser)
        # "count" is the total the API reports for the query, not a running
        # tally, so report what has actually been collected against it.
        print(f'{len(out)} of {js.get("count", 0)} series so far...', end='\r')
        if len(ser) < limit:
            break
        offset += limit
        time.sleep(sleep)  # pause only between pages, never after the last one
    return out

def prime_categories():
    """Return the children of category 0 (the top buckets) as an id/name table.

    The frame has exactly the columns ``id`` and ``name``, is sorted by name
    and carries a fresh 0..n-1 index. When the API returns no children the
    result is an empty frame with those two columns rather than a KeyError.
    """
    kids = get_category_children(0)
    # Passing columns= selects the two fields and also defines them for an empty list.
    return (pd.DataFrame(kids, columns=["id", "name"])
            .sort_values("name")
            .reset_index(drop=True))
# ---------- Category traversal (recursive/iterative) ----------
def iter_category_ids(root_id: int):
    """Lazily yield ``root_id`` followed by every category id reachable from it.

    Traversal is depth-first using an explicit LIFO stack; each id is yielded
    at most once, and its children are only fetched after it has been yielded.
    """
    visited = set()
    pending = [root_id]
    while pending:
        current = pending.pop()
        if current not in visited:
            visited.add(current)
            yield current
            # Children are pushed in API order; the last one pushed is visited next.
            pending.extend(child["id"] for child in get_category_children(current))

def get_all_series_under_category(root_id: int, realtime_start='1980-01-01', limit=1000, sleep=0.1):
    """Gather series metadata for ``root_id`` and all of its descendant categories.

    A series listed under several categories appears once in the result; the
    record from the first category in which it was encountered is the one kept.
    """
    unique = {}
    for category in iter_category_ids(root_id):
        records = get_category_series(category, realtime_start=realtime_start,
                                      limit=limit, sleep=sleep)
        for record in records:
            # setdefault leaves an existing entry untouched, so the first sighting wins
            unique.setdefault(record["id"], record)
    return list(unique.values())

# ---------- First release (ALFRED) ----------
def first_release_date_for_series(series_id: str) -> str | None:
    """Return the earliest release date (YYYY-MM-DD) of the release a series belongs to.

    Looks up the series' release via ``/fred/series/release`` and then asks
    ``/fred/release/dates`` for that release's dates.

    Returns:
        The earliest date string, or ``None`` when the series has no release
        or the release has no recorded dates.
    """
    releases = _get("/fred/series/release", series_id=series_id).get("releases", [])
    if not releases:
        return None
    # Request ascending order and a single row explicitly, so the first element
    # is the earliest date no matter what the endpoint's default ordering is.
    dates = _get("/fred/release/dates", release_id=releases[0]["id"],
                 sort_order="asc", limit=1).get("release_dates", [])
    if not dates:
        return None
    return dates[0]["date"]  # earliest release date

# ---------- Observations helpers ----------
def _obs_to_series(js, series_id: str, freq: str) -> pd.Series:
    obs = pd.DataFrame(js.get("observations", []))
    if obs.empty: return pd.Series(dtype="float64", name=series_id)
    obs["date"] = pd.to_datetime(obs["date"])
    obs["value"] = pd.to_numeric(obs["value"].replace(".", pd.NA), errors="coerce")
    s = obs.set_index("date")["value"]
    return s.rename(series_id)

def fetch_first_vintage_monthly(series_id: str, series_freq_short: str) -> pd.Series:
    """
    Fetch a series as of its first release date and return it at monthly frequency.

      - "M" series are requested as-is.
      - Every other frequency code (including "D" and "W") is requested with
        frequency='m' and aggregation_method='avg', i.e. converted by the API.

    Observations are limited to START..END. An empty float64 Series named
    ``series_id`` is returned when no first release date can be determined.
    """
    vintage = first_release_date_for_series(series_id)
    if not vintage:
        return pd.Series(dtype="float64", name=series_id)

    # Pin both ends of the realtime window to the same date.
    query = {
        "series_id": series_id,
        "realtime_start": vintage,
        "realtime_end": vintage,
        "observation_start": START,
        "observation_end": END,
    }
    if series_freq_short != "M":
        # Anything that is not already monthly goes through monthly averaging.
        query["frequency"] = "m"
        query["aggregation_method"] = "avg"

    payload = _get("/fred/series/observations", **query)
    return _obs_to_series(payload, series_id, "M")


# ---------- Filters ----------
def meta_passes_title_filters(meta: dict) -> bool:
    """Return True when the series title matches none of the excluded terms.

    The title is read from ``meta["title"]`` (a missing or empty title counts
    as an empty string) and searched case-insensitively against
    ``TITLE_EXCLUDE_RX``.
    """
    import re  # local import: this module is not among the file's visible imports

    title = meta.get("title") or ""
    # A plain regex search on one string replaces building a pandas Series per
    # title and yields a real bool.
    return re.search(TITLE_EXCLUDE_RX, title, flags=re.IGNORECASE) is None

def is_discontinued(meta: dict) -> bool:
    """Report whether the series title contains "discontinued", ignoring case.

    A missing, ``None`` or empty title is treated as not discontinued.
    """
    raw_title = meta.get("title")
    if not raw_title:
        return False
    return raw_title.lower().find("discontinued") != -1

def keep_by_frequency(meta: dict) -> bool:
    """Return True for series whose ``frequency_short`` is M, D or W (any case).

    Monthly series are used directly; daily and weekly ones are kept because
    they can be aggregated to monthly. Every other code, and a missing or
    empty code, is rejected.
    """
    code = meta.get("frequency_short")
    return bool(code) and code.upper() in {"M", "D", "W"}

def series_is_complete_monthly(s: pd.Series) -> bool:
    """Tell whether ``s`` is non-empty and free of missing values."""
    if s.empty:
        # Nothing to check: an empty series never counts as complete.
        return False
    return s.notna().all()

# ---------- Pipeline (one top category as example) ----------
def build_category_panel(cat_id: int, include_descendants: bool = True, rate_sleep=0.1):
    """Build a wide panel (one column per series) for a FRED category.

    Steps:
      1. Collect series metadata for the category (optionally including all
         descendant categories).
      2. Keep only series that pass the frequency and title filters and are
         not marked discontinued.
      3. Download each remaining series via ``fetch_first_vintage_monthly``
         and keep those with no missing values.

    Args:
        cat_id: Category to process.
        include_descendants: Also walk every descendant category when True.
        rate_sleep: Seconds to pause between API calls.

    Returns:
        A DataFrame with one column per kept series, or an empty frame indexed
        by the months of START..END when nothing was kept.

    Series whose download fails with an HTTP error are skipped (best effort),
    and the number skipped is reported instead of being dropped silently.
    """
    # 1) gather metas
    if include_descendants:
        meta_list = get_all_series_under_category(cat_id, realtime_start=START, sleep=rate_sleep)
    else:
        meta_list = get_category_series(cat_id, realtime_start=START, sleep=rate_sleep)

    print(f"Meta candidates before filters: {len(meta_list)}")
    # 2) apply metadata filters
    meta_list = [m for m in meta_list
                 if keep_by_frequency(m) and meta_passes_title_filters(m) and not is_discontinued(m)]

    # 3) download and keep complete series
    kept, failed = [], []
    for m in meta_list:
        sid = m["id"]
        fshort = (m.get("frequency_short") or "").upper()
        try:
            s = fetch_first_vintage_monthly(sid, fshort)
        except requests.HTTPError:
            # One rejected request must not abort the whole panel, but remember
            # it. Only the id is kept: the error text embeds the request URL.
            failed.append(sid)
        else:
            if series_is_complete_monthly(s):
                kept.append(s)
        time.sleep(rate_sleep)  # be nice to API

    if failed:
        preview = ", ".join(failed[:10])
        more = "" if len(failed) <= 10 else ", ..."
        print(f"Skipped {len(failed)} series after HTTP errors: {preview}{more}")

    if kept:
        return pd.concat(kept, axis=1)
    # Derive the fallback index from the configured window rather than repeating the dates.
    return pd.DataFrame(index=pd.period_range(START, END, freq="M").to_timestamp("M"))

if __name__ == "__main__":
    # Show the top-level categories first
    primes = prime_categories()
    print(primes)

    # Build and save one panel per top-level category; the full sweep is slow
    for cid, cname in zip(primes["id"], primes["name"]):
        print(f"Category {cid} - {cname}:")
        panel = build_category_panel(cid)
        print(f"  Panel shape: {panel.shape}")
        # File name embeds the category id and the lower-cased name with spaces as underscores
        panel.to_csv(f"fred_category_{cid}_{cname.lower().replace(' ','_')}_first_vintage_monthly_1980_2019.csv")

      id                                     name
0  33060                            Academic Data
1  32263                       International Data
2  32991                Money, Banking, & Finance
3  32992                        National Accounts
4     10  Population, Employment, & Labor Markets
5  32455                                   Prices
6      1           Production & Business Activity
7   3008                       U.S. Regional Data
Category 33060 - Academic Data:
Meta candidates before filters: 16551


ConnectionError: HTTPSConnectionPool(host='api.stlouisfed.org', port=443): Max retries exceeded with url: /fred/series/release?series_id=M0473AUS000NYM267NNBR&api_key=<REDACTED>&file_type=json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A039FE2360>: Failed to resolve 'api.stlouisfed.org' ([Errno 11001] getaddrinfo failed)"))

In [None]:
# Fetch US Unemployment Rate (series ID: UNRATE) for 1980-2019 and turn it into
# a frame whose former index becomes an ordinary column.
unemployment = (
    cli.get_series('UNRATE', observation_start='1980-01-01', observation_end='2019-12-31')
    .to_frame(name='unemployment_rate')
    .reset_index()
)


In [5]:
# Give the column named 'index' (left behind by the earlier reset_index) the name 'date'.
unemployment.rename(columns={'index': 'date'}, inplace=True)

In [6]:
# Save the frame without the row index.
# NOTE(review): the file name says "daily", but the fetch above requests no
# daily frequency — confirm the name is intended.
unemployment.to_csv('unemployment_rate_daily.csv', index=False)

In [None]:
# Get monthly data: request UNRATE with frequency='m', flatten it into a
# month / unemployment_rate table and save it without the row index.
unemployment_monthly = (
    cli.get_series('UNRATE', observation_start='1980-01-01', observation_end='2019-12-31', frequency='m')
    .to_frame(name='unemployment_rate')
    .reset_index()
    .rename(columns={'index': 'month'})
)

unemployment_monthly.to_csv('unemployment_rate_monthly.csv', index=False)