<a href="https://colab.research.google.com/github/VarunBanka/fed-sentiment-analyzer/blob/main/fed_main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import sys

# Make the parent of the current working directory importable (presumably so
# project modules one level above the notebook can be imported -- confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path every time.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [15]:
# Download the Fed statements/minutes dataset from Kaggle and stage its CSV
# files under /content/data/raw.

import kagglehub
path = kagglehub.dataset_download("drlexus/fed-statements-and-minutes")

# Move it to /data/raw

import os
import shutil
from pathlib import Path

# Use the directory kagglehub reports instead of a hard-coded cache path:
# the location depends on the dataset version and on where kagglehub keeps
# its files (for example a shared Colab cache), so a fixed
# ".../versions/3" path breaks as soon as either changes.
source_dir = Path(path)

dest_dir = Path("/content/data/raw")
dest_dir.mkdir(parents=True, exist_ok=True)

# Sorted so the copy order (and the log below) is deterministic.
csv_files = sorted(source_dir.glob("*.csv"))

if not csv_files:
    raise FileNotFoundError("No CSV files found in Kaggle cache directory.")

for file in csv_files:
    dest_file = dest_dir / file.name
    shutil.copy(file, dest_file)
    print(f"Copied {file.name} → {dest_file}")

print("Done.")

Using Colab cache for faster access to the 'fed-statements-and-minutes' dataset.
Copied Fed_Scrape-2015-2023.csv → /content/data/raw/Fed_Scrape-2015-2023.csv
Done.


In [17]:
# clean the dataset

import pandas as pd

# Load the raw scrape and lower-case the column names in a single expression,
# so `df` is never left in a half-normalized state between cell runs.
df = pd.read_csv("/content/data/raw/Fed_Scrape-2015-2023.csv").rename(columns=str.lower)

# Rebuild one row per (date, type) by joining that group's text fragments in
# their original row order. Missing fragments are dropped before the join;
# otherwise astype(str) would inject the literal word "nan" into the document.
grouped = (
    df.groupby(["date", "type"])["text"]
    .apply(lambda parts: " ".join(parts.dropna().astype(str)))
    .reset_index()
)

print("Number of reconstructed documents:", len(grouped))

def clean_text(text):
    """Collapse every run of whitespace (newlines included) into one space.

    Leading and trailing whitespace disappears as a consequence of the
    split/join round trip.
    """
    tokens = text.split()  # no-arg split treats "\n" like any other whitespace
    return " ".join(tokens)

from pathlib import Path

# Normalize whitespace in every reconstructed document.
grouped["text"] = grouped["text"].apply(clean_text)

# Translate the numeric `type` code into a readable label. Codes that are not
# in the mapping become NaN.
grouped["doc_type"] = grouped["type"].map({
    0: "statement",
    1: "minutes"
})

grouped = grouped.drop(columns=["type"])

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; on a fresh runtime the write would
# otherwise fail.
processed_file = Path("/content/data/processed/fomc_processed.csv")
processed_file.parent.mkdir(parents=True, exist_ok=True)
grouped.to_csv(processed_file, index=False)

Number of reconstructed documents: 378


In [19]:
# Preview the first 500 characters of the first document in `grouped`.
grouped["text"].iloc[0][:500]

'The Federal Reserve Board and the Federal Open Market Committee on Wednesday released the attached minutes of the Committee meeting held on December 16-17, 2014. A summary of economic projections made by Federal Reserve Board members and Reserve Bank presidents for the meeting is also included as an addendum to these minutes. The minutes for each regularly scheduled meeting of the Committee ordinarily are made available three weeks after the day of the policy decision and subsequently are publis'

In [23]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path

# Federal Reserve RSS feed whose items are scraped as speeches below.
RSS_URL = "https://www.federalreserve.gov/feeds/speeches.xml"
# Destination CSV for the scraped speeches (relative to the working directory).
OUTPUT_FILE = Path("data") / "raw" / "fed_speeches_raw.csv"


def scrape_full_speech(link, timeout=30, min_chars=1000):
    """Download one speech page and return the text of its paragraphs.

    Args:
        link: URL of the speech page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection can block the scrape indefinitely.
        min_chars: A container's text is accepted only if it is longer than
            this many characters.

    Returns:
        The paragraph texts of the first matching container, joined by single
        spaces, or "" when the page cannot be fetched or no container holds
        enough text.
    """
    try:
        page = requests.get(link, timeout=timeout)
        page.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable page should not abort the whole scrape; report it
        # and let the caller skip this speech, as it does for any "" result.
        print(f"Skipping {link}: {exc}")
        return ""

    soup = BeautifulSoup(page.text, "html.parser")

    # Candidate containers, tried in this order.
    containers = [
        soup.find("div", id="article"),
        soup.find("div", class_="col-xs-12 col-sm-8 col-md-8"),
        soup.find("article"),
    ]

    for container in containers:
        if container is None:
            continue
        # get_text(strip=True) strips each text node and concatenates them
        # with no separator, gluing words together across inline tags such as
        # links or footnote markers. Normalizing whitespace on the raw text
        # keeps the original word boundaries.
        paragraphs = (" ".join(p.get_text().split()) for p in container.find_all("p"))
        text = " ".join(part for part in paragraphs if part)
        if len(text) > min_chars:
            return text

    return ""


def scrape_speeches_from_rss():
    """Scrape every speech listed in the RSS feed and save the result as CSV.

    Reads the feed at RSS_URL, fetches each item's page with
    scrape_full_speech, keeps the items that produced text, and writes them to
    OUTPUT_FILE with the columns date, text, doc_type and source_url.

    Returns:
        The DataFrame that was written (possibly empty).

    Raises:
        requests.RequestException: if the feed itself cannot be fetched or
            answers with an HTTP error status.
    """
    print("Fetching RSS feed...")
    # Bound the wait and fail loudly on an HTTP error instead of parsing an
    # error page and silently saving an empty file.
    r = requests.get(RSS_URL, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "xml")

    items = soup.find_all("item")
    print(f"Found {len(items)} speeches")

    records = []

    for item in items:
        # Skip malformed entries rather than failing on a missing <link>.
        if item.link is None:
            continue
        link = item.link.text.strip()
        date = item.pubDate.text.strip() if item.pubDate is not None else None

        print("Scraping:", link)

        text = scrape_full_speech(link)

        if text:
            records.append({
                "date": date,
                "text": text,
                "doc_type": "speech",
                "source_url": link
            })

    # Fix the column set explicitly so that an empty result still produces a
    # CSV with a header row that pandas can read back.
    df = pd.DataFrame(records, columns=["date", "text", "doc_type", "source_url"])
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUTPUT_FILE, index=False)

    print(f"Saved {len(df)} speeches")
    return df


# In a notebook kernel __name__ is "__main__", so the scrape runs when this
# cell is executed; the guard keeps it from running if the code is imported
# as a module instead.
if __name__ == "__main__":
    scrape_speeches_from_rss()


Fetching RSS feed...
Found 15 speeches
Scraping: https://www.federalreserve.gov/newsevents/speech/jefferson20260206a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/cook20260204a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/bowman20260130a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/waller20260130a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/jefferson20260116a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/bowman20260116a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/miran20260114a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/powell20260111a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/bowman20260107a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/miran20251215a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/powell20251201a.htm
Scraping: https://www.federalreserve.gov/newsevents/speech/jefferson20251121a.htm
Scraping