In [3]:
import time
from typing import List, Tuple
import yfinance as yf
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyarrow import Table
from pyarrow.parquet import write_table
from requests.exceptions import RequestException, Timeout
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By
import pandas_datareader.data as web
from datetime import datetime

## 1. Scraping US Central Bank Announcements

In [9]:
def get_speech_metadata(url: str,
                        wait_time: int,
                        page_limit: int) -> Tuple[List[str], List[str], List[str], List[str], List[str]]:
    """

    """

    # Selenium initialization
    driver = webdriver.Chrome()

    # Open a webpage (necessary to load JavaScript content)
    driver.get(url)
    driver.implicitly_wait(wait_time)

    # Going to the next page
    event_dates, titles, urls, speakers, locations = [], [], [], [], []
    page_count = 1

    while page_count <= page_limit:
        # Get/parse data
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')

        # Get lists with tags
        events = soup.find_all('div', class_='col-xs-9 col-md-10 eventlist__event')
        dates = soup.find_all('div', class_='col-xs-3 col-md-2 eventlist__time')

        # Log page processing
        print(f'* Processing page # {page_count} / {page_limit}')

        # Extract data from tags
        for event, date in zip(events, dates):
            # Get date
            date_tag = date.find('time', class_='itemDate ng-binding')
            dmy = date_tag.text.strip() if date_tag else 'N/A'

            # Get event metadata
            title_tag = event.find('a', class_='ng-binding')
            title = title_tag.text.strip() if title_tag else 'N/A'
            url = title_tag['href'] if title_tag and 'href' in title_tag.attrs else 'N/A'

            speaker_tag = event.find('p', class_='news__speaker ng-binding')
            speaker = speaker_tag.text.strip() if speaker_tag else 'N/A'

            location_tag = event.find('p', class_='result__location ng-binding')
            location = location_tag.text.strip() if location_tag else 'N/A'

            # Append data tags
            event_dates.append(dmy)
            titles.append(title)
            urls.append(url)
            speakers.append(speaker)
            locations.append(location)

        # Go to the next page
        try:
            next_button = driver.find_element(By.LINK_TEXT, 'Next')
            driver.execute_script('arguments[0].click();', next_button)  # next_button.click()
            time.sleep(2)
            page_count += 1
        except (NoSuchElementException, ElementNotInteractableException):
            print('No 'Next' button was found')

        # Log item processing per page
        print(f'    * Processed {len(titles)} items...')

    # Close the page
    driver.quit()

    return event_dates, titles, urls, speakers, locations


def get_speech_text(url_list: list, wait_time: int) -> list:
    """

    """

    speech_content = []

    for idx, url in enumerate(url_list):
        try:
            # Get data
            url = f'https://www.federalreserve.gov/{url}'
            page = requests.get(url, timeout=wait_time)

            # Validate the request
            page.raise_for_status()

            # Parse data
            soup = BeautifulSoup(page.content, 'html.parser')

            # Get speech text
            speech = soup.find_all('div', {'class': 'col-xs-12 col-sm-8 col-md-8'})[0].text
            speech_content.append(speech)

            # Log speech processing
            if idx % 100 == 0:
                print(f'* Processing speech # {idx} / {len(url_list)}')
        except Timeout:
            print('The request timed out')
        except RequestException as e:
            print(f'An error occurred: {e}')
        except ValueError as e:
            print(f'Content error: {e}')

    return speech_content

In [10]:
url = 'https://www.federalreserve.gov/newsevents/speeches.htm'
speech_dates, speech_titles, speech_urls, speech_speakers, _ = get_speech_metadata(url=url, wait_time=10, page_limit=57)

* Processing page # 1 / 57
    * Processed 0 items...
* Processing page # 2 / 57
    * Processed 20 items...
* Processing page # 3 / 57
    * Processed 40 items...
* Processing page # 4 / 57
    * Processed 60 items...
* Processing page # 5 / 57
    * Processed 80 items...
* Processing page # 6 / 57
    * Processed 100 items...
* Processing page # 7 / 57
    * Processed 120 items...
* Processing page # 8 / 57
    * Processed 140 items...
* Processing page # 9 / 57
    * Processed 160 items...
* Processing page # 10 / 57
    * Processed 180 items...
* Processing page # 11 / 57
    * Processed 200 items...
* Processing page # 12 / 57
    * Processed 220 items...
* Processing page # 13 / 57
    * Processed 240 items...
* Processing page # 14 / 57
    * Processed 260 items...
* Processing page # 15 / 57
    * Processed 280 items...
* Processing page # 16 / 57
    * Processed 300 items...
* Processing page # 17 / 57
    * Processed 320 items...
* Processing page # 18 / 57
    * Processed 34

In [11]:
speech_text = get_speech_text(url_list=speech_urls, wait_time=10)

speech_us_central_bank = {
    'speech_date': speech_dates,
    'speech_title': speech_titles,
    'speech_url': speech_urls,
    'speech_speaker': speech_speakers,
    'speech_text': speech_text
}

* Processing speech # 0 / 1113
* Processing speech # 100 / 1113
* Processing speech # 200 / 1113
* Processing speech # 300 / 1113
* Processing speech # 400 / 1113
* Processing speech # 500 / 1113
* Processing speech # 600 / 1113
* Processing speech # 700 / 1113
* Processing speech # 800 / 1113
* Processing speech # 900 / 1113
* Processing speech # 1000 / 1113
* Processing speech # 1100 / 1113


In [None]:
with open('.\\data\\speech_us_central_bank.parquet', 'wb') as handle:
    write_table(Table.from_pandas(pd.DataFrame(speech_us_central_bank)), handle, compression='GZIP')

## 2. Macroeconomic Indicators

In [15]:
# Date range
start_date = datetime(2006, 1, 1)
end_date = datetime(2024, 12, 31)

# 1. Macroeconomic Activity and Growth
gdp_data = web.DataReader('GDP', 'fred', start_date, end_date)  # US GDP (quarterly)
unemployment_data = web.DataReader('UNRATE', 'fred', start_date, end_date)  # US unemployment rate (monthly)

# 2. Inflation and Price Levels
cpi_data = web.DataReader('CPIAUCSL', 'fred', start_date, end_date)  # US Consumer Price Index (CPI), i.e. a measure of inflation (monthly)

# 3. Monetary Policy and Interest Rates
treasury_yield_data = yf.download('^TNX', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': '10Y_Treasury_Yield'})  # US 10-Year Treasury Yield, i.e. debt obligations (daily)
fed_funds_rate = web.DataReader('FEDFUNDS', 'fred', start_date, end_date)  # Federal Funds Rate (daily)

# 4. Market Sentiment and Risk
vix_data = yf.download('^VIX', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'VIX'})  # US Volatility Index, i.e. 'Fear Gauge' (daily)
consumer_sentiment = web.DataReader('UMCSENT', 'fred', start_date, end_date)  # Consumer Sentiment (monthly)

# 5. Currency and Exchange Rate
dxy_data = yf.download('DX-Y.NYB', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'DXY'})  # US Dollar Index (DXY) (daily)

# 6. Housing Market Data
home_price_index = web.DataReader('CSUSHPINSA', 'fred', start_date, end_date)  # US Housing Price Index (monthly)

# Commodities Prices (daily)
oil_data = yf.download('CL=F', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'Oil'})
gold_data = yf.download('GC=F', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'Gold'})
natural_gas_data = yf.download('NG=F', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'Natural_Gas'})
copper_data = yf.download('HG=F', start=start_date, end=end_date, interval='1d')[['Close']].rename(columns={'Close': 'Copper'})

# Merge all datasets
merged_data = (treasury_yield_data \
    .merge(gdp_data, how='outer', left_index=True, right_index=True)
    .merge(unemployment_data, how='outer', left_index=True, right_index=True)
    .merge(cpi_data, how='outer', left_index=True, right_index=True)
    .merge(fed_funds_rate, how='outer', left_index=True, right_index=True)
    .merge(vix_data, how='outer', left_index=True, right_index=True)
    .merge(consumer_sentiment, how='outer', left_index=True, right_index=True)
    .merge(dxy_data, how='outer', left_index=True, right_index=True)
    .merge(home_price_index , how='outer', left_index=True, right_index=True)
    .merge(oil_data , how='outer', left_index=True, right_index=True)
    .merge(gold_data , how='outer', left_index=True, right_index=True)
    .merge(natural_gas_data , how='outer', left_index=True, right_index=True)
    .merge(copper_data , how='outer', left_index=True, right_index=True))

# Forward-filling NaNs
merged_data = merged_data.fillna(method='ffill').dropna().reset_index().rename(columns={'index': 'date'})

# Recover weekends
date_range = pd.date_range(start=merged_data['date'].min(), end=merged_data['date'].max())
merged_data.set_index('date', inplace=True)
merged_data = merged_data.reindex(date_range)
merged_data.index.name = 'date'
merged_data.reset_index(inplace=True)

# Fill missing data
merged_data.fillna(method='ffill', inplace=True)

[*********************100%%**********************]  1 of 1 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  1 of 1 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  1 of 1 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  1 of 1 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  1 of 1 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  1 of 1 completed

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')



[*********************100%%**********************]  1 of 1 completed


  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  merged_data = merged_data.fillna(method='ffill').dropna().reset_index().rename(columns={'index': 'date'})
  merged_data.fillna(method='ffill', inplace=True)


In [18]:
with open('..\\data\\us_macroeconomic_indicators.parquet', 'wb') as handle:
    write_table(Table.from_pandas(merged_data), handle, compression='GZIP')