In [1]:
import requests
import os
import PyPDF2
import re
import nltk
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import yfinance as yf 

from nltk.tokenize import sent_tokenize
# Download the sentence tokenizer model if you haven't already
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Download the Transcripts of Press Conference PDF files from FOMC

I pasted the links to download in a 'transcript_links.txt' file and iterate through them to download the pdfs for the Press Releases (there is only PDF versions available on the website)

I have obtained a total of 36 documents of Press Realeases from the FOMC from 2018~2022

Meeting Calendars, statements and Minutes (2018-2024):

https://www.federalreserve.gov/monetarypolicy/fomccalendars.htm


Tanscripts and other historical materials: 

https://www.federalreserve.gov/monetarypolicy/fomc_historical.htm

Example site:

https://www.federalreserve.gov/monetarypolicy/fomcpresconf20230201.htm

In [69]:
# Create a directory to store the downloaded PDFs
output_dir = 'fomc_pressconf_raw'
os.makedirs(output_dir, exist_ok=True)

# Open the url_links file
with open('transcripts_links.txt', 'r') as file:
    # Read and process each line
    for line in file:
        # Remove leading and trailing whitespace (e.g., newline characters)
        url = line.strip()
        file_name = url.split('/')[-1]

        # Download the PDF file
        response = requests.get(url)
        if response.status_code == 200:
            with open(os.path.join(output_dir, file_name), 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {file_name}")
        else:
            print(f"Failed to download {file_name}")

print("Download completed.")

Downloaded FOMCpresconf20181219.pdf
Downloaded FOMCpresconf20180926.pdf
Downloaded FOMCpresconf20180613.pdf
Downloaded FOMCpresconf20180321.pdf
Downloaded FOMCpresconf20191211.pdf
Downloaded FOMCpresconf20191030.pdf
Downloaded FOMCpresconf20190918.pdf
Downloaded FOMCpresconf20190731.pdf
Downloaded FOMCpresconf20190619.pdf
Downloaded FOMCpresconf20190501.pdf
Downloaded FOMCpresconf20190320.pdf
Downloaded FOMCpresconf20190130.pdf
Downloaded FOMCpresconf20201216.pdf
Downloaded FOMCpresconf20201105.pdf
Downloaded FOMCpresconf20200916.pdf
Downloaded FOMCpresconf20200729.pdf
Downloaded FOMCpresconf20200610.pdf
Downloaded FOMCpresconf20200429.pdf
Downloaded FOMCpresconf20200315.pdf
Downloaded FOMCpresconf20200129.pdf
Downloaded FOMCpresconf20211215.pdf
Downloaded FOMCpresconf20211103.pdf
Downloaded FOMCpresconf20210922.pdf
Downloaded FOMCpresconf20210728.pdf
Downloaded FOMCpresconf20210616.pdf
Downloaded FOMCpresconf20210428.pdf
Downloaded FOMCpresconf20210317.pdf
Downloaded FOMCpresconf20210

## Download Press Realeases (Meeting Statement) from FOMC

In [73]:
# Create a directory to store the downloaded PDFs
output_dir = 'fomc_pressrelease_raw'
os.makedirs(output_dir, exist_ok=True)

# Open the url_links file
with open('pressrelease_links.txt', 'r') as file:
    # Read and process each line
    for line in file:
        # Remove leading and trailing whitespace (e.g., newline characters)
        url = line.strip()
        file_name = url.split('/')[-1]

        # Download the PDF file
        response = requests.get(url)
        if response.status_code == 200:
            with open(os.path.join(output_dir, file_name), 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {file_name}")
        else:
            print(f"Failed to download {file_name}")

print("Download completed.")

Downloaded monetary20180131a1.pdf
Downloaded monetary20180321a1.pdf
Downloaded monetary20180502a1.pdf
Downloaded monetary20180613a1.pdf
Downloaded monetary20180801a1.pdf
Downloaded monetary20180926a1.pdf
Downloaded monetary20181108a1.pdf
Downloaded monetary20181219a1.pdf
Downloaded monetary20190130a1.pdf
Downloaded monetary20190320a1.pdf
Downloaded monetary20190501a1.pdf
Downloaded monetary20190619a1.pdf
Downloaded monetary20190731a1.pdf
Downloaded monetary20190918a1.pdf
Downloaded monetary20191030a1.pdf
Downloaded monetary20191211a1.pdf
Downloaded monetary20200129a1.pdf
Downloaded monetary20200323a1.pdf
Downloaded monetary20200429a1.pdf
Downloaded monetary20200610a1.pdf
Downloaded monetary20200729a1.pdf
Downloaded monetary20200916a1.pdf
Downloaded monetary20201105a1.pdf
Downloaded monetary20201216a1.pdf
Downloaded monetary20210127a1.pdf
Downloaded monetary20210317a1.pdf
Downloaded monetary20210428a1.pdf
Downloaded monetary20210616a1.pdf
Downloaded monetary20210728a1.pdf
Downloaded mon

In [71]:
url = "https://www.federalreserve.gov/newsevents/pressreleases/monetary20180131a.htm"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    # Extract all the paragraphs from the page
    paras = soup.find_all('p')

    # Create a string to store the extracted content
    extracted_content = ""

    for para in paras:
        extracted_content += para.get_text() 
    # Define the file name to save the extracted content
    output_file = 'extracted_content.txt'

    # Write the extracted content to a text file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(extracted_content)
    
    print(f'Content has been extracted and saved to {output_file}')
else:
    print('Failed to retrieve the web page. Status code:', response.status_code)

Content has been extracted and saved to extracted_content.txt


In [72]:
# Testing 
output_directory = 'fomc_pressrelease_raw'
os.makedirs(output_directory, exist_ok=True)

url = "https://www.federalreserve.gov/monetarypolicy/files/monetary20220126a1.pdf"
response = requests.get(url)
if response.status_code == 200:
    with open(os.path.join(output_directory, file_name), 'wb') as f:
        f.write(response.content)
    print(f"Downloaded {file_name}")
else:
    print(f"Failed to download {file_name}")

Downloaded FOMCpresconf20220126.pdf


## Extract & Preprocess text from PDF files

In [65]:
# Function to extract text from a PDF file and save it as a .txt file
def extract_text_from_pdf(pdf_file_path):
    try:
        # Open the PDF file in binary mode for reading
        with open(pdf_file_path, 'rb') as pdf_file:
            # Create a PDF reader object
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Initialize an empty string to store the extracted text
            text = ""

            # Loop through each page and extract text
            for page_number in range(len(pdf_reader.pages)):
                page = pdf_reader.pages[page_number]
                text += page.extract_text()
            return text 
        
    except Exception as e:
        print(f"An error occurred: {e}")

In [74]:
# Preprocess for transcripts of press conferences 

base_path = r"E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_raw"
output_path = r"E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned"

for filename in os.listdir('fomc_pressconf_raw'):
    pdf_file_path = os.path.join(base_path, filename)
    text = extract_text_from_pdf(pdf_file_path)
    # Remove extra spaces and replace multiple spaces with a single space
    text = ' '.join(text.split())
    
    # Tokenize the input text into sentences
    sentences = sent_tokenize(text)
    # Store 
    txt_file_name = filename[:-4] + ".txt"
    txt_file_path = os.path.join(output_path, txt_file_name)
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        for sentence in sentences:
            txt_file.write(sentence.strip() + '\n')
    print(f"Text Extracted and saved to {txt_file_path}")

Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20180321.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20180613.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20180926.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20181219.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20190130.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20190320.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20190501.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20190619.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20190731.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressconf_cleaned\FOMCpresconf20190918.txt


In [80]:
## preprocess press releases 
base_path = r"E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_raw"
output_path = r"E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned"

for filename in os.listdir('fomc_pressrelease_raw'):
    pdf_file_path = os.path.join(base_path, filename)
    text = extract_text_from_pdf(pdf_file_path)
    # Remove extra spaces and replace multiple spaces with a single space
    text = ' '.join(text.split())
    
    # Tokenize the input text into sentences
    sentences = sent_tokenize(text)
    # Store 
    txt_file_name = filename[:-6] + ".txt"
    txt_file_path = os.path.join(output_path, txt_file_name)
    with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        for sentence in sentences:
            txt_file.write(sentence.strip() + '\n')
    print(f"Text Extracted and saved to {txt_file_path}")

Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20180131.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20180321.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20180502.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20180613.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20180801.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20180926.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20181108.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20181219.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20190130.txt
Text Extracted and saved to E:\NYU\7871 NLP\NLP_HW2\fomc_pressrelease_cleaned\monetary20190320.txt
Text Extra

In [79]:
## preprocess press releases 
os.listdir("fomc_pressrelease_raw")[0][:-6]

'monetary20180131'

## Download US Treasury Data

In [2]:
treasury_2y_ticker = "^IRX"
treasury_10y_ticker = "^TNX"

In [3]:
start_date = "2018-01-31"
end_date = "2023-01-01"

treasury_2y_data = yf.download(treasury_2y_ticker, start=start_date, end=end_date)
treasury_10y_data = yf.download(treasury_10y_ticker, start=start_date, end=end_date)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [4]:
treasury_10y_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-31,2.713,2.754,2.698,2.72,2.72,0
2018-02-01,2.743,2.776,2.722,2.773,2.773,0
2018-02-02,2.786,2.854,2.786,2.854,2.854,0
2018-02-05,2.834,2.862,2.794,2.794,2.794,0
2018-02-06,2.738,2.787,2.736,2.768,2.768,0


In [6]:
treasury_10y_data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-12-23,3.701,3.753,3.695,3.751,3.751,0
2022-12-27,3.787,3.862,3.787,3.86,3.86,0
2022-12-28,3.818,3.89,3.815,3.887,3.887,0
2022-12-29,3.868,3.886,3.818,3.835,3.835,0
2022-12-30,3.869,3.905,3.831,3.879,3.879,0


In [5]:
treasury_2y_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-31,1.428,1.443,1.42,1.43,1.43,0
2018-02-01,1.425,1.455,1.42,1.453,1.453,0
2018-02-02,1.445,1.455,1.445,1.45,1.45,0
2018-02-05,1.445,1.463,1.445,1.458,1.458,0
2018-02-06,1.498,1.498,1.478,1.488,1.488,0


### Attempt with FRED API

In [58]:
import fredapi as fa

In [59]:
# Enter my own FRED api_key directly
fred = fa.Fred(api_key = '0b8773b72996b749e1ed4d8590aeece2')

In [91]:
series_ids = ['GS2', 'GS10']

# Define the date range for which you want to download data
start_date = '2023-01-01'
end_date = '2023-01-31'

# Create an empty DataFrame to store the data
yield_data = pd.DataFrame()

# Download data for each series and append it to the DataFrame
for series_id in series_ids:
    data = fred.get_series(series_id, start_date, end_date)
    yield_data[series_id] = data

yield_data 

Unnamed: 0,GS2,GS10
2023-01-01,4.21,3.53


In [61]:
startDate = '2018-01-01'
endDate = '2022-12-31'
df = []
ids = ['GS{}'.format(i) for i in [1,2,5,10]]
for s in ids:
    df.append(fred.get_series(s,observation_start=startDate, observation_end=endDate)/100)

df = pd.concat(df,axis=1)
df.columns = ids
df = df.dropna()
df.head()

Unnamed: 0,GS1,GS2,GS5,GS10
2018-01-01,0.018,0.0203,0.0238,0.0258
2018-02-01,0.0196,0.0218,0.026,0.0286
2018-03-01,0.0206,0.0228,0.0263,0.0284
2018-04-01,0.0215,0.0238,0.027,0.0287
2018-05-01,0.0227,0.0251,0.0282,0.0298


In [81]:
df['Yield_Spread'] = df['GS10'] - df['GS2']
df.head()

Unnamed: 0,GS1,GS2,GS5,GS10,Yield_Spread
2018-01-01,0.018,0.0203,0.0238,0.0258,0.0055
2018-02-01,0.0196,0.0218,0.026,0.0286,0.0068
2018-03-01,0.0206,0.0228,0.0263,0.0284,0.0056
2018-04-01,0.0215,0.0238,0.027,0.0287,0.0049
2018-05-01,0.0227,0.0251,0.0282,0.0298,0.0047


In [90]:
df.to_csv('US_Treasury_prices.csv')

In [87]:
df_returns = df.copy()
df_returns = df/df.shift(1) - 1
df_returns.dropna(inplace=True)
df_returns.head()

Unnamed: 0,GS1,GS2,GS5,GS10,Yield_Spread
2018-02-01,0.088889,0.073892,0.092437,0.108527,0.236364
2018-03-01,0.05102,0.045872,0.011538,-0.006993,-0.176471
2018-04-01,0.043689,0.04386,0.026616,0.010563,-0.125
2018-05-01,0.055814,0.054622,0.044444,0.038328,-0.040816
2018-06-01,0.026432,0.007968,-0.014184,-0.02349,-0.191489


In [89]:
df_returns.to_csv('US_Treasury_returns.csv')