# 크롤링

In [25]:
import requests
from bs4 import BeautifulSoup

# Old-layout FOMC minutes page (date code 20060629).
url = "https://www.federalreserve.gov/fomc/minutes/20060629.htm"
# Table cells reached through direct table > tr > td nesting.
selector = "table > tr > td"

# A timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() fails loudly instead of parsing an HTTP error page.
html = requests.get(url, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, "lxml")
# List of every element matching the selector, in document order.
txt = soup.select(selector)


In [37]:
# Show the text content of the eighth selected table cell.
print(txt[7].get_text())


By unanimous vote, the Committee approved a "Report and Plan of the Federal Open Market Committee to Improve FOIA Operations" and approved a delegation of authority to the Chairman (or his designee) to take actions required under the Freedom of Information Act.  

The Manager of the System Open Market Account reported on recent developments in foreign exchange markets.  There were no open market operations in foreign currencies for the System's account in the period since the previous meeting.  The Manager also reported on developments in domestic financial markets and on System open market transactions in government securities and federal agency obligations during the period since the previous meeting.  By unanimous vote, the Committee ratified these transactions.  

The information reviewed at the June meeting suggested that the growth of economic activity in the second quarter slowed substantially from its rapid first-quarter pace.  The expansion of consumer spending softened, 

In [38]:
# Old-layout FOMC minutes page (date code 20041110).
url = "https://www.federalreserve.gov/fomc/minutes/20041110.htm"
# Table cells reached through direct table > tr > td nesting.
selector = "table > tr > td"

# A timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() fails loudly instead of parsing an HTTP error page.
html = requests.get(url, timeout=30)
html.raise_for_status()
# NOTE(review): the recorded output of this page contains garbled characters
# ("Systemï¿½s"); the response encoding probably needs to be set explicitly,
# as FOMC_crawling does -- confirm the page's real charset before relying on
# this text.
soup = BeautifulSoup(html.text, "lxml")
# List of every element matching the selector, in document order.
txt = soup.select(selector)


In [42]:
# Show the text content of the eighth selected table cell.
print(txt[7].get_text())


By unanimous vote, the minutes of the meeting of the Federal Open Market Committee held on September 21, 2004, were approved.
By unanimous vote, the Federal Open Market Committee approved the selection of Deborah J. Danker as Deputy Secretary of the Committee to serve until the selection of a successor at the first regularly scheduled meeting after December 31, 2004.
The Manager of the System Open Market Account reported on recent developments in foreign exchange markets.  There were no open market operations in foreign currencies for the Systemï¿½s account in the period since the previous meeting.
The Manager reported on recent developments in domestic financial markets and on System open market transactions in government securities and securities issued or fully guaranteed by federal agencies during the period September 21, 2004, through November 9, 2004.  By unanimous vote, the Committee ratified these transactions.
The Manager also discussed the pressures on the federal funds rate

# 크롤링

In [3]:
import requests
from bs4 import BeautifulSoup

def FOMC_crawling(date: str, timeout: float = 30.0) -> str:
    """
    Crawl the minutes of one FOMC meeting and return the main body text.

    The page ``fomcminutes{date}.htm`` is downloaded and the text of its
    ``#article`` element is extracted. The returned text starts at the first
    known opening heading found (checked in priority order) and stops just
    before the underscore separator line, if the page contains one.

    Args:
        date (str): Meeting date code used in the URL, e.g. "20231213".
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: Body text from the opening heading (inclusive) up to the
            underscore separator (exclusive), or to the end of the article
            when no separator is present.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        IndexError: If the page has no ``#article`` element, or if none of
            the known opening headings occurs in the article text.
    """
    # ------------- Crawl the FOMC minutes ------------- #

    url = f"https://www.federalreserve.gov/monetarypolicy/fomcminutes{date}.htm"

    # Without a timeout a stalled connection would block forever; without the
    # status check an error page would be parsed as if it were the minutes.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = "UTF-8"  # prevent garbled characters

    article = BeautifulSoup(response.text, "lxml").select_one("#article")
    if article is None:
        raise IndexError(f"no '#article' element found at {url}")
    txt = article.text

    # ----- Keep only the text between the upper and the lower marker ----- #

    # Opening headings, in the order in which they are tried.
    upper_lines = (
        "Developments in Financial Markets and Open Market Operations",
        "Discussion of Financial Markets and Open Market Operations",
        "AUTHORIZATION FOR DOMESTIC OPEN MARKET OPERATIONS",
        "Developments in Financial Markets, Open Market Operations, and Policy Normalization",
        "Developments in Financial Markets and the Federal Reserve's Balance Sheet",
    )
    # Presumably the rule that precedes the page's trailing notes -- TODO confirm.
    lower_line = "_______________________"

    for upper_line in upper_lines:
        if upper_line in txt:
            # partition() keeps everything after the FIRST occurrence, so a
            # repeated heading no longer cuts the body short.
            main_txt = upper_line + txt.partition(upper_line)[2]
            break
    else:
        raise IndexError(
            f"none of the known opening headings was found in the minutes at {url}"
        )

    # Everything before the separator; the whole text when it is absent.
    return main_txt.partition(lower_line)[0]

if __name__ == "__main__":
    # Quick check: crawl one meeting and show the first 603 characters.
    test_txt = FOMC_crawling(date="20231213")
    print(test_txt[0:603])

Developments in Financial Markets and Open Market Operations 
The manager turned first to a review of developments in financial markets over the intermeeting period. Financial conditions eased, driven by a decline in interest rates, an increase in equity prices, and a depreciation in the dollar. The rise in equity prices was supported by the decline in Treasury yields and by earnings growth that exceeded consensus expectations. Implied volatility for equities diminished notably. The easing in financial conditions reversed some of the tightening that occurred over the summer and much of the fall.


In [4]:
# Meeting dates -> each one is used as the date code in a minutes URL
meeting_dates = ["20231213", "20231101", "20230920", "20230726", "20230614",
                 "20230503", "20230322", "20230201", "20221214", "20221102",
                 "20220921", "20220727", "20220615", "20220504", "20220316",
                 "20220126", "20211215", "20211103", "20210922", "20210728",
                 "20210616", "20210428", "20210317", "20210127", "20201216",
                 "20201105", "20200916", "20200729", "20200610", "20200429",
                 "20200315", "20200129", "20191211", "20191030", "20190918",
                 "20190731", "20190619", "20190501", "20190320", "20190130", # test: test from no. 40 onward
                 "20181219", "20181108", "20180926", "20180801", "20180613",
                 "20180502", "20180321", "20180131", "20171213", "20171101",
                 "20170920", "20170726", "20170614", "20170503", "20170315",
                 "20170201", "20161214", "20161102", "20160921", "20160727",
                 "20160615", "20160427", "20160316", "20160127", "20151216",
                 "20151028", "20150917", "20150729", "20150617", "20150429", # test from no. 65 onward
                 "20150318", "20150128", "20141217", "20141029", "20140917",
                 "20140730", "20140618", "20140430", "20140319", "20140129",
                 "20131218", "20131030", "20130918", "20130731", "20130619",
                 "20130501", "20130320", "20130130", "20121212", "20121024",
                 "20120913", "20120801", "20120620", "20120425", "20120313"]

# Number of meetings that will be crawled
print(len(meeting_dates))

95


In [5]:
# Store the body text of every document in a list
from tqdm.notebook import tqdm

doc_lst = []

# One crawl per meeting date, in the order of meeting_dates; tqdm only adds
# a progress bar around the iteration.
for date in tqdm(meeting_dates):
    txt = FOMC_crawling(date)
    doc_lst.append(txt)


  0%|          | 0/95 [00:00<?, ?it/s]

# 토큰화

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

def FOMC_tokenize(doc):
    """
    Tokenize the text of a document into sentences and then into words.

    Args:
        doc (str): Body text of a meeting's minutes.

    Returns:
        list: 2-D list holding one list of word tokens per sentence of ``doc``.
    """
    # Sentence-level split first, then a word-level split of every sentence.
    return list(map(word_tokenize, sent_tokenize(doc)))

if __name__ == "__main__":
    # Quick check: tokenize the first crawled document and show its first
    # three sentences.
    word_tokens = FOMC_tokenize(doc=doc_lst[0])
    print(word_tokens[0:3])
    



[['Developments', 'in', 'Financial', 'Markets', 'and', 'Open', 'Market', 'Operations', 'The', 'manager', 'turned', 'first', 'to', 'a', 'review', 'of', 'developments', 'in', 'financial', 'markets', 'over', 'the', 'intermeeting', 'period', '.'], ['Financial', 'conditions', 'eased', ',', 'driven', 'by', 'a', 'decline', 'in', 'interest', 'rates', ',', 'an', 'increase', 'in', 'equity', 'prices', ',', 'and', 'a', 'depreciation', 'in', 'the', 'dollar', '.'], ['The', 'rise', 'in', 'equity', 'prices', 'was', 'supported', 'by', 'the', 'decline', 'in', 'Treasury', 'yields', 'and', 'by', 'earnings', 'growth', 'that', 'exceeded', 'consensus', 'expectations', '.']]


In [7]:
# Store the tokenized body text of every document in a list
tokenized_doc_lst = []

# Each entry is the 2-D (sentence x word) token list of one document, in the
# same order as doc_lst.
for doc in tqdm(doc_lst):
    tokenized_doc = FOMC_tokenize(doc)
    tokenized_doc_lst.append(tokenized_doc)
    

  0%|          | 0/95 [00:00<?, ?it/s]

In [9]:
import itertools

# Sentence-level data: a 2-D list with one list of word tokens per sentence,
# obtained by dropping the per-document nesting
sentence_tokens = list(itertools.chain.from_iterable(tokenized_doc_lst))
# Word-level data: a flat 1-D list of every word token
word_tokens = list(itertools.chain.from_iterable(sentence_tokens))

# Preview the first two sentences of the first document, then the corpus
# sizes (documents, sentences, words)
print(tokenized_doc_lst[0][0:2])
print("------------------------------------------------------------")
print("문서 개수:", len(tokenized_doc_lst))
print("문장 개수:", len(sentence_tokens))
print("단어 개수:", len(word_tokens))


[['Developments', 'in', 'Financial', 'Markets', 'and', 'Open', 'Market', 'Operations', 'The', 'manager', 'turned', 'first', 'to', 'a', 'review', 'of', 'developments', 'in', 'financial', 'markets', 'over', 'the', 'intermeeting', 'period', '.'], ['Financial', 'conditions', 'eased', ',', 'driven', 'by', 'a', 'decline', 'in', 'interest', 'rates', ',', 'an', 'increase', 'in', 'equity', 'prices', ',', 'and', 'a', 'depreciation', 'in', 'the', 'dollar', '.']]
------------------------------------------------------------
문서 개수: 95
문장 개수: 25106
단어 개수: 778990


# Word2Vec

In [14]:
from gensim.models import Word2Vec

# Train word embeddings on the sentence-level token lists.
model = Word2Vec(
    sentences=sentence_tokens,
    vector_size=500,
    window=5,
    min_count=5,
    workers=4,
)

In [15]:
# Take the embedded word vectors from the trained model
word_vectors = model.wv

# Collect every word contained in the model
words = list(word_vectors.key_to_index)

# Print the first 20 words and the vocabulary size
print(words[0:20])
print(len(words))

['the', ',', '.', 'of', 'in', 'and', 'to', 'that', 'a', 'for', 'on', 'inflation', 'Committee', 'as', 'rate', 'was', 'The', 'at', 'participants', "'s"]
4340


In [16]:
# Take the embedded word vectors from the trained model
word_vectors = model.wv

# Look up the vector of one word and show its first 20 components
print(word_vectors["inflation"][0:20])

# Find the 10 most similar words
print(word_vectors.most_similar("inflation", topn=10))

[-0.5379805   0.3309589   0.34410438  0.66475385 -0.71483195 -1.1962577
  0.32823837 -0.22933301 -0.537966   -0.00320659 -0.08531485  0.18824793
 -0.63304794 -0.08955514  0.34325454  0.45615113 -0.37901092  0.24031207
  0.97446567  0.47564745]
[('longer-run', 0.5862677097320557), ('objective', 0.5525201559066772), ('stay', 0.5336916446685791), ('goal', 0.5179406404495239), ('running', 0.5096482038497925), ('Inflation', 0.5056201219558716), ('2', 0.4960033595561981), ('anchored', 0.48972374200820923), ('remain', 0.4884224832057953), ('run', 0.4850694239139557)]
