In [1]:
from urllib import request
from bs4 import BeautifulSoup
import nltk, re, pprint
from nltk import word_tokenize, sent_tokenize
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import stanza
from finbert_embedding.embedding import FinbertEmbedding
from scipy.spatial.distance import cosine

In [2]:
# url = 'https://www.federalreserve.gov/monetarypolicy/fomcminutes20210127.htm'
# html = request.urlopen(url).read().decode('utf8')
# raw = BeautifulSoup(html, 'html.parser').get_text()
# nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')
# stanza.download('en')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sarahwang688/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sarahwang688/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/sarahwang688/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
text_raw_data = pd.read_csv('fed_speeches_1996_2020.csv')
text_raw_data = text_raw_data.loc[text_raw_data.year>2010]
# text_str = text_raw_data.text.str.cat()
# text_str = text_str.lower()
# tokens = sent_tokenize(text_str)


In [4]:
# pd.DataFrame(tokens[:2])

In [4]:
sentence_df = pd.DataFrame()
for index, row in text_raw_data.iterrows():
    tokens = sent_tokenize(row.text)
    tokens_df = pd.DataFrame(tokens)
    tokens_df.columns = ['sentence']
    tokens_df['date'] = row.date
    sentence_df = pd.concat([sentence_df, tokens_df], axis = 0)
sentence_df.sentence = sentence_df.sentence.str.lower()

In [5]:

sentence_df = sentence_df[~sentence_df.sentence.str.contains('return to text')]
sentence_df = sentence_df.reset_index(drop=True)
sentence_df

Unnamed: 0,sentence,date
0,good morning.,20111129.0
1,"i'm delighted to return ""home"" to the federal ...",20111129.0
2,in my remarks this morning i will underscore t...,20111129.0
3,the global economy is facing critical challenges.,20111129.0
4,the recovery in the united states and other ad...,20111129.0
...,...,...
62874,the bank's ratio would be compared to a minimu...,20200108.0
62875,the minimum activity threshold would be calibr...,20200108.0
62876,board of governors of the federal reserve syst...,20200108.0
62877,"lael brainard, ""the community reinvestment act...",20200108.0


In [13]:
sentence_df = sentence_df[sentence_df.sentence.str.split().str.len()>8]

In [14]:
finbert = FinbertEmbedding()

In [8]:
# word_embedding = finbert.word_vector(tokens[0])
# sentence_embedding = finbert.sentence_vector(tokens[0])
# sentence_embedding

In [9]:
# word_embedding

In [15]:
aspects_list = ['inflation', 'growth', 'employment','unemployment', 'economic'] # interest rate
aspects = ' '.join(aspects_list)

In [36]:
aspect_embedding = finbert.word_vector(aspects)
len(aspect_embedding[0])

768

In [12]:
# sentence_embedding

In [17]:
def calc_cos_distance(sentence_embedding):
    cos_list = []
    for i in np.arange(len(aspects_list)):
        cos = 1-cosine(sentence_embedding, aspect_embedding[i])
        cos_list = cos_list + [cos]
    print(cos_list)
    return cos_list

In [18]:
cos_similiary = sentence_df.copy()
for aspect in aspects_list:
    cos_similiary[aspect] = np.nan
cos_similiary

Unnamed: 0,sentence,date,inflation,growth,employment,unemployment,economic
1,"i'm delighted to return ""home"" to the federal ...",20111129.0,,,,,
2,in my remarks this morning i will underscore t...,20111129.0,,,,,
4,the recovery in the united states and other ad...,20111129.0,,,,,
5,there have also been clear signs of slowing gr...,20111129.0,,,,,
6,"in effect, we face a dearth of aggregate deman...",20111129.0,,,,,
...,...,...,...,...,...,...,...
62874,the bank's ratio would be compared to a minimu...,20200108.0,,,,,
62875,the minimum activity threshold would be calibr...,20200108.0,,,,,
62876,board of governors of the federal reserve syst...,20200108.0,,,,,
62877,"lael brainard, ""the community reinvestment act...",20200108.0,,,,,


In [20]:
# aspects_list = ['inflation', 'growth', 'employment','unemployment', 'economic'] 
for index, row in cos_similiary.iterrows():
    print(index)
    sentence_embedding = finbert.sentence_vector(row.sentence)
    cos = calc_cos_distance(sentence_embedding)
    cos_similiary.loc[index, 'inflation'] = cos[0]
    cos_similiary.loc[index, 'growth'] = cos[1]
    cos_similiary.loc[index, 'employment'] = cos[2]
    cos_similiary.loc[index, 'unemployment'] = cos[3]
    cos_similiary.loc[index, 'economic'] = cos[4]
cos_similiary.to_csv('cos_similarity_sent_embed_0702.csv')

1
[0.2825271189212799, 0.3457186818122864, 0.33066326379776, 0.3148365318775177, 0.25084686279296875]
2
[0.28591930866241455, 0.3580319285392761, 0.32202276587486267, 0.31155917048454285, 0.2444530874490738]
4
[0.33045071363449097, 0.40473052859306335, 0.4229705035686493, 0.3944837152957916, 0.3078695237636566]
5
[0.2980571985244751, 0.37940752506256104, 0.32311302423477173, 0.29225483536720276, 0.26604342460632324]
6
[0.3548320531845093, 0.4184200167655945, 0.3778090476989746, 0.3781068027019501, 0.31464916467666626]
8
[0.33247366547584534, 0.38836735486984253, 0.3410789370536804, 0.3402951955795288, 0.2747322618961334]
9
[0.3423345983028412, 0.4216761589050293, 0.3994542062282562, 0.37852656841278076, 0.3489871919155121]
10
[0.28688254952430725, 0.3502727150917053, 0.31716594099998474, 0.2993454933166504, 0.2288358360528946]
11
[0.3162265121936798, 0.38983750343322754, 0.35873591899871826, 0.3381117880344391, 0.29418155550956726]
12
[0.3274323046207428, 0.41986143589019775, 0.3647466

In [21]:
cos_similiary


Unnamed: 0,sentence,date,inflation,growth,employment,unemployment,economic
1,"i'm delighted to return ""home"" to the federal ...",20111129.0,0.282527,0.345719,0.330663,0.314837,0.250847
2,in my remarks this morning i will underscore t...,20111129.0,0.285919,0.358032,0.322023,0.311559,0.244453
4,the recovery in the united states and other ad...,20111129.0,0.330451,0.404731,0.422971,0.394484,0.307870
5,there have also been clear signs of slowing gr...,20111129.0,0.298057,0.379408,0.323113,0.292255,0.266043
6,"in effect, we face a dearth of aggregate deman...",20111129.0,0.354832,0.418420,0.377809,0.378107,0.314649
...,...,...,...,...,...,...,...
62874,the bank's ratio would be compared to a minimu...,20200108.0,0.284297,0.361377,0.320891,0.314664,0.253812
62875,the minimum activity threshold would be calibr...,20200108.0,0.326489,0.386329,0.332340,0.351731,0.271299
62876,board of governors of the federal reserve syst...,20200108.0,0.308902,0.361910,0.336381,0.346712,0.306534
62877,"lael brainard, ""the community reinvestment act...",20200108.0,0.309505,0.386816,0.385695,0.375667,0.314872


In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

In [23]:
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [24]:
from models.finBERT.finbert.finbert import predict

In [25]:
for index, row in cos_similiary.iterrows():
    
    max_aspect_value = row[aspects_list]
    aspect = max_aspect_value.loc[max_aspect_value==max_aspect_value.max()].index[0]
    cos_similiary.loc[index, 'aspect'] = aspect

In [42]:
test = cos_similiary.head(2)

In [57]:
max_aspect_value.loc[max_aspect_value==max_aspect_value.max()].index[0]

'growth'

In [30]:
cos_similiary.loc[cos_similiary.aspect=='inflation']

Unnamed: 0,sentence,date,inflation,growth,employment,unemployment,economic,aspect
4322,"consequently, we are paying close attention to...",20110411.0,0.404405,0.402377,0.345789,0.364183,0.297142,inflation
18566,washington: board of governors of the federal ...,20130301.0,0.30526,0.298278,0.278803,0.301602,0.258431,inflation
25370,washington: board of governors of the federal ...,20151203.0,0.26211,0.260678,0.243311,0.260601,0.227598,inflation


In [31]:
grouped = cos_similiary.groupby('aspect')
for name, group in grouped:
    group.sentence.to_csv(f'{name}_0702.txt', header=None, index=None, sep=' ', mode='a')

In [32]:
with open('growth_0702.txt', 'r') as f:
    text = f.read()

In [33]:
predict(text, model, write_to_csv = True, path = 'sent_embedding_growth_sentiment_0702.csv')

Unnamed: 0,sentence,logit,prediction,sentiment_score
0,"""i'm delighted to return """"home"""" to the feder...","[0.59031725, 0.01745192, 0.3922308]",positive,0.572865
1,"""in my remarks this morning i will underscore ...","[0.919799, 0.011976549, 0.06822443]",positive,0.907822
2,"""there have also been clear signs of slowing g...","[0.017191192, 0.96347314, 0.019335713]",negative,-0.946282
3,"""in effect, we face a dearth of aggregate dema...","[0.04523633, 0.76017135, 0.19459234]",negative,-0.714935
4,"""central banks in a number of advanced economi...","[0.16111232, 0.3384518, 0.5004358]",neutral,-0.177339
...,...,...,...,...
48902,"""the bank's ratio would be compared to a minim...","[0.098906234, 0.016295958, 0.8847978]",neutral,0.082610
48903,"""the minimum activity threshold would be calib...","[0.042753227, 0.017734447, 0.9395124]",neutral,0.025019
48904,"""board of governors of the federal reserve sys...","[0.086596705, 0.015615083, 0.8977882]",neutral,0.070982
48905,"""lael brainard, """"the community reinvestment a...","[0.101868406, 0.016642699, 0.8814889]",neutral,0.085226
