In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

In [2]:
import torch
import pandas as pd
import numpy as np
model.eval()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [3]:
sentences = ["Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros ."]

inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = model(**inputs)[0]

labels = {0:'neutral', 1:'positive', 2:'negative'}
for idx, sent in enumerate(sentences):
    print(sent, '\n----', labels[np.argmax(outputs.detach().numpy()[idx])], '----')
    print((outputs.detach().numpy()[idx]))
    

Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros . 
---- positive ----
[-5.8900237 10.140575  -8.0652   ]


In [4]:
benzinga_csv_path = "/app/StockPricePredictions/data/benzinga/aapl_non_dupes_text_only.csv"

In [5]:
df_benzinga = pd.read_csv(benzinga_csv_path, low_memory=False)

In [6]:
df_benzinga['date'] = pd.to_datetime(df_benzinga['date'])

In [7]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29077 entries, 0 to 29076
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    29077 non-null  datetime64[ns]
 1   text    29077 non-null  object        
dtypes: datetime64[ns](1), object(1)
memory usage: 454.5+ KB


In [8]:
df_benzinga.head(1)

Unnamed: 0,date,text
0,2010-01-02,Time to Sell Apple Puts In case you missed the...


In [9]:
headlines_array = np.array(df_benzinga)
headlines_list = list(headlines_array[:,1])
date_list = list(headlines_array[:, 0])

print(headlines_list[0])
print(date_list[0])

Time to Sell Apple Puts In case you missed the meteoric run in Apple  over the last several months, there may still be a way to profit without having to chase the stock at these lofty levels: selling puts. When you sell puts you in a sense become an insurance salesman. You agree to buy the stock at some point in the future, should the stock fall to that level or lower before a given date. For this agreement, you receive a premium up front.    With Apple currently trading at $210/share, just off its 52-week high, I propose selling out-of-the-money puts for April or July. The April $190 contract bids at $8.15 and the July $180 contract bids at $10.25.     I chose such long-dated contracts for their healty premiums. Just a reminder that the first rule of thumb with selling puts is to only sell puts on a stock you would be willing to own. Since I firmly believe in Apple's fundamentals and future business prospects, I would willingly buy the stock at $190 in April. However, nothing is certa

In [10]:
headlines_table = pd.DataFrame(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])

In [11]:
# headlines_table.to_csv("./apple_finbert_20100101_20220304.csv", index=False)


In [12]:
# headlines_table2 = pd.read_csv("./apple_finbert_20100101_20220304.csv")

In [13]:
# import torch

def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


STRIDE = 1

model.eval()

n=0
for headline, date in zip(chunk_list(headlines_list, STRIDE), chunk_list(date_list, STRIDE)):
  
  input = tokenizer(headline, padding = True, truncation = True,  return_tensors='pt', max_length=512)
  
  outputs = model(**input)

  prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)

  print(f"{n+1}/{int(len(headlines_list)/STRIDE)}") 

  for headline, date, pos, neg, neutr in zip(headline, date, prediction[:, 0].tolist(), prediction[:, 1].tolist(), prediction[:, 2].tolist() ): 
    # headlines_table.add_data(headline, stock, pos, neg, neutr)
    headlines_table.loc[  len(headlines_table)  ] = [headline, date, pos, neg, neutr]
   
  n+=1

  if n % 1000 == 0:
    # print(headlines_table.head())
    headlines_table.to_csv("./apple_finbert_20100101_20220304.csv", index=False)


1/29077
2/29077
3/29077
4/29077
5/29077
6/29077
7/29077
8/29077
9/29077
10/29077
11/29077
12/29077
13/29077
14/29077
15/29077
16/29077
17/29077
18/29077
19/29077
20/29077
21/29077
22/29077
23/29077
24/29077
25/29077
26/29077
27/29077
28/29077
29/29077
30/29077
31/29077
32/29077
33/29077
34/29077
35/29077
36/29077
37/29077
38/29077
39/29077
40/29077
41/29077
42/29077
43/29077
44/29077
45/29077
46/29077
47/29077
48/29077
49/29077
50/29077
51/29077
52/29077
53/29077
54/29077
55/29077
56/29077
57/29077
58/29077
59/29077
60/29077
61/29077
62/29077
63/29077
64/29077
65/29077
66/29077
67/29077
68/29077
69/29077
70/29077
71/29077
72/29077
73/29077
74/29077
75/29077
76/29077
77/29077
78/29077
79/29077
80/29077
81/29077
82/29077
83/29077
84/29077
85/29077
86/29077
87/29077
88/29077
89/29077
90/29077
91/29077
92/29077
93/29077
94/29077
95/29077
96/29077
97/29077
98/29077
99/29077
100/29077
101/29077
102/29077
103/29077
104/29077
105/29077
106/29077
107/29077
108/29077
109/29077
110/29077
111/2907

In [14]:
headlines_table.to_csv("./apple_finbert_20100101_20220304.csv", index=False)
headlines_table.head()

Unnamed: 0,Headline,Stock,Positive,Negative,Neutral
0,Time to Sell Apple Puts In case you missed the...,2010-01-02,0.93772,0.061807,0.000473
1,Nexus – Will It Change Telecom Industry Dynami...,2010-01-04,0.010036,0.000452,0.989511
2,"Company News for January 04, 2010 - Corporate ...",2010-01-04,0.006489,0.993502,9e-06
3,"Technology Industry Update (DELL, AAPL, AMD, I...",2010-01-04,0.036153,0.152172,0.811675
4,Apple Defies Gravity Apple aficionados have b...,2010-01-04,0.961501,0.02486,0.013639


In [15]:
test_table = pd.DataFrame(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])

In [16]:
# appended_data = []

# for i in range(3):
#     test_table.loc[  len(test_table)  ] = ["my", "new", "data", "row", "test"]

In [17]:
# test_table