In [1]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
# model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [2]:
import torch
import pandas as pd
import numpy as np
import re
model.eval()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [3]:
sentences = ["Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros ."]

inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = model(**inputs)[0]

labels = {0:'neutral', 1:'positive', 2:'negative'}
for idx, sent in enumerate(sentences):
    print(sent, '\n----', labels[np.argmax(outputs.detach().numpy()[idx])], '----')
    print((outputs.detach().numpy()[idx]))
    

Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros . 
---- neutral ----
[ 2.0812669 -1.9406649 -1.527481 ]


In [4]:
benzinga_csv_path = "/app/StockPricePredictions/data/twint/INTC/INTC_20100101_to_20220304.csv"



In [5]:
df_benzinga = pd.read_csv(benzinga_csv_path, low_memory=False, index_col="id", lineterminator='\n')

In [6]:
df_benzinga['date'] = pd.to_datetime(df_benzinga['date'])

In [7]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3498 entries, 11759352927 to 1499919740239855622
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   conversation_id  3498 non-null   int64         
 1   created_at       3498 non-null   float64       
 2   date             3498 non-null   datetime64[ns]
 3   timezone         3498 non-null   int64         
 4   place            2 non-null      object        
 5   tweet            3498 non-null   object        
 6   language         3498 non-null   object        
 7   hashtags         3498 non-null   object        
 8   cashtags         3498 non-null   object        
 9   user_id          3498 non-null   int64         
 10  user_id_str      3498 non-null   int64         
 11  username         3498 non-null   object        
 12  name             3498 non-null   object        
 13  day              3498 non-null   int64         
 14  hour           

In [8]:
print(df_benzinga.tweet.head(1))

id
11759352927    How Intel Screwed Itself Out Of The iPad Revol...
Name: tweet, dtype: object


In [9]:
df_benzinga.columns

Index(['conversation_id', 'created_at', 'date', 'timezone', 'place', 'tweet',
       'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
       'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
       'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
       'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

In [10]:
def clean_text(text):

    clean_text1 = re.sub(r'https?://[A-Za-z0-9./]+', '', text, flags=re.MULTILINE)

    clean_text2 = re.sub(r'\$INTC', '', clean_text1, flags=re.MULTILINE)

    return clean_text2


In [11]:
df_benzinga["tweet"] = df_benzinga["tweet"].apply(clean_text)

In [12]:
headlines_array = np.array(df_benzinga)
headlines_list = list(headlines_array[:,5])
date_list = list(headlines_array[:, 2])

retweet_list = list(headlines_array[:,23])

print(headlines_list[0])
print(date_list[0])
print(retweet_list[0])

How Intel Screwed Itself Out Of The iPad Revolution  $AAPL by @stevecheney  
2010-04-07 00:00:00
5


In [13]:
headlines_table = pd.DataFrame(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])

In [14]:
# headlines_table.to_csv("./apple_finbert_20100101_20220304.csv", index=False)


In [15]:
# headlines_table2 = pd.read_csv("./apple_finbert_20100101_20220304.csv")

In [16]:
# import torch

def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


STRIDE = 1

model.eval()

n=0
for headline, date in zip(chunk_list(headlines_list, STRIDE), chunk_list(date_list, STRIDE)):
  
  input = tokenizer(headline, padding = True, truncation = True,  return_tensors='pt', max_length=512)
  
  outputs = model(**input)

  prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)

  print(f"{n+1}/{int(len(headlines_list)/STRIDE)}") 

  for headline, date, pos, neg, neutr in zip(headline, date, prediction[:, 0].tolist(), prediction[:, 1].tolist(), prediction[:, 2].tolist() ): 
    # headlines_table.add_data(headline, stock, pos, neg, neutr)
    headlines_table.loc[  len(headlines_table)  ] = [headline, date, pos, neg, neutr]
   
  n+=1

  if n % 1000 == 0:
    # print(headlines_table.head())
    headlines_table.to_csv("./intc_finbert_twint_20100101_20220304.csv", index=False)


1/3498
2/3498
3/3498
4/3498
5/3498
6/3498
7/3498
8/3498
9/3498
10/3498
11/3498
12/3498
13/3498
14/3498
15/3498
16/3498
17/3498
18/3498
19/3498
20/3498
21/3498
22/3498
23/3498
24/3498
25/3498
26/3498
27/3498
28/3498
29/3498
30/3498
31/3498
32/3498
33/3498
34/3498
35/3498
36/3498
37/3498
38/3498
39/3498
40/3498
41/3498
42/3498
43/3498
44/3498
45/3498
46/3498
47/3498
48/3498
49/3498
50/3498
51/3498
52/3498
53/3498
54/3498
55/3498
56/3498
57/3498
58/3498
59/3498
60/3498
61/3498
62/3498
63/3498
64/3498
65/3498
66/3498
67/3498
68/3498
69/3498
70/3498
71/3498
72/3498
73/3498
74/3498
75/3498
76/3498
77/3498
78/3498
79/3498
80/3498
81/3498
82/3498
83/3498
84/3498
85/3498
86/3498
87/3498
88/3498
89/3498
90/3498
91/3498
92/3498
93/3498
94/3498
95/3498
96/3498
97/3498
98/3498
99/3498
100/3498
101/3498
102/3498
103/3498
104/3498
105/3498
106/3498
107/3498
108/3498
109/3498
110/3498
111/3498
112/3498
113/3498
114/3498
115/3498
116/3498
117/3498
118/3498
119/3498
120/3498
121/3498
122/3498
123/3498
1

In [17]:
headlines_table.to_csv("./intc_finbert_twint_20100101_20220304.csv", index=False)
headlines_table.head()

Unnamed: 0,Headline,Stock,Positive,Negative,Neutral
0,How Intel Screwed Itself Out Of The iPad Revol...,2010-04-07,0.041513,0.058927,0.89956
1,Google TV: Everything You Need To Know $GOOG ...,2010-05-21,0.023611,0.03188,0.944508
2,10 Things You Need To Know This Morning $AAPL ...,2010-08-03,0.032594,0.032153,0.935253
3,is one of the most hated stocks i have ever ...,2010-08-10,0.025223,0.822634,0.152143
4,Intel to Buy McAfee for $7.68 Billion to Add S...,2010-08-19,0.15212,0.009041,0.838839


In [18]:
test_table = pd.DataFrame(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])

In [19]:
# appended_data = []

# for i in range(3):
#     test_table.loc[  len(test_table)  ] = ["my", "new", "data", "row", "test"]

In [20]:
# test_table