In [84]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

In [85]:
import torch
import pandas as pd
import numpy as np
import re
model.eval()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [86]:
sentences = ["Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros ."]

inputs = tokenizer(sentences, return_tensors="pt", padding=True)
outputs = model(**inputs)[0]

labels = {0:'neutral', 1:'positive', 2:'negative'}
for idx, sent in enumerate(sentences):
    print(sent, '\n----', labels[np.argmax(outputs.detach().numpy()[idx])], '----')
    print((outputs.detach().numpy()[idx]))
    

Nordea Group 's operating profit increased in 2010 by 18 percent year-on-year to 3.64 billion euros and total revenue by 3 percent to 9.33 billion euros . 
---- positive ----
[-5.8900237 10.140575  -8.0652   ]


In [87]:
benzinga_csv_path = "/app/StockPricePredictions/data/twint/AAPL_20100101_to_20220304.csv"



In [88]:
df_benzinga = pd.read_csv(benzinga_csv_path, low_memory=False, index_col="id", lineterminator='\n')

In [89]:
df_benzinga['date'] = pd.to_datetime(df_benzinga['date'])

In [90]:
df_benzinga.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41056 entries, 7273036289 to 1499907432851525632
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   conversation_id  41056 non-null  int64         
 1   created_at       41056 non-null  float64       
 2   date             41056 non-null  datetime64[ns]
 3   timezone         41056 non-null  int64         
 4   place            25 non-null     object        
 5   tweet            41056 non-null  object        
 6   language         41056 non-null  object        
 7   hashtags         41056 non-null  object        
 8   cashtags         41056 non-null  object        
 9   user_id          41056 non-null  int64         
 10  user_id_str      41056 non-null  int64         
 11  username         41056 non-null  object        
 12  name             41056 non-null  object        
 13  day              41056 non-null  int64         
 14  hour           

In [91]:
print(df_benzinga.tweet.head(1))

id
7273036289    Apple 2.0: Tablet: Big iPhone or thin MacBook?...
Name: tweet, dtype: object


In [92]:
df_benzinga.columns

Index(['conversation_id', 'created_at', 'date', 'timezone', 'place', 'tweet',
       'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
       'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
       'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
       'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest'],
      dtype='object')

In [93]:
def clean_text(text):

    clean_text1 = re.sub(r'https?://[A-Za-z0-9./]+', '', text, flags=re.MULTILINE)

    clean_text2 = re.sub(r'\$AAPL', '', clean_text1, flags=re.MULTILINE)

    return clean_text2


In [94]:
df_benzinga["tweet"] = df_benzinga["tweet"].apply(clean_text)

In [95]:
headlines_array = np.array(df_benzinga)
headlines_list = list(headlines_array[:,5])
date_list = list(headlines_array[:, 2])

retweet_list = list(headlines_array[:,23])

print(headlines_list[0])
print(date_list[0])
print(retweet_list[0])

Apple 2.0: Tablet: Big iPhone or thin MacBook?   
2010-01-01 00:00:00
7


In [96]:
headlines_table = pd.DataFrame(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])

In [97]:
# headlines_table.to_csv("./apple_finbert_20100101_20220304.csv", index=False)


In [98]:
# headlines_table2 = pd.read_csv("./apple_finbert_20100101_20220304.csv")

In [99]:
# import torch

def chunk_list(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]


STRIDE = 1

model.eval()

n=0
for headline, date in zip(chunk_list(headlines_list, STRIDE), chunk_list(date_list, STRIDE)):
  
  input = tokenizer(headline, padding = True, truncation = True,  return_tensors='pt', max_length=512)
  
  outputs = model(**input)

  prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)

  print(f"{n+1}/{int(len(headlines_list)/STRIDE)}") 

  for headline, date, pos, neg, neutr in zip(headline, date, prediction[:, 0].tolist(), prediction[:, 1].tolist(), prediction[:, 2].tolist() ): 
    # headlines_table.add_data(headline, stock, pos, neg, neutr)
    headlines_table.loc[  len(headlines_table)  ] = [headline, date, pos, neg, neutr]
   
  n+=1

  if n % 1000 == 0:
    # print(headlines_table.head())
    headlines_table.to_csv("./apple_finbert_twint_20100101_20220304.csv", index=False)


1/41056
2/41056
3/41056
4/41056
5/41056
6/41056
7/41056
8/41056
9/41056
10/41056
11/41056
12/41056
13/41056
14/41056
15/41056
16/41056
17/41056
18/41056
19/41056
20/41056
21/41056
22/41056
23/41056
24/41056
25/41056
26/41056
27/41056
28/41056
29/41056
30/41056
31/41056
32/41056
33/41056
34/41056
35/41056
36/41056
37/41056
38/41056
39/41056
40/41056
41/41056
42/41056
43/41056
44/41056
45/41056
46/41056
47/41056
48/41056
49/41056
50/41056
51/41056
52/41056
53/41056
54/41056
55/41056
56/41056
57/41056
58/41056
59/41056
60/41056
61/41056
62/41056
63/41056
64/41056
65/41056
66/41056
67/41056
68/41056
69/41056
70/41056
71/41056
72/41056
73/41056
74/41056
75/41056
76/41056
77/41056
78/41056
79/41056
80/41056
81/41056
82/41056
83/41056
84/41056
85/41056
86/41056
87/41056
88/41056
89/41056
90/41056
91/41056
92/41056
93/41056
94/41056
95/41056
96/41056
97/41056
98/41056
99/41056
100/41056
101/41056
102/41056
103/41056
104/41056
105/41056
106/41056
107/41056
108/41056
109/41056
110/41056
111/4105

In [100]:
headlines_table.to_csv("./apple_finbert_twint_20100101_20220304.csv", index=False)
headlines_table.head()

Unnamed: 0,Headline,Stock,Positive,Negative,Neutral
0,Apple 2.0: Tablet: Big iPhone or thin MacBook?,2010-01-01,0.999959,3.629326e-05,5e-06
1,"Apple 2.0: Apple tablet to ship in March, sour...",2010-01-04,0.999997,1.389081e-07,3e-06
2,"Apple Tablet Will Be 10-11 Inches, Ships In Ma...",2010-01-04,0.999986,8.988271e-06,5e-06
3,CHART OF THE DAY: Android Taking Wind Out Of i...,2010-01-04,0.999992,3.523099e-06,4e-06
4,Apple 2.0: How many iPhones did Apple sell?,2010-01-04,0.999959,1.397303e-05,2.7e-05


In [101]:
test_table = pd.DataFrame(columns=["Headline", "Stock", "Positive", "Negative", "Neutral"])

In [102]:
# appended_data = []

# for i in range(3):
#     test_table.loc[  len(test_table)  ] = ["my", "new", "data", "row", "test"]

In [103]:
# test_table