# Explanation of this Notebook

In this notebook, we apply sentiment analysis to the news text. As sentiment models we use VADER and FinancialBERT.

VADER was trained on social media comments, whereas FinancialBERT was trained on news texts in different languages. 
The big benefit of FinancialBERT is that its sentiment inference can run on the GPU, which speeds up the code.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
from deep_translator import GoogleTranslator, DeeplTranslator, LibreTranslator
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from pathlib import Path
import os
import time


from transformers import pipeline
from tqdm import tqdm
import torch
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline
nltk.download('punkt_tab')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/generalpegasus/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/generalpegasus/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# VADER sentiment analyzer; its polarity_scores() output feeds the vader_* columns.
vader = SentimentIntensityAnalyzer()

# German -> English translator backed by Google (from deep_translator).
translator = GoogleTranslator(target='en', source='de')


In [4]:
# Read the news articles; the first CSV column is used as the row index.
file = Path('..') / 'data' / 'news' / 'all_news_text_en.csv'
df = pd.read_csv(file, index_col=0)

# Quick sanity check: first rows and overall dimensions.
display(df.head(3))
print(df.shape)

Unnamed: 0,date,news_text,headline,article_url,en_news_text
16905,2017-10-18,FRANKFURT (Dow Jones)--Die Deutsche Börse zieh...,Deutsche Börse will Anfang 2018 Scale-Auswahli...,https://www.finanzen.net/nachricht/aktien/deut...,
16899,2017-10-18,WIEN (dpa-AFX) - Die Wiener Börse hat am Mittw...,Aktien Wien Schluss: ATX erstmals seit 2008 üb...,https://www.finanzen.net/nachricht/aktien/akti...,FRANKFURT (dpa-AFX) - After a continued record...
16900,2017-10-18,FRANKFURT (dpa-AFX) - Die Anleger am deutschen...,ROUNDUP/Aktien Frankfurt Schluss: Wall Street ...,https://www.finanzen.net/nachricht/aktien/roun...,Keep an eye on the entire current trading day ...


(16906, 5)


In [5]:
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() only rendered an extra "None".
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16906 entries, 16905 to 37
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          16905 non-null  object
 1   news_text     16905 non-null  object
 2   headline      16905 non-null  object
 3   article_url   16905 non-null  object
 4   en_news_text  16905 non-null  object
dtypes: object(5)
memory usage: 792.5+ KB


None

In [6]:
# Count missing values per column. isnull() is an alias of isna(), so a single
# report is enough.
print(df.isna().sum())

# Drop every row with a missing value in any column; the sentiment scoring below
# is applied to `en_news_text` of every remaining row.
df = df.dropna()

date            1
news_text       1
headline        1
article_url     1
en_news_text    1
dtype: int64
date            1
news_text       1
headline        1
article_url     1
en_news_text    1
dtype: int64


In [7]:
# Run VADER once per article and reuse the resulting score dict for all three
# columns (previously every article was analysed three separate times).
vader_scores = pd.DataFrame(
    df['en_news_text'].map(vader.polarity_scores).tolist(),
    index=df.index,
    columns=['compound', 'pos', 'neg'],
)
# to_numpy() assigns by position, so the result does not depend on index alignment.
df['vader_compound'] = vader_scores['compound'].to_numpy()
df['vader_pos'] = vader_scores['pos'].to_numpy()
df['vader_neg'] = vader_scores['neg'].to_numpy()

# Running total of the compound score in the current row order of `df`.
df['accumulated_vader_compound'] = df['vader_compound'].cumsum()

In [8]:

# Average the per-article VADER scores (and the running compound total) for each date.
sentiment_columns = ['vader_compound', 'vader_pos', 'vader_neg', 'accumulated_vader_compound']
df_grouped_by_date = (
    df.groupby('date')[sentiment_columns]
    .mean()
    .reset_index()
    .sort_values(by='date', ascending=True)
)

# Preview the three earliest dates.
df_grouped_by_date.head(3)


Unnamed: 0,date,vader_compound,vader_pos,vader_neg,accumulated_vader_compound
0,2017-10-18,0.776267,0.097333,0.044333,2.5474
1,2017-10-19,0.62572,0.0894,0.0525,7.7103
2,2017-10-20,0.827444,0.099556,0.052889,14.677222


In [9]:
# Earliest date present in the daily aggregate.
daily_dates = df_grouped_by_date['date']
start_date = daily_dates.min()

In [11]:
# Parse the date column into real timestamps (datetime64) instead of plain strings.
df_grouped_by_date['date'] = pd.to_datetime(df_grouped_by_date['date'])

# info() writes its report to stdout itself and returns None; wrapping it in
# print() only appended a stray "None" line.
df_grouped_by_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1886 entries, 0 to 1885
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   date                        1886 non-null   datetime64[ns]
 1   vader_compound              1886 non-null   float64       
 2   vader_pos                   1886 non-null   float64       
 3   vader_neg                   1886 non-null   float64       
 4   accumulated_vader_compound  1886 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 73.8 KB
None


In [12]:


# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')
print('Using GPU' if use_gpu else 'Using CPU')

# Text-classification pipeline for the FinancialBERT sentiment model on the chosen device.
pipe = pipeline(
    "text-classification",
    model="ahmedrachid/FinancialBERT-Sentiment-Analysis",
    device=device,
)


Using GPU


In [13]:
def split_text(text, max_length=512):
    """Split ``text`` into smaller chunks made of whole sentences.

    Sentences (as returned by ``sent_tokenize``) are packed greedily, joined by a
    single space, into chunks of at most ``max_length`` characters.

    Args:
        text: The text to split.
        max_length: Maximum chunk length, measured in characters (not tokens).

    Returns:
        A list of non-empty, whitespace-stripped chunks in their original order.
        A single sentence longer than ``max_length`` is returned as its own
        oversized chunk; shortening it is left to the caller.
    """
    splits = []
    current_split = ""

    for sentence in sent_tokenize(text):
        # +1 accounts for the space that joins the sentence to the running chunk.
        if len(current_split) + len(sentence) + 1 <= max_length:
            current_split += " " + sentence
            continue

        # The sentence does not fit, so close the running chunk. The chunk is
        # still empty when the very first sentence is already too long; an empty
        # chunk must not be emitted, otherwise it would be scored like real text.
        finished = current_split.strip()
        if finished:
            splits.append(finished)
        current_split = sentence

    last = current_split.strip()
    if last:
        splits.append(last)

    return splits

# Score every article with FinancialBERT and keep, per article, a flat list with
# one {'label', 'score'} dict per chunk, in text order.
# Chunking is character based: texts longer than 512 characters are split at
# sentence boundaries, and a chunk that is still too long (e.g. text without any
# sentence end) is clipped to its first 512 characters.
financial_bert_results = []
for text in tqdm(df['en_news_text'], total=df.shape[0]):
    if len(text) > 512:
        # Skip empty chunks so the model never scores an empty string as part of an article.
        chunks = [chunk[:512] for chunk in split_text(text) if chunk]
    else:
        chunks = [text]
    # One pipeline call per article: for a list of strings the pipeline returns
    # one prediction dict per string, i.e. a flat list instead of nested lists.
    financial_bert_results.append(pipe(chunks))

# Assign the whole column once (object dtype, one list per row) instead of
# writing list values cell by cell through .loc.
df['financial_bert'] = pd.Series(financial_bert_results, index=df.index, dtype=object)

  0%|          | 1/16904 [00:00<1:57:26,  2.40it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 16904/16904 [36:12<00:00,  7.78it/s] 


In [14]:
# Split the stored predictions into a label column and a score column.
# Only the first prediction of each article (index 0) is read.
first_prediction = df['financial_bert'].map(lambda predictions: predictions[0])
df['financial_bert_label'] = first_prediction.map(lambda prediction: prediction['label'])
df['financial_bert_score'] = first_prediction.map(lambda prediction: prediction['score'])

In [15]:
# Encode the sentiment label as a number: negative -> -1, neutral -> 0, positive -> 1.
label_to_number = {'negative': -1, 'neutral': 0, 'positive': 1}
df['financial_bert_numeric_label'] = df['financial_bert_label'].map(label_to_number)

In [16]:

# Save the scored articles in chronological order.
df['date'] = pd.to_datetime(df['date'])
# sort_values() returns a new frame, so the result has to be assigned back
# (previously it was discarded and the CSV was written unsorted). The stable
# mergesort keeps the existing order of articles that share the same date.
df = df.sort_values(by='date', ascending=True, kind='mergesort')
df.to_csv('finance_news_with_sentiment.csv', index=True)
display(df.head(3))

Unnamed: 0,date,news_text,headline,article_url,en_news_text,vader_compound,vader_pos,vader_neg,accumulated_vader_compound,financial_bert,financial_bert_label,financial_bert_score,financial_bert_numeric_label
16899,2017-10-18,WIEN (dpa-AFX) - Die Wiener Börse hat am Mittw...,Aktien Wien Schluss: ATX erstmals seit 2008 üb...,https://www.finanzen.net/nachricht/aktien/akti...,FRANKFURT (dpa-AFX) - After a continued record...,0.9785,0.107,0.068,0.9785,"[{'label': 'negative', 'score': 0.983578324317...",negative,0.983578,-1
16900,2017-10-18,FRANKFURT (dpa-AFX) - Die Anleger am deutschen...,ROUNDUP/Aktien Frankfurt Schluss: Wall Street ...,https://www.finanzen.net/nachricht/aktien/roun...,Keep an eye on the entire current trading day ...,-0.0112,0.049,0.046,0.9673,"[{'label': 'neutral', 'score': 0.9373340010643...",neutral,0.937334,0
16904,2017-10-18,FRANKFURT (dpa-AFX) - Der deutsche Aktienmark ...,Aktien Frankfurt: Dax und MDax schaffen Bestma...,https://www.finanzen.net/nachricht/aktien/akti...,FRANKFURT (dpa-AFX) - The DAX started with gai...,0.9863,0.137,0.034,1.9536,"[{'label': 'positive', 'score': 0.999781310558...",positive,0.999781,1
