In [2]:
import pandas as pd
import os

In [3]:
# Set the path to the folder containing the txt files
folder_path = os.path.join('src', 'data', 'ecb-speeches')

# List all txt files in the folder
txt_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]

# Initialize an empty DataFrame
df_speeches = pd.DataFrame(columns=['date', 'content'])

# Iterate through the txt files and read their content
for txt_file in txt_files:
    # Extract the date from the file name
    date_str = txt_file.split('_')[0]
    date = pd.to_datetime(date_str, format='%Y-%m-%d')
    
    # Read the content of the file
    with open(os.path.join(folder_path, txt_file), 'r') as file:
        content = file.read()
    
    # Append the date and content to the DataFrame
    df_speeches = pd.concat([df_speeches, pd.DataFrame({'date': [date], 'content': [content]})], ignore_index=True)

# Sort the DataFrame by date
df_speeches = df_speeches.sort_values(by='date').reset_index(drop=True)


In [7]:
def consolidate_lines(df):
    """
    Consolidate lines in the content of a dataframe. 
    If a line does not end with a full stop, it is merged 
    with the next line, removing unnecessary newline characters.
    
    Args:
        df (pandas.DataFrame): Dataframe with 'content' column
    
    Returns:
        pandas.DataFrame: Modified dataframe with consolidated lines
    """
    consolidated_data = []

    for index, row in df.iterrows():
        content = row['content']
        lines = content.split('\n')
        consolidated_content = ""
        for i, line in enumerate(lines):
            line = line.strip()
            if line and (line[-1] in ".!?" or i == len(lines) - 1):
                consolidated_content += line + " "
            else:
                consolidated_content += line

        consolidated_row = row.copy()
        consolidated_row['content'] = consolidated_content.strip()
        consolidated_data.append(consolidated_row)

    # Create new dataframe with consolidated content
    new_df = pd.DataFrame(consolidated_data)

    return new_df


import re

def split_paragraphs(df):
    """
    Splits content of dataframe into separate rows. 
    If a paragraph exceeds 512 words, it is limited to 
    two nearly equal parts, with the split done at a full stop.
    
    Args:
        df (pandas.DataFrame): Dataframe with 'date' and 'content' columns
    
    Returns:
        pandas.DataFrame: Modified dataframe with split content
    """
    data = []

    for index, row in df.iterrows():
        date = row['date']
        content = row['content']

        # Split content into sentences
        sentences = re.split(r'(?<=[.!?])\s+', content)

        # Process each sentence
        current_part = ""
        for sentence in sentences:
            if len(current_part.split()) + len(sentence.split()) <= 50:
                current_part += " " + sentence
            else:
                if current_part.strip():
                    data.append([date, current_part.strip()])
                current_part = sentence

        # Append remaining part if exists
        if current_part.strip():
            data.append([date, current_part.strip()])

    # Create new dataframe
    new_df = pd.DataFrame(data, columns=['date', 'content'])

    return new_df



In [8]:
df_speeches_consolidated = consolidate_lines(df_speeches)
df_speeches_consolidated = split_paragraphs(df_speeches_consolidated)
df_speeches_consolidated

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-07-17,The general picture is one of continued econom...
2,1998-07-17,"As far as pricedevelopments are concerned, inf..."
3,1998-07-17,Economic growth has been driven increasingly b...
4,1998-07-17,The favourable conjunctural situation has star...
...,...,...
147049,2022-02-25,"Based on its experience, the ECB would strongl..."
147050,2022-02-25,"Also, considering the strict, risk-based natur..."
147051,2022-02-25,The most prominent element of the ECB opinions...
147052,2022-02-25,"While the proposed limit of €10,000 euro does ..."


In [9]:
# create counts column that counts words in content column of each row
df_speeches_consolidated['counts'] = df_speeches_consolidated['content'].str.split().str.len()

# remove observations with missing values and counts of less than 50
df_speeches_consolidated = df_speeches_consolidated[(df_speeches_consolidated['counts']>=20) & (df_speeches_consolidated['counts']<200) ]

## Import Transformers


In [10]:
from transformers import AutoTokenizer

# Define the tokenizer
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [11]:
import nltk
nltk.download('punkt')
from transformers import pipeline

import transformers

classifier = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer="distilbert-base-uncased",
)


[nltk_data] Downloading package punkt to /home/ozodbek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
import pandas as pd
from functions import calculate_sentiment_distilbert

max_chunk_length = 512

# Create an empty DataFrame
df = pd.DataFrame()

# add content
df['content'] = df_speeches_consolidated['content'].head(10)

# Apply the sentiment analysis function to the 'content' column for the first 10 rows
df[['positive', 'negative']] = df_speeches_consolidated['content'].head(10).apply(
    lambda x: pd.Series(calculate_sentiment_distilbert(x))
)
# Print the updated DataFrame
print(df)


                                             content  positive  negative
0  Mr. Duisenberg reports on the outcome of the s...  0.988099  0.011901
1  The general picture is one of continued econom...  0.960844  0.039156
2  As far as pricedevelopments are concerned, inf...  0.997458  0.002542
3  Economic growth has been driven increasingly b...  0.787363  0.212637
4  The favourable conjunctural situation has star...  0.815906  0.184094
5  As regards monetary and financial developments...  0.687710  0.312290
6  In principle, the economic performance I have ...  0.531834  0.468166
7  In this respect, I should like to underline th...  0.998245  0.001755
8  Second, most Member States need togo a step fu...  0.928188  0.071812
9  This implies that the benchmark for fiscalpoli...  0.995707  0.004293


In [13]:
# Define the start and end dates of the desired date range
start_date = '2008-09-01'
end_date = '2008-09-30'

# Filter the DataFrame based on the date range
subset_df = df_speeches[(df_speeches['date'] >= start_date) & (df_speeches['date'] <= end_date)]

# Apply the sentiment analysis function to the 'content' column for the subset
subset_df[['positive', 'negative']] = subset_df['content'].apply(lambda x: pd.Series(calculate_sentiment_distilbert(x)))

# Print the updated subset DataFrame
print(subset_df)


          date                                            content  positive  \
544 2008-09-03  Gertrude Tumpel-Gugerell: Start of the ECB/ESC...  0.983223   
545 2008-09-04  Gertrude Tumpel-Gugerell: Moving ahead with th...  0.976558   
546 2008-09-09  European Central Bank: Press conference – intr...  0.903439   
547 2008-09-09  Jürgen Stark: Monetary policy during the finan...  0.945200   
548 2008-09-09  Jean-Claude Trichet: Risk and the macro-econom...  0.931020   
549 2008-09-09  José Manuel González-Páramo: Globalisation, ma...  0.932746   
550 2008-09-10  Jürgen Stark: Economic perspectives and moneta...  0.931714   
551 2008-09-10  Gertrude Tumpel-Gugerell: What is the role of ...  0.976124   
552 2008-09-11  José Manuel González-Páramo: Some lessons from...  0.939504   
553 2008-09-11  Jean Claude-Trichet: Hearing before the Econom...  0.889569   
554 2008-09-11  Gertrude Tumpel-Gugerell: SEPA for cards\nSpee...  0.900189   
555 2008-09-15  Gertrude Tumpel-Gugerell: EU priorit

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df[['positive', 'negative']] = subset_df['content'].apply(lambda x: pd.Series(calculate_sentiment_distilbert(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df[['positive', 'negative']] = subset_df['content'].apply(lambda x: pd.Series(calculate_sentiment_distilbert(x)))


# Speeches with FinBERT

In [17]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

In [18]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')


In [19]:
from functions import split_text
from functions import calculate_sentiment_finbert

In [21]:
# Apply the sentiment analysis function to the 'content' column for the first N rows

N=2
subset = df_speeches_consolidated.head(10)
subset[['positive', 'negative']] = subset['content'].head(10).apply(lambda x: pd.Series(calculate_sentiment_finbert(x)))

# Optional: Calculate the neutral sentiment as the remaining probability
subset['neutral'] = 1 - subset['positive'] - subset['negative']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[['positive', 'negative']] = subset['content'].head(10).apply(lambda x: pd.Series(calculate_sentiment_finbert(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset[['positive', 'negative']] = subset['content'].head(10).apply(lambda x: pd.Series(calculate_sentiment_finbert(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

In [22]:
subset

Unnamed: 0,date,content,counts,positive,negative,neutral
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...,50,0.937748,0.04171,0.020542
1,1998-07-17,The general picture is one of continued econom...,34,0.03419,0.034694,0.931116
2,1998-07-17,"As far as pricedevelopments are concerned, inf...",49,0.03053,0.951551,0.017919
3,1998-07-17,Economic growth has been driven increasingly b...,35,0.883557,0.097784,0.018659
4,1998-07-17,The favourable conjunctural situation has star...,49,0.101933,0.880079,0.017988
5,1998-07-17,As regards monetary and financial developments...,49,0.447888,0.532956,0.019157
6,1998-07-17,"In principle, the economic performance I have ...",45,0.489348,0.406243,0.104409
7,1998-07-17,"In this respect, I should like to underline th...",34,0.71949,0.179986,0.100524
8,1998-07-17,"Second, most Member States need togo a step fu...",29,0.036569,0.946498,0.016933
9,1998-07-17,This implies that the benchmark for fiscalpoli...,30,0.324513,0.065784,0.609703


# Starting with press releases


In [23]:
# Starting with press releases
import pandas as pd
press_releases = pd.read_csv("src/data/ecb_releases_302.csv")

In [24]:

# Convert date column to datetime format
press_releases['date'] = pd.to_datetime(press_releases['date'], format='%d %B %Y')

# Convert date to "DD-MM-YYYY" format
press_releases['date'] = press_releases['date'].dt.strftime('%d-%m-%Y')

# Remove new line characters, replace with space
press_releases['content'] = press_releases['content'].str.replace('\n', ' ')

press_releases

Unnamed: 0,date,content
0,10-08-2023,10 August 2023 Europa Open Air 2023 celebrates...
1,05-07-2023,5 July 2023 Compared with April 2023: consumer...
2,04-07-2023,4 July 2023 Credit terms and conditions tighte...
3,28-06-2023,28 June 2023 Proposed legislation establishes ...
4,22-06-2023,22 June 2023 The aggregate of total assets of ...
...,...,...
297,18-09-1998,The European Central Bank (ECB) will today pub...
298,18-09-1998,The Headquarters Agreement between the Governm...
299,12-09-1998,In accordance with the Resolution adopted by t...
300,12-09-1998,Given that the euro banknotes will be put into...


In [29]:
# press releases with distilbert

# Apply the sentiment analysis function to the 'content' column for the subset
press_releases[['positive', 'negative']] = press_releases['content'].apply(lambda x: pd.Series(calculate_sentiment_distilbert(x)))
press_releases

Unnamed: 0,date,content,positive,negative,neutral
0,10-08-2023,10 August 2023 Europa Open Air 2023 celebrates...,0.990885,0.009115,0.013838
1,05-07-2023,5 July 2023 Compared with April 2023: consumer...,0.985537,0.014463,0.581922
2,04-07-2023,4 July 2023 Credit terms and conditions tighte...,0.936402,0.063598,0.588270
3,28-06-2023,28 June 2023 Proposed legislation establishes ...,0.922678,0.077322,0.011897
4,22-06-2023,22 June 2023 The aggregate of total assets of ...,0.975960,0.024040,0.212809
...,...,...,...,...,...
297,18-09-1998,The European Central Bank (ECB) will today pub...,0.988458,0.011542,0.021108
298,18-09-1998,The Headquarters Agreement between the Governm...,0.976587,0.023413,0.012490
299,12-09-1998,In accordance with the Resolution adopted by t...,0.956106,0.043894,0.020652
300,12-09-1998,Given that the euro banknotes will be put into...,0.883076,0.116924,0.028728


In [26]:
from datetime import datetime

from transformers import BertForSequenceClassification, BertTokenizer
import torch
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

from functions import split_text
from functions import calculate_sentiment_finbert


In [27]:
"""
# NO NEED TO RUN FOR NOW, IT'S BEEN SAVED AND WILL BE IMPORTED IN THE NEXT STEP

subset = press_releases
subset[['positive', 'negative']] = subset['content'].apply(lambda x: pd.Series(calculate_sentiment_finbert(x)))

# Optional: Calculate the neutral sentiment as the remaining probability
subset['neutral'] = 1 - subset['positive'] - subset['negative']

# current time and date
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Define the filename with the current date and time
filename = f"src/output/finbert_sentiment_press_releases_{current_datetime}.csv"

# Save the DataFrame to the specified filename
subset.to_csv(filename, index=False)
"""

'\n# NO NEED TO RUN FOR NOW, IT\'S BEEN SAVED AND WILL BE IMPORTED IN THE NEXT STEP\n\nsubset = press_releases\nsubset[[\'positive\', \'negative\']] = subset[\'content\'].apply(lambda x: pd.Series(calculate_sentiment_finbert(x)))\n\n# Optional: Calculate the neutral sentiment as the remaining probability\nsubset[\'neutral\'] = 1 - subset[\'positive\'] - subset[\'negative\']\n\n# current time and date\ncurrent_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")\n\n# Define the filename with the current date and time\nfilename = f"src/output/finbert_sentiment_press_releases_{current_datetime}.csv"\n\n# Save the DataFrame to the specified filename\nsubset.to_csv(filename, index=False)\n'

In [28]:
# Read the saved CSV file into a DataFrame

press_releases = pd.read_csv("src/output/finbert_sentiment_press_releases_2023-08-22_13-22-32.csv")
press_releases

Unnamed: 0,date,content,positive,negative,neutral
0,10-08-2023,10 August 2023 Europa Open Air 2023 celebrates...,0.877308,0.108855,0.013838
1,05-07-2023,5 July 2023 Compared with April 2023: consumer...,0.338591,0.079486,0.581922
2,04-07-2023,4 July 2023 Credit terms and conditions tighte...,0.221073,0.190657,0.588270
3,28-06-2023,28 June 2023 Proposed legislation establishes ...,0.723583,0.264521,0.011897
4,22-06-2023,22 June 2023 The aggregate of total assets of ...,0.677040,0.110151,0.212809
...,...,...,...,...,...
297,18-09-1998,The European Central Bank (ECB) will today pub...,0.946169,0.032723,0.021108
298,18-09-1998,The Headquarters Agreement between the Governm...,0.738816,0.248694,0.012490
299,12-09-1998,In accordance with the Resolution adopted by t...,0.877338,0.102010,0.020652
300,12-09-1998,Given that the euro banknotes will be put into...,0.945670,0.025603,0.028728


# Stock market data


In [30]:
import yfinance as yf
import pandas as pd

# Define the path to your data file
data_path = "src/data/stoxx.txt"

# Read the data file into a DataFrame
vstoxx_df = pd.read_csv(data_path, delimiter=";", parse_dates=["Date"], dayfirst=True)

# Change the date format to DD-MM-YYYY
vstoxx_df["Date_merge"] = vstoxx_df["Date"].dt.strftime("%d-%m-%Y")

# generate day of the week column
vstoxx_df['day_of_week'] = pd.to_datetime(vstoxx_df['Date']).dt.day_name()

# generate month column
vstoxx_df['month'] = pd.to_datetime(vstoxx_df['Date']).dt.month_name()

# calculate the daily change in the index
vstoxx_df['delta_daily'] = vstoxx_df['Indexvalue'].diff()
vstoxx_df['delta_3d'] = vstoxx_df['Indexvalue'].diff(3)
vstoxx_df['delta_5d'] = vstoxx_df['Indexvalue'].diff(5)
vstoxx_df['delta_30d'] = vstoxx_df['Indexvalue'].diff(30)

# Display the DataFrame
print(vstoxx_df.head())


        Date Symbol  Indexvalue  Date_merge day_of_week    month  delta_daily  \
0 1999-01-04   V2TX     18.2033  04-01-1999      Monday  January          NaN   
1 1999-01-05   V2TX     29.6912  05-01-1999     Tuesday  January      11.4879   
2 1999-01-06   V2TX     25.1670  06-01-1999   Wednesday  January      -4.5242   
3 1999-01-07   V2TX     32.5205  07-01-1999    Thursday  January       7.3535   
4 1999-01-08   V2TX     33.2296  08-01-1999      Friday  January       0.7091   

   delta_3d  delta_5d  delta_30d  
0       NaN       NaN        NaN  
1       NaN       NaN        NaN  
2       NaN       NaN        NaN  
3   14.3172       NaN        NaN  
4    3.5384       NaN        NaN  


In [31]:
# Merge the press release dates into the vstoxx_df DataFrame
vstoxx_df = pd.merge(vstoxx_df, press_releases[['date']], how='left', left_on='Date_merge', right_on='date')

# Calculate the percentage change in the index
vstoxx_df['Percentage_Change'] = (vstoxx_df['Indexvalue'] - vstoxx_df['Indexvalue'].shift(1)) / vstoxx_df['Indexvalue'].shift(1)
vstoxx_df

Unnamed: 0,Date,Symbol,Indexvalue,Date_merge,day_of_week,month,delta_daily,delta_3d,delta_5d,delta_30d,date,Percentage_Change
0,1999-01-04,V2TX,18.2033,04-01-1999,Monday,January,,,,,04-01-1999,
1,1999-01-05,V2TX,29.6912,05-01-1999,Tuesday,January,11.4879,,,,05-01-1999,0.631089
2,1999-01-05,V2TX,29.6912,05-01-1999,Tuesday,January,11.4879,,,,05-01-1999,0.000000
3,1999-01-06,V2TX,25.1670,06-01-1999,Wednesday,January,-4.5242,,,,,-0.152375
4,1999-01-07,V2TX,32.5205,07-01-1999,Thursday,January,7.3535,14.3172,,,07-01-1999,0.292188
...,...,...,...,...,...,...,...,...,...,...,...,...
6304,2023-08-15,V2TX,18.8579,15-08-2023,Tuesday,August,0.8050,0.7815,-0.6814,4.7564,,0.044591
6305,2023-08-16,V2TX,18.5510,16-08-2023,Wednesday,August,-0.3069,0.1764,-0.4286,3.2565,,-0.016274
6306,2023-08-17,V2TX,20.3539,17-08-2023,Thursday,August,1.8029,2.3010,2.2775,1.1262,,0.097186
6307,2023-08-18,V2TX,20.2456,18-08-2023,Friday,August,-0.1083,1.3877,1.8710,2.7082,,-0.005321


In [32]:
# Bond prices

# Define the ticker symbol for the Eurozone bond you're interested in
bond_ticker = [""]

# Define the start and end dates for the data you want to fetch
start_date = "2000-01-01"
end_date = "2023-08-01"

# Fetch the bond price data using yfinance
bond_data = yf.download(bond_ticker, start=start_date, end=end_date)

# remove the 1st row for Date, and keep it in 0th row
bond_data.reset_index(inplace=True)

# keep only date and adjusted close columns
bond_data = bond_data[['Date','Adj Close']]
bond_data.columns = ['Date', 'bond_price']

# generate more variables
bond_data['b_delta_daily'] = bond_data['bond_price'].diff()
bond_data['b_delta_3d'] = bond_data['bond_price'].diff(3)
bond_data['b_delta_5d'] = bond_data['bond_price'].diff(5)
bond_data['b_delta_30d'] = bond_data['bond_price'].diff(30)

# Display the downloaded data
bond_data 

[*********************100%***********************]  1 of 1 completed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bond_data['b_delta_daily'] = bond_data['bond_price'].diff()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bond_data['b_delta_3d'] = bond_data['bond_price'].diff(3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bond_data['b_delta_5d'] = bond_data['bond_price'].diff(5)
A value is trying to be set o

Unnamed: 0,Date,bond_price,b_delta_daily,b_delta_3d,b_delta_5d,b_delta_30d
0,2014-10-23,20.524277,,,,
1,2014-10-24,20.475508,-0.048769,,,
2,2014-10-27,20.343830,-0.131678,,,
3,2014-10-28,20.735615,0.391785,0.211338,,
4,2014-10-29,20.865675,0.130060,0.390167,,
...,...,...,...,...,...,...
2201,2023-07-25,30.100000,0.130001,-0.049999,-0.309999,1.268011
2202,2023-07-26,30.080000,-0.020000,-0.070000,-0.240000,0.866453
2203,2023-07-27,30.000000,-0.080000,0.030001,-0.150000,0.590784
2204,2023-07-28,30.490000,0.490000,0.389999,0.340000,0.982948


# Import FinBERT sentiments


In [33]:
finbert_releases = pd.read_csv("src/output/finbert_sentiment_press_releases_2023-08-22_13-22-32.csv")
finbert_releases

# change the date format to YYYY-MM-DD
finbert_releases['date'] = pd.to_datetime(finbert_releases['date'], format='%d-%m-%Y')

# merge the finbert releases with the bond data
finbert_releases = pd.merge(finbert_releases, bond_data, how='left', left_on='date', right_on='Date')

# Merge the press release dates into the vstoxx_df DataFrame
finbert_releases = pd.concat([finbert_releases, vstoxx_df[['Percentage_Change', 'delta_daily', 'delta_3d', 'delta_5d', 'delta_30d']]], axis=1)

In [None]:
# data with no nans
restricted_sample = finbert_releases.dropna()
restricted_sample