In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [47]:
speeches = pd.read_parquet('src/Data/ecb-speeches.parquet')
releases = pd.read_csv("src/Data/ecb_releases_302.csv")
announcements = pd.read_csv("src/Data/policy_announcements.csv")

In [48]:
pd.set_option('display.max_colwidth', 50)

all_data = pd.concat([speeches, releases, announcements], ignore_index=True)
all_data

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-09-18,Mr. Duisenberg’s opening statement at the meet...
2,1998-10-12,Mr. Duisenberg speaks on changes in European f...
3,1998-10-12,Mr. Duisenberg's statement to the European Par...
4,1998-10-22,Mr. Duisenberg’s opening statement at the pres...
...,...,...
2846,8 May 2014,"At today’s meeting, which was held in Brussels..."
2847,3 April 2014,At today’s meeting the Governing Council of th...
2848,6 March 2014,At today’s meeting the Governing Council of th...
2849,6 February 2014,At today’s meeting the Governing Council of th...


In [82]:
all_data = pd.read_parquet("src/Output/10k_labeled_binary.parquet")
all_data

Unnamed: 0,index,date,content,content_clean,is_english,predicted_label,sentiment_0shot
0,4128,2006-06-13,Monetary policy strat...,monetary policy strat...,True,Economic or financial...,Negative
1,21695,2017-09-27,"More broadly, it just...",broadly justified tak...,True,financial markets are...,Positive
2,12640,2011-10-24,The standard and most...,standard important to...,True,Economy is stable and...,Positive
3,20508,2017-02-17,Enduring European int...,enduring european int...,True,Inflation is rising,Negative
4,2273,2003-11-27,Lucas D Papademos: Ch...,lucas papademos chall...,True,Economy is under pres...,Negative
...,...,...,...,...,...,...,...
9995,1215,2000-11-03,Let me briefly touch ...,let briefly touch upo...,True,unemployment and infl...,Positive
9996,6259,2008-03-19,The Lisbon process en...,lisbon process enable...,True,financial markets are...,Negative
9997,2458,2004-04-23,Such alternative appr...,alternative approache...,True,financial markets are...,Negative
9998,21591,2017-09-14,A large body of the l...,large body literature...,True,financial markets are...,Positive


In [83]:
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Replace newline characters with space
    text = text.replace('\n', ' ')

    # Replace hyphenated words with underscore
    text = ' '.join([word.replace('-', '_') if '-' in word else word for word in text.split()])
    
    # Replace punctuation with space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(translator)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Strip extra whitespaces
    text = ' '.join(text.split())
    
    return text

all_data['content_clean'] = all_data['content'].apply(clean_text)
all_data


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ozodbek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,index,date,content,content_clean,is_english,predicted_label,sentiment_0shot
0,4128,2006-06-13,Monetary policy strat...,monetary policy strat...,True,Economic or financial...,Negative
1,21695,2017-09-27,"More broadly, it just...",broadly justified tak...,True,financial markets are...,Positive
2,12640,2011-10-24,The standard and most...,standard important to...,True,Economy is stable and...,Positive
3,20508,2017-02-17,Enduring European int...,enduring european int...,True,Inflation is rising,Negative
4,2273,2003-11-27,Lucas D Papademos: Ch...,lucas papademos chall...,True,Economy is under pres...,Negative
...,...,...,...,...,...,...,...
9995,1215,2000-11-03,Let me briefly touch ...,let briefly touch upo...,True,unemployment and infl...,Positive
9996,6259,2008-03-19,The Lisbon process en...,lisbon process enable...,True,financial markets are...,Negative
9997,2458,2004-04-23,Such alternative appr...,alternative approache...,True,financial markets are...,Negative
9998,21591,2017-09-14,A large body of the l...,large body literature...,True,financial markets are...,Positive


In [81]:
import pandas as pd

def consolidate_lines(df):
    """
    Consolidate lines in the content of a dataframe.
    If a line does not end with a full stop, it is merged
    with the next line, removing unnecessary newline characters.

    Args:
        df (pandas.DataFrame): Dataframe with 'content' column

    Returns:
        pandas.DataFrame: Modified dataframe with consolidated lines
    """
    consolidated_data = []

    for index, row in df.iterrows():
        content = row['content']
        lines = content.split('\n')
        consolidated_content = ""
        for i, line in enumerate(lines):
            line = line.strip()
            if line and (line[-1] in ".!?" or i == len(lines) - 1):
                consolidated_content += line + " "  # Append a space after each line
            else:
                consolidated_content += line + " "  # Append a space to separate lines

        consolidated_row = row.copy()
        consolidated_row['content'] = consolidated_content.strip()
        consolidated_data.append(consolidated_row)

    # Create new dataframe with consolidated content
    new_df = pd.DataFrame(consolidated_data)

    return new_df



import re

def split_paragraphs(df):
    """
    Splits content of dataframe into separate rows. 
    If a paragraph exceeds 512 words, it is limited to 
    two nearly equal parts, with the split done at a full stop.
    
    Args:
        df (pandas.DataFrame): Dataframe with 'date' and 'content' columns
    
    Returns:
        pandas.DataFrame: Modified dataframe with split content
    """
    data = []

    for index, row in df.iterrows():
        date = row['date']
        content = row['content']

        # Split content into sentences
        sentences = re.split(r'(?<=[.!?])\s+', content)

        # Process each sentence
        current_part = ""
        for sentence in sentences:
            if len(current_part.split()) + len(sentence.split()) <= 200:
                current_part += " " + sentence
            else:
                if current_part.strip():
                    data.append([date, current_part.strip()])
                current_part = sentence

        # Append remaining part if exists
        if current_part.strip():
            data.append([date, current_part.strip()])

    # Create new dataframe
    new_df = pd.DataFrame(data, columns=['date', 'content'])

    return new_df

# clean the content column and remove stopwords
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Replace newline characters with space
    text = text.replace('\n', ' ')
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Strip extra whitespaces
    text = ' '.join(text.split())
    
    return text

all_data = consolidate_lines(all_data)
all_data = split_paragraphs(all_data)
all_data['content_clean'] = all_data['content'].apply(clean_text)
all_data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ozodbek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date,content,content_clean
0,2006-06-13,Monetary policy strat...,monetary policy strat...
1,2006-06-13,We have found the sel...,found selection annou...
2,2017-09-27,"More broadly, it just...",broadly justified tak...
3,2017-09-27,This brings me to the...,brings third phase cr...
4,2011-10-24,The standard and most...,standard important to...
...,...,...,...
19013,2004-04-23,In their simplest for...,simplest forms rules ...
19014,2017-09-14,A large body of the l...,large body literature...
19015,2017-09-14,This notion goes back...,notion goes back leas...
19016,2008-09-10,The euro area is curr...,euro area currently p...


In [16]:
all_data.to_parquet('src/Output/all_data.parquet')


In [80]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce MX130


In [63]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  0


In [64]:
import regex
# eliminate non-tokenizable characters

def isEnglish(text):
    # This pattern matches characters that belong to various common scripts, including Latin, Greek, Cyrillic, etc.
    # It also matches common punctuation, numbers, and spaces.
    pattern = r'^[\p{Script=Latin}\p{Script=Greek}\p{Script=Cyrillic}\p{Number}\p{Punctuation}\p{Zs}]+$'

    return bool(regex.match(pattern, text))

text = 'axa'
isEnglish(text)


True

In [116]:
#all_data['is_english'] = all_data['content_clean'].apply(isEnglish)
#all_data

In [125]:
all_data = all_data[all_data['is_english']==True]
#all_data_sample = all_data.sample(n=1000, random_state=12)
all_data_sample = all_data.iloc[5001:7000]
all_data_sample

Unnamed: 0,index,date,content,content_clean,is_english,predicted_label,sentiment_0shot
5001,27054,5 May 2023,The unemployment rate...,unemployment rate exp...,True,financial markets are...,Positive
5002,7661,2008-11-21,In spite of the overa...,spite overall trend t...,True,Inflation is rising,Negative
5003,16487,2014-05-14,The most effective wa...,effective way account...,True,financial markets are...,Positive
5004,10027,2010-04-14,"2 Likewise, economic ...",2 likewise economic f...,True,unemployment and infl...,Positive
5005,13588,2012-07-23,Putting an end to cer...,putting end certain s...,True,financial markets are...,Negative
...,...,...,...,...,...,...,...
6995,17139,2014-09-29,And as this process w...,process take place ba...,True,financial markets are...,Positive
6996,19829,2016-06-24,Yves Mersch: Monetary...,yves mersch monetary ...,True,unemployment and infl...,Positive
6997,21138,2017-06-09,This makes the market...,makes market even com...,True,unemployment and infl...,Positive
6998,17606,2015-02-02,If inflation in the e...,inflation euro area r...,True,financial markets are...,Positive


In [126]:
import torch
print(torch.cuda.is_available())


True


In [127]:
from transformers import BertForSequenceClassification, BertTokenizer, pipeline

# Use a GPU device if available
device = 0 if torch.cuda.is_available() else -1

model_name = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

nlp_classifier = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer, device=0)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [128]:
import pandas as pd
all_data_sample['content'] = all_data_sample['content'].astype(str)
all_data_sample = all_data_sample.dropna(subset=['content'])
all_data_sample

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data_sample['content'] = all_data_sample['content'].astype(str)


Unnamed: 0,index,date,content,content_clean,is_english,predicted_label,sentiment_0shot
5001,27054,5 May 2023,The unemployment rate...,unemployment rate exp...,True,financial markets are...,Positive
5002,7661,2008-11-21,In spite of the overa...,spite overall trend t...,True,Inflation is rising,Negative
5003,16487,2014-05-14,The most effective wa...,effective way account...,True,financial markets are...,Positive
5004,10027,2010-04-14,"2 Likewise, economic ...",2 likewise economic f...,True,unemployment and infl...,Positive
5005,13588,2012-07-23,Putting an end to cer...,putting end certain s...,True,financial markets are...,Negative
...,...,...,...,...,...,...,...
6995,17139,2014-09-29,And as this process w...,process take place ba...,True,financial markets are...,Positive
6996,19829,2016-06-24,Yves Mersch: Monetary...,yves mersch monetary ...,True,unemployment and infl...,Positive
6997,21138,2017-06-09,This makes the market...,makes market even com...,True,unemployment and infl...,Positive
6998,17606,2015-02-02,If inflation in the e...,inflation euro area r...,True,financial markets are...,Positive


In [129]:
# Convert into list
content_list = all_data_sample['content_clean'].tolist()

# Define the candidate labels
candidate_labels = ["Negative outlook for the economy",
                    "Positive outlook for the economy",
                    "General speech about the economy and no clear sentiment or message",
                    ]

# Perform zero-shot classification
results = nlp_classifier(content_list, candidate_labels)

# Extract the predicted labels
predicted_labels = [result['labels'][0] for result in results]

# Add the predicted labels to the DataFrame
all_data_sample['predicted_label'] = predicted_labels




In [130]:
all_data_sample['predicted_label'].value_counts()

predicted_label
General speech about the economy and no clear sentiment or message    1159
Negative outlook for the economy                                       545
Positive outlook for the economy                                       295
Name: count, dtype: int64

In [131]:
sent_dic = {
    "Positive outlook for the economy":"Positive",
    "Negative outlook for the economy":"Negative",
    "General speech about the economy and no clear sentiment or message":"Neutral"
}
all_data_sample['sentiment_0shot_binary'] = all_data_sample['predicted_label'].map(sent_dic)

In [132]:
all_data_sample.to_csv('5000_7000_0shot.csv')

In [75]:
pd.set_option('display.max_colwidth', 25)
all_data_sample[all_data_sample["sentiment_0shot_binary"]=="Negative"]

Unnamed: 0,date,content,content_clean,is_english,predicted_label,sentiment_0shot_binary
16253,2014-10-20,Since 2009 Greece has...,since 2009 greece alm...,True,Negative outlook for ...,Negative
6089,2007-08-29,An influential view o...,influential view poor...,True,Negative outlook for ...,Negative
11738,2011-02-22,The global financial ...,global financial cris...,True,Negative outlook for ...,Negative
3291,2004-06-04,Otmar Issing: The eur...,otmar issing euro lis...,True,Negative outlook for ...,Negative
20561,2018-04-24,This continuous deman...,continuous demand pre...,True,Negative outlook for ...,Negative
6349,2007-10-19,As a French citizen a...,french citizen europe...,True,Negative outlook for ...,Negative
16865,2015-05-22,This is by no means t...,means end challenges ...,True,Negative outlook for ...,Negative
22649,2020-08-27,"On balance, the posit...",balance positive effe...,True,Negative outlook for ...,Negative
19832,2017-09-14,One of them – the exc...,one – exchange rate –...,True,Negative outlook for ...,Negative
3019,2004-04-23,"In the end, what matt...",end matters europe re...,True,Negative outlook for ...,Negative
