# Warning
## This notebook is intended only to show how the sentiments are extracted. It demonstrates how the FinBERT model was applied in this study on a high-performance GPU. The run took a few hours on a 40 GB NVIDIA A100 GPU, which is far beyond the performance of a personal laptop. Please take this into consideration before running it.

In [None]:
# Mount Google Drive at /content/drive; the data paths used in this notebook
# point below that mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch

# Report the name of CUDA device 0, or state that no GPU is visible.
print(
    f'GPU Type: {torch.cuda.get_device_name(0)}'
    if torch.cuda.is_available()
    else "No GPU available."
)


GPU Type: NVIDIA A100-SXM4-40GB


In [None]:
# Install Hugging Face transformers into the Colab runtime. The recorded run
# resolved to transformers 4.33.2; pin that version to reproduce it exactly.
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.7 MB/s[0m eta [36m0:00:

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import pandas as pd
import numpy as np

In [None]:
# Load the ECB speeches from Drive; the frame carries 'date' and 'content'
# columns, which the splitting functions below rely on.
data = pd.read_parquet("/content/drive/MyDrive/Thesis/ecb-speeches.parquet")
data

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-09-18,Mr. Duisenberg’s opening statement at the meet...
2,1998-10-12,Mr. Duisenberg speaks on changes in European f...
3,1998-10-12,Mr. Duisenberg's statement to the European Par...
4,1998-10-22,Mr. Duisenberg’s opening statement at the pres...
...,...,...
2250,2022-02-15,Christine Lagarde: 20th anniversary of the ent...
2251,2022-02-17,Christine Lagarde: Introductory statement - Eu...
2252,2022-02-23,Frank Elderson: Prudential pathways to Paris C...
2253,2022-02-23,Frank Elderson: Towards an immersive superviso...


In [None]:
import pandas as pd

def consolidate_lines(df):
    """
    Collapse the line breaks inside each row's 'content' into single spaces.

    Every line of the text is stripped of surrounding whitespace, the lines
    are joined with one space each, and the result is stripped. This happens
    for every line break, regardless of whether the line ends a sentence.
    A blank line contributes an empty piece, so it leaves a double space
    behind.

    Args:
        df (pandas.DataFrame): Dataframe with a 'content' column.

    Returns:
        pandas.DataFrame: A copy of ``df`` (same index, columns and dtypes of
        the other columns) whose 'content' values have been consolidated.
        The input frame is not modified. An empty input yields an empty frame
        with the same columns.
    """
    def _join_lines(text):
        # Leave missing / non-text cells (e.g. NaN) untouched instead of failing.
        if not isinstance(text, str):
            return text
        return " ".join(line.strip() for line in text.split('\n')).strip()

    # Work on a copy so the caller's frame, index and column dtypes are preserved.
    consolidated = df.copy()
    consolidated['content'] = consolidated['content'].map(_join_lines)
    return consolidated



import re

def split_paragraphs(df, max_words=200):
    """
    Split each row's content into chunks made of whole sentences.

    The text is cut into sentences after '.', '!' or '?' followed by
    whitespace. Sentences are then packed greedily into a chunk until adding
    the next one would push the chunk past ``max_words`` whitespace-separated
    words. A single sentence longer than ``max_words`` is kept intact as its
    own chunk; text is never cut in the middle of a sentence.

    Args:
        df (pandas.DataFrame): Dataframe with 'date' and 'content' columns
        max_words (int): Word budget per chunk. Defaults to 200.

    Returns:
        pandas.DataFrame: Dataframe with 'date' and 'content' columns, one
        row per chunk, each chunk carrying the date of the row it came from.
    """
    chunks = []

    for date, content in zip(df['date'], df['content']):
        pending = []        # sentences collected for the chunk being built
        pending_words = 0   # running word count of ``pending``

        for sentence in re.split(r'(?<=[.!?])\s+', content):
            n_words = len(sentence.split())
            if pending_words + n_words <= max_words:
                pending.append(sentence)
                pending_words += n_words
                continue

            # The sentence does not fit: close the current chunk (if it holds
            # any text) and start the next one with this sentence.
            text = " ".join(pending).strip()
            if text:
                chunks.append([date, text])
            pending = [sentence]
            pending_words = n_words

        # Emit whatever is left for this row.
        text = " ".join(pending).strip()
        if text:
            chunks.append([date, text])

    return pd.DataFrame(chunks, columns=['date', 'content'])

# clean the content column and remove stopwords
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (needed before stopwords.words('english')
# can be read by clean_text).
nltk.download('stopwords')

def clean_text(text, stop_words=None):
    """
    Normalise a text: lowercase it, drop punctuation and remove stopwords.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split on whitespace (which also swallows
    newlines), drop the stopwords, and join the remaining words with single
    spaces.

    Args:
        text (str): Text to clean.
        stop_words (collection of str, optional): Words to remove. When
            omitted, the NLTK English stopword list is used; it is loaded on
            the first call and cached on the function for later calls.

    Returns:
        str: The cleaned text.
    """
    if stop_words is None:
        # Load the corpus once and reuse it instead of rebuilding the set on
        # every call.
        stop_words = getattr(clean_text, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            clean_text._english_stop_words = stop_words

    # A translate table deletes all punctuation in one pass.
    # NOTE(review): punctuation goes first, so apostrophe forms such as
    # "don't" reach the stopword lookup as "dont".
    without_punctuation = text.lower().translate(
        str.maketrans('', '', string.punctuation)
    )

    # split() treats newlines and repeated blanks as one separator, so the
    # joined result is already whitespace-normalised.
    return ' '.join(
        word for word in without_punctuation.split() if word not in stop_words
    )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import re
def _peel_head(text, tokenizer, max_tokens, max_words):
    """Cut ``text`` on a word boundary into (head, rest) so that head fits the token budget."""
    words = text.split()
    take = max(1, min(max_words, len(words)))
    # Halve the slice until it fits. A lone word is returned even when it is
    # still too long, because it cannot be split on whitespace.
    while take > 1 and len(tokenizer.tokenize(' '.join(words[:take]))) > max_tokens:
        take //= 2
    return ' '.join(words[:take]), ' '.join(words[take:])


def split_paragraphs(df, tokenizer, max_tokens=512 - 2, max_words=150):
    """
    Split each row's content into chunks sized for the transformer model.

    The text is cut into sentences after '.', '!' or '?' followed by
    whitespace, and sentences are packed into a chunk while the chunk stays
    within ``max_words`` words. A finished chunk that fits ``max_tokens`` is
    emitted whole. A chunk that does not fit is cut on word boundaries: the
    leading piece is emitted and the remaining words are carried over into
    the next chunk, so no text is dropped and no empty chunk is produced.

    Args:
        df (pandas.DataFrame): Dataframe with 'date' and 'content' columns
        tokenizer: The tokenizer used for the transformer model; only its
            ``tokenize(text)`` method is called.
        max_tokens (int): Token budget per chunk. Defaults to 512 - 2, leaving
            room for the [CLS] and [SEP] tokens.
        max_words (int): Word budget used while packing sentences and as the
            starting size when an oversized chunk is cut. Defaults to 150.

    Returns:
        pandas.DataFrame: Dataframe with 'date' and 'content' columns, one
        row per chunk.
    """
    data = []

    for date, content in zip(df['date'], df['content']):
        # Split content into sentences
        sentences = re.split(r'(?<=[.!?])\s+', content)

        current_part = ""
        for sentence in sentences:
            if len(current_part.split()) + len(sentence.split()) <= max_words:
                current_part += " " + sentence
                continue

            if len(tokenizer.tokenize(current_part)) <= max_tokens:
                # Nothing has been collected yet when the very first sentence
                # is already over the word budget; do not emit an empty chunk.
                if current_part.strip():
                    data.append([date, current_part.strip()])
                current_part = sentence
            else:
                head, rest = _peel_head(current_part, tokenizer, max_tokens, max_words)
                if head:
                    data.append([date, head])
                # Keep a separator so the carried-over words do not fuse with
                # the first word of the next sentence.
                current_part = (rest + " " + sentence).strip()

        # Flush the remainder, cutting it further while it is over the token
        # budget instead of discarding it.
        while current_part.strip():
            if len(tokenizer.tokenize(current_part)) <= max_tokens:
                data.append([date, current_part.strip()])
                break
            head, current_part = _peel_head(current_part, tokenizer, max_tokens, max_words)
            data.append([date, head])

    # Create new dataframe
    return pd.DataFrame(data, columns=['date', 'content'])


In [None]:
# Load the FinBERT tokenizer so chunk sizes are measured with the model's own
# tokenization, then split every speech into chunks.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
data_split = split_paragraphs(data, tokenizer)


In [None]:
# Preview the chunked speeches (one row per date/content chunk).
data_split

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-07-17,Economic growth has been driven increasingly b...
2,1998-07-17,"In principle, the economic performance I have ..."
3,1998-07-17,"Third, further structural adjustments in fisca..."
4,1998-07-17,There is currently no sign of exchange rate te...
...,...,...
46077,2022-02-25,Since the exchange of confidential information...
46078,2022-02-25,"Last year, following discussions in the Counci..."
46079,2022-02-25,This type of measure has so far only been impl...
46080,2022-02-25,The ECB incorporated in its opinions its exper...


In [None]:
# Work on a copy so the following steps do not modify data_split.
# For a quick trial run, a small random sample can be used instead,
# e.g. data.sample(n=10).
subset = data_split.copy()
subset

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-07-17,Economic growth has been driven increasingly b...
2,1998-07-17,"In principle, the economic performance I have ..."
3,1998-07-17,"Third, further structural adjustments in fisca..."
4,1998-07-17,There is currently no sign of exchange rate te...
...,...,...
46077,2022-02-25,Since the exchange of confidential information...
46078,2022-02-25,"Last year, following discussions in the Counci..."
46079,2022-02-25,This type of measure has so far only been impl...
46080,2022-02-25,The ECB incorporated in its opinions its exper...


In [None]:
# Load the tokenizer and model
# NOTE(review): `tokenizer` is re-created in later cells, and the sentiment
# step below goes through `pipe`; `model` does not appear to be used directly
# in the cells shown.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Place the classifier on the first CUDA device when one is visible. Without
# an explicit `device`, the pipeline in the transformers version used for this
# run stays on the CPU (device=-1), leaving the GPU idle.
pipe = pipeline(
    "text-classification",
    model="ProsusAI/finbert",
    device=0 if torch.cuda.is_available() else -1,
)

# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")



In [None]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# The model takes at most 512 tokens, two of which are the [CLS] and [SEP]
# tokens added around the text, so the text itself may use 510.
MAX_CONTENT_TOKENS = 512 - 2

# Function to get the token count
def token_count(content):
    """Return the number of tokens ``tokenizer.tokenize`` produces for ``content``."""
    return len(tokenizer.tokenize(content))

# Keep only the chunks that fit the model. `.copy()` makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
subset = subset[subset['content'].apply(token_count) <= MAX_CONTENT_TOKENS].copy()


In [None]:
# Preview the chunks that remain after the token-length filter.
subset

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-07-17,Economic growth has been driven increasingly b...
2,1998-07-17,"In principle, the economic performance I have ..."
3,1998-07-17,"Third, further structural adjustments in fisca..."
4,1998-07-17,There is currently no sign of exchange rate te...
...,...,...
46077,2022-02-25,Since the exchange of confidential information...
46078,2022-02-25,"Last year, following discussions in the Counci..."
46079,2022-02-25,This type of measure has so far only been impl...
46080,2022-02-25,The ECB incorporated in its opinions its exper...


In [None]:
# Using 'pipe'
def extract_sentiment(text):
    """
    Classify one text with the FinBERT pipeline.

    Args:
        text (str): Text to classify.

    Returns:
        tuple: ``(label, score)`` taken from the first result that ``pipe``
        returns for ``text``.
    """
    result = pipe(text)[0]
    return result['label'], result['score']

# Score every row of the 'content' column. The results are collected in their
# own frame (aligned on subset's index) and attached with `assign`, which
# builds a new frame instead of writing into a possible slice, so no
# SettingWithCopyWarning is raised and an empty subset is handled as well.
sentiment_results = pd.DataFrame(
    subset['content'].apply(extract_sentiment).tolist(),
    index=subset.index,
    columns=['sentiment', 'probability'],
)
subset = subset.assign(
    sentiment=sentiment_results['sentiment'],
    probability=sentiment_results['probability'],
)
subset


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['sentiment'], subset['probability'] = zip(*subset['content'].apply(extract_sentiment))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['sentiment'], subset['probability'] = zip(*subset['content'].apply(extract_sentiment))


Unnamed: 0,date,content,sentiment,probability
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...,positive,0.854208
1,1998-07-17,Economic growth has been driven increasingly b...,positive,0.947828
2,1998-07-17,"In principle, the economic performance I have ...",neutral,0.600082
3,1998-07-17,"Third, further structural adjustments in fisca...",neutral,0.824068
4,1998-07-17,There is currently no sign of exchange rate te...,neutral,0.756298
...,...,...,...,...
46077,2022-02-25,Since the exchange of confidential information...,positive,0.664321
46078,2022-02-25,"Last year, following discussions in the Counci...",neutral,0.762738
46079,2022-02-25,This type of measure has so far only been impl...,neutral,0.897179
46080,2022-02-25,The ECB incorporated in its opinions its exper...,neutral,0.868921


In [None]:
# Persist the scored chunks to Drive. The index is written as well (to_csv
# default), so it comes back as an unnamed first column when the file is read.
subset.to_csv("/content/drive/MyDrive/Thesis/speech_all_finbert.csv")

In [None]:
# Count how many chunks received each sentiment label.
subset['sentiment'].value_counts()

neutral     29990
positive     8699
negative     7330
Name: sentiment, dtype: int64

# After the first run, there is no need to repeat the whole sentiment extraction process. The results can simply be loaded from src/Data/speech_all_finbert.csv