In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [13]:
speeches = pd.read_parquet('src/Data/ecb-speeches.parquet')
releases = pd.read_csv("src/Data/ecb_releases_302.csv")
announcements = pd.read_csv("src/Data/policy_announcements.csv")

In [14]:
all_data = pd.concat([speeches, releases, announcements], ignore_index=True)
all_data

Unnamed: 0,date,content
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...
1,1998-09-18,Mr. Duisenberg’s opening statement at the meet...
2,1998-10-12,Mr. Duisenberg speaks on changes in European f...
3,1998-10-12,Mr. Duisenberg's statement to the European Par...
4,1998-10-22,Mr. Duisenberg’s opening statement at the pres...
...,...,...
2846,8 May 2014,"At today’s meeting, which was held in Brussels..."
2847,3 April 2014,At today’s meeting the Governing Council of th...
2848,6 March 2014,At today’s meeting the Governing Council of th...
2849,6 February 2014,At today’s meeting the Governing Council of th...


In [9]:
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Replace newline characters with space
    text = text.replace('\n', ' ')
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Strip extra whitespaces
    text = ' '.join(text.split())
    
    return text

all_data['content_clean'] = all_data['content'].apply(clean_text)
all_data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ozodbek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date,content,content_clean
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...,mr duisenberg reports outcome second meeting g...
1,1998-09-18,Mr. Duisenberg’s opening statement at the meet...,mr duisenberg’s opening statement meeting gove...
2,1998-10-12,Mr. Duisenberg speaks on changes in European f...,mr duisenberg speaks changes european financia...
3,1998-10-12,Mr. Duisenberg's statement to the European Par...,mr duisenbergs statement european parliament’s...
4,1998-10-22,Mr. Duisenberg’s opening statement at the pres...,mr duisenberg’s opening statement press confer...
...,...,...,...
2846,8 May 2014,"At today’s meeting, which was held in Brussels...",today’s meeting held brussels governing counci...
2847,3 April 2014,At today’s meeting the Governing Council of th...,today’s meeting governing council ecb decided ...
2848,6 March 2014,At today’s meeting the Governing Council of th...,today’s meeting governing council ecb decided ...
2849,6 February 2014,At today’s meeting the Governing Council of th...,today’s meeting governing council ecb decided ...


In [15]:
import pandas as pd

def consolidate_lines(df):
    """
    Consolidate lines in the content of a dataframe.
    If a line does not end with a full stop, it is merged
    with the next line, removing unnecessary newline characters.

    Args:
        df (pandas.DataFrame): Dataframe with 'content' column

    Returns:
        pandas.DataFrame: Modified dataframe with consolidated lines
    """
    consolidated_data = []

    for index, row in df.iterrows():
        content = row['content']
        lines = content.split('\n')
        consolidated_content = ""
        for i, line in enumerate(lines):
            line = line.strip()
            if line and (line[-1] in ".!?" or i == len(lines) - 1):
                consolidated_content += line + " "  # Append a space after each line
            else:
                consolidated_content += line + " "  # Append a space to separate lines

        consolidated_row = row.copy()
        consolidated_row['content'] = consolidated_content.strip()
        consolidated_data.append(consolidated_row)

    # Create new dataframe with consolidated content
    new_df = pd.DataFrame(consolidated_data)

    return new_df



import re

def split_paragraphs(df):
    """
    Splits content of dataframe into separate rows. 
    If a paragraph exceeds 512 words, it is limited to 
    two nearly equal parts, with the split done at a full stop.
    
    Args:
        df (pandas.DataFrame): Dataframe with 'date' and 'content' columns
    
    Returns:
        pandas.DataFrame: Modified dataframe with split content
    """
    data = []

    for index, row in df.iterrows():
        date = row['date']
        content = row['content']

        # Split content into sentences
        sentences = re.split(r'(?<=[.!?])\s+', content)

        # Process each sentence
        current_part = ""
        for sentence in sentences:
            if len(current_part.split()) + len(sentence.split()) <= 250:
                current_part += " " + sentence
            else:
                if current_part.strip():
                    data.append([date, current_part.strip()])
                current_part = sentence

        # Append remaining part if exists
        if current_part.strip():
            data.append([date, current_part.strip()])

    # Create new dataframe
    new_df = pd.DataFrame(data, columns=['date', 'content'])

    return new_df

# clean the content column and remove stopwords
import string
import nltk
from nltk.corpus import stopwords

# Download stopwords if you haven't already
nltk.download('stopwords')

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Replace newline characters with space
    text = text.replace('\n', ' ')
    
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # Strip extra whitespaces
    text = ' '.join(text.split())
    
    return text

all_data = consolidate_lines(all_data)
all_data = split_paragraphs(all_data)
all_data['content_clean'] = all_data['content'].apply(clean_text)
all_data

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ozodbek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,date,content,content_clean
0,1998-07-17,Mr. Duisenberg reports on the outcome of the s...,mr duisenberg reports outcome second meeting g...
1,1998-07-17,As regards monetary and financial developments...,regards monetary financial developments euro a...
2,1998-07-17,I would not at the present time want to try to...,would present time want try convey strong impr...
3,1998-07-17,The committees are as follows (listed in alpha...,committees follows listed alphabetical order a...
4,1998-07-17,In the light of the potential burden which a m...,light potential burden minimum reserve system ...
...,...,...,...
28033,8 May 2014,"At today’s meeting, which was held in Brussels...",today’s meeting held brussels governing counci...
28034,3 April 2014,At today’s meeting the Governing Council of th...,today’s meeting governing council ecb decided ...
28035,6 March 2014,At today’s meeting the Governing Council of th...,today’s meeting governing council ecb decided ...
28036,6 February 2014,At today’s meeting the Governing Council of th...,today’s meeting governing council ecb decided ...


In [16]:
all_data.to_parquet('src/Output/all_data.parquet')


In [10]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


  return torch._C._cuda_getDeviceCount() > 0


In [11]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


2023-09-03 13:40:07.155878: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-03 13:40:07.198390: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  0


2023-09-03 13:40:08.868903: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_INVALID_VALUE: invalid argument
2023-09-03 13:40:08.873504: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: ozodbek-HP
2023-09-03 13:40:08.873553: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: ozodbek-HP
2023-09-03 13:40:08.873795: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 390.157.0
2023-09-03 13:40:08.873856: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.86.5
2023-09-03 13:40:08.873868: E tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:312] kernel version 535.86.5 does not match DSO version 390.157.0 -- cannot find working devices in this configuration
