In [1]:
import os
import pandas as pd
from finicialTextProcessor import FinancialTextProcessor
from finBertCustomDataset import CustomDataset
from finBERTClassifier import FinBERTClassifier
from finBertTrainer import FinBERTTrainer
import torch
from sklearn.model_selection import train_test_split
import re
import csv
import itertools
import numpy as np
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from contractions import fix
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
# warnings.filterwarnings('ignore') # setting ignore as a parameter

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hank\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hank\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hank\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure the input file locations.
# Both files live under a single preprocessing directory. It defaults to the
# original local checkout, but can be overridden through the
# EARNINGS_CALL_PREPROCESSING_DIR environment variable so the notebook can run
# on another machine without editing this cell.
PREPROCESSING_DIR = os.environ.get(
    'EARNINGS_CALL_PREPROCESSING_DIR',
    r'C:\Users\Hank\earnings-call-predict-stock-price-movement\preprocessing_code',
)

# Loughran-McDonald master dictionary CSV (passed to FinancialTextProcessor).
lm_dict_filepath = os.path.join(
    PREPROCESSING_DIR,
    'Loughran-McDonald_MasterDictionary_1993-2023.csv',
)

# Labelled dataset CSV (read with pd.read_csv).
dataset_filepath = os.path.join(
    PREPROCESSING_DIR,
    'dataset',
    '20240531_nasdaq_three_class_label.csv',
)

In [3]:
# Initialize the text processor, handing it the path of the Loughran-McDonald
# master dictionary CSV stored in `lm_dict_filepath`.
processor = FinancialTextProcessor(lm_dict_filepath)

In [4]:
# Load the labelled dataset CSV and run it through the text processor to
# obtain `denoised_df`.
# NOTE(review): the call logs lines of the form "Index / len / Paragraph"
# (speaker names, empty paragraphs, hand-over sentences) — presumably the
# paragraphs being filtered out as noise; confirm in FinancialTextProcessor.
df = pd.read_csv(dataset_filepath)
denoised_df = processor.process_and_create_denoised_df(df)

Index: 2, len: 68, Paragraph: Harrison Masters--Director, Investor Relations and Strategic Finance
Index: 5, len: 0, Paragraph: 
Index: 6, len: 0, Paragraph: 
Index: 7, len: 50, Paragraph: Pierre Naude--Chairman and Chief Executive Officer
Index: 23, len: 77, Paragraph: With that, I'll turn the call over to Greg to take us through the financials.
Index: 24, len: 39, Paragraph: Greg Orenstein--Chief Financial Officer
Index: 36, len: 55, Paragraph: With that, operator, we'll open the line for questions.
Index: 37, len: 8, Paragraph: Operator
Index: 39, len: 40, Paragraph: Adam Hotchkiss--Goldman Sachs -- Analyst
Index: 41, len: 50, Paragraph: Pierre Naude--Chairman and Chief Executive Officer
Index: 46, len: 39, Paragraph: Greg Orenstein--Chief Financial Officer
Index: 48, len: 40, Paragraph: Adam Hotchkiss--Goldman Sachs -- Analyst
Index: 51, len: 39, Paragraph: Greg Orenstein--Chief Financial Officer
Index: 55, len: 40, Paragraph: Adam Hotchkiss--Goldman Sachs -- Analyst
Index: 56, len

In [7]:
# Display the denoised frame (columns: paragraphs, three_class_label) for a
# visual sanity check.
denoised_df

Unnamed: 0,paragraphs,three_class_label
0,"Additionally, annual merit increases affected ...",0
1,One is that we -- and this is going back a yea...,-1
2,For risks that could cause actual results to b...,0
3,"5% of the code represents 99% of the run time,...",1
4,We are pleased to report top-line growth and c...,0
5,We introduced real-time intent recognition tec...,0
6,This includes total company revenue growth of ...,-1
7,Statements made on this call include forward-l...,0
8,"Before we get started, we want to emphasize th...",-1
9,"Also, on this page you'll be able to find a co...",0


In [8]:
# Mapping from the raw labels {-1, 0, 1} to contiguous class ids {0, 1, 2}.
label_mapping = {-1: 0, 0: 1, 1: 2}

# Remap the label column in place, but only once per DataFrame.
# Series.map turns every value missing from the mapping into NaN, so running
# this cell a second time on already-remapped labels would silently corrupt
# them (2 -> NaN, 0 -> 1, 1 -> 2). The flag stored in `attrs` makes the cell
# safe to re-run; a freshly created `denoised_df` carries no flag and is
# remapped normally.
if not denoised_df.attrs.get('three_class_label_remapped', False):
    remapped_labels = denoised_df['three_class_label'].map(label_mapping)

    # Fail loudly instead of letting NaN labels flow into training.
    unmapped_mask = remapped_labels.isna()
    if unmapped_mask.any():
        unexpected_labels = (
            denoised_df.loc[unmapped_mask, 'three_class_label'].unique().tolist()
        )
        raise ValueError(
            "three_class_label contains values outside "
            f"{sorted(label_mapping)}: {unexpected_labels}. "
            "If the labels were already remapped, re-create denoised_df first."
        )

    denoised_df['three_class_label'] = remapped_labels
    denoised_df.attrs['three_class_label_remapped'] = True

denoised_df

Unnamed: 0,paragraphs,three_class_label
0,"Additionally, annual merit increases affected ...",1
1,One is that we -- and this is going back a yea...,0
2,For risks that could cause actual results to b...,1
3,"5% of the code represents 99% of the run time,...",2
4,We are pleased to report top-line growth and c...,1
5,We introduced real-time intent recognition tec...,1
6,This includes total company revenue growth of ...,0
7,Statements made on this call include forward-l...,1
8,"Before we get started, we want to emphasize th...",0
9,"Also, on this page you'll be able to find a co...",1


In [6]:
# Model / training settings shared by the rest of the notebook: the same dict
# is handed to CustomDataset and FinBERTTrainer, and `batch_size` is also read
# when the DataLoaders are built.
# NOTE(review): "config" presumably names the pretrained checkpoint and
# "max_len" the token limit per sample — confirm in CustomDataset / FinBERTTrainer.
args = dict(
    config="bert-base-uncased",
    max_len=512,
    num_class=3,        # three-way labels (0, 1, 2 after remapping)
    batch_size=16,
    epochs=5,
    learning_rate=2e-5,
)

In [7]:
# Split the dataset in two steps with a fixed seed for reproducibility:
#   1) 80% of the rows go to training, the remaining 20% to a temporary pool;
#   2) the pool is halved into validation and test (roughly 10% each).
# NOTE(review): the split is not stratified on three_class_label, so class
# proportions may differ between the three subsets.
train_df, temp_df = train_test_split(
    denoised_df,
    train_size=0.8,
    random_state=1111,
)
val_df, test_df = train_test_split(
    temp_df,
    train_size=0.5,
    random_state=1111,
)

In [8]:
# Build one CustomDataset per split. Every split reads the 'paragraphs' column
# and uses the same settings and method="first".
# NOTE(review): method="first" presumably keeps the first `max_len` tokens of
# each paragraph — confirm in CustomDataset.
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_name, split_df, 'paragraphs', args, method="first")
    for split_name, split_df in (
        ('train', train_df),
        ('val', val_df),
        ('test', test_df),
    )
)

# Wrap each dataset in a DataLoader. Only the training loader shuffles;
# validation and test keep their row order.
train_loader, val_loader, test_loader = (
    torch.utils.data.DataLoader(
        split_dataset,
        batch_size=args['batch_size'],
        shuffle=shuffle_split,
    )
    for split_dataset, shuffle_split in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

In [None]:
# Create the trainer from the shared settings and run training, passing both
# the training and the validation loader.
trainer = FinBERTTrainer(args)
train_record = trainer.train(train_loader, val_loader)

# Print every entry of the returned training record, one "name: value" line
# per entry.
for metric_name in train_record:
    metric_value = train_record[metric_name]
    print(f"{metric_name}: {metric_value}")

In [None]:
# Evaluate the trained model on the validation loader. `evaluate` returns five
# values, unpacked in order as loss, accuracy, F1, recall and precision.
val_metrics = trainer.evaluate(val_loader)
val_loss, val_acc, val_f1, val_rec, val_prec = val_metrics

# Report all five metrics on a single line, four decimals each.
print(
    f"Validation Loss: {val_loss:.4f}, "
    f"Validation Accuracy: {val_acc:.4f}, "
    f"Validation F1: {val_f1:.4f}, "
    f"Validation Recall: {val_rec:.4f}, "
    f"Validation Precision: {val_prec:.4f}"
)

In [None]:
# Run prediction on the test loader; `predict` returns two values, unpacked
# as `total_probs` and `total_pred`.
total_probs, total_pred = trainer.predict(test_loader)

# Attach the predictions as a new 'pred' column on a copy of the test split,
# leaving `test_df` itself untouched.
# NOTE(review): assumes `total_pred` holds one prediction per test row, in
# test_df row order (the test loader is built with shuffle=False) — confirm.
res = test_df.assign(pred=total_pred)
res