In [1]:
from datasets import load_dataset

# Load the "sentences_allagree" configuration of the financial_phrasebank dataset.
# trust_remote_code=True allows the dataset's own loading script to be executed.
dataset = load_dataset(
    "financial_phrasebank",
    name="sentences_allagree",
    trust_remote_code=True,
)

financial_phrasebank.py:   0%|          | 0.00/6.04k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.88k [00:00<?, ?B/s]

FinancialPhraseBank-v1.0.zip:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification

# Class-id <-> name mapping stored in the model config, so the saved checkpoint
# carries readable label names instead of the generic LABEL_0..LABEL_2.
# NOTE(review): the id order is assumed to match the dataset's label encoding
# (0=negative, 1=neutral, 2=positive) — confirm against the dataset features.
ID2LABEL = {0: "Negative", 1: "Neutral", 2: "Positive"}
LABEL2ID = {label: idx for idx, label in ID2LABEL.items()}

# Load the pretrained BERT tokenizer and a 3-class sequence-classification model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(ID2LABEL),  # 3 labels
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from sklearn.model_selection import train_test_split

# Split the data into 80% train / 20% test.
# A fixed seed makes the shuffle (and therefore the split) identical on every run,
# so training and evaluation numbers are reproducible after a kernel restart.
train_test_data = dataset["train"].train_test_split(test_size=0.2, seed=42)


In [4]:
# Show the resulting splits (column names and row counts).
print(train_test_data)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1811
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 453
    })
})


In [5]:

def tokenize_function(example):
    """Tokenize the "sentence" field, truncating and padding every entry to 128 tokens."""
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["sentence"], **encoding_options)

# Tokenize both splits in batches, then rename the "label" column to "labels"
encoded_dataset = (
    train_test_data
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Return torch tensors and expose only the listed columns
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

Map:   0%|          | 0/1811 [00:00<?, ? examples/s]

Map:   0%|          | 0/453 [00:00<?, ? examples/s]

# 設定訓練參數 fine-tuning

In [6]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration, grouped by purpose
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir="./results",             # directory for model outputs
    logging_dir="./logs",               # directory for logs
    logging_steps=10,                   # log every 10 steps
    report_to=[],                       # no reporting integrations (disables W&B)
    # --- schedule ---
    num_train_epochs=3,                 # number of training epochs
    evaluation_strategy="epoch",        # evaluate once per epoch
    save_strategy="epoch",              # save a checkpoint once per epoch
    # --- optimisation ---
    learning_rate=2e-5,                 # learning rate
    weight_decay=0.01,                  # weight decay
    per_device_train_batch_size=16,     # training batch size per device
    per_device_eval_batch_size=16,      # evaluation batch size per device
)

# The held-out "test" split doubles as the per-epoch evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=encoded_dataset["test"],
    train_dataset=encoded_dataset["train"],
)

# Start training
trainer.train()




Epoch,Training Loss,Validation Loss
1,0.4254,0.293433
2,0.0889,0.14102
3,0.0505,0.11226


TrainOutput(global_step=342, training_loss=0.2603304992642319, metrics={'train_runtime': 3890.6775, 'train_samples_per_second': 1.396, 'train_steps_per_second': 0.088, 'total_flos': 357373799629056.0, 'train_loss': 0.2603304992642319, 'epoch': 3.0})

In [7]:
# Save the model and its tokenizer into the same directory.
model.save_pretrained("./finetuned_bert")
tokenizer.save_pretrained("./finetuned_bert")

('./finetuned_bert/tokenizer_config.json',
 './finetuned_bert/special_tokens_map.json',
 './finetuned_bert/vocab.txt',
 './finetuned_bert/added_tokens.json')

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the tokenizer and the fine-tuned model from the saved directory
tokenizer = BertTokenizer.from_pretrained("./finetuned_bert")
model = BertForSequenceClassification.from_pretrained("./finetuned_bert")

# 測試分類

In [9]:
# Sentiment test data: six hand-written sentences, two per intended sentiment.
# The group comments below give the sentiment each pair is meant to express.
test_data = [
    # Positive
    "The company announced record-breaking profits, exciting investors and stakeholders alike.",
    "Innovative product launches have strengthened the company's position in the market, boosting sales.",

    # Neutral
    "The quarterly report was released today, outlining the company's financial performance.",
    "The CEO attended a global summit to discuss industry trends but made no major announcements.",

    # Negative
    "Supply chain disruptions have delayed product deliveries, frustrating customers.",
    "The company's stock value dropped sharply after missing earnings forecasts."
]


In [10]:
import torch

def classify_text(texts, model, tokenizer, max_length=512):
    """
    Classify texts with a fine-tuned BERT sequence-classification model.

    :param texts: list of strings to classify (a single string is passed
        through to the tokenizer unchanged)
    :param model: fine-tuned model; it is called with the tokenizer output as
        keyword arguments and must return an object with a ``logits`` attribute
    :param tokenizer: tokenizer matching the model
    :param max_length: truncation limit in tokens passed to the tokenizer
        (defaults to 512, the value previously hard-coded here)
    :return: numpy array holding the predicted class index of each text;
        an empty array when ``texts`` is an empty sequence
    """
    # Nothing to classify: return an empty result instead of handing the
    # tokenizer an empty batch.
    if not isinstance(texts, str) and len(texts) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    model.eval()  # switch to evaluation mode
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=max_length)

    # Put the input tensors on the same device as the model; without this the
    # forward pass fails whenever the model lives on a GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)

    return predictions.cpu().numpy()


In [11]:
# Classify the sample sentences
predictions = classify_text(test_data, model, tokenizer)

# Translate predicted class ids into label names
# (assumes the model was trained with these three labels in this id order)
label_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
results = [label_mapping[class_id] for class_id in predictions]

# Print every sentence together with its predicted sentiment
for text, sentiment in zip(test_data, results):
    print(f"Text: {text}\nSentiment: {sentiment}\n")


Text: The company announced record-breaking profits, exciting investors and stakeholders alike.
Sentiment: Positive

Text: Innovative product launches have strengthened the company's position in the market, boosting sales.
Sentiment: Positive

Text: The quarterly report was released today, outlining the company's financial performance.
Sentiment: Neutral

Text: The CEO attended a global summit to discuss industry trends but made no major announcements.
Sentiment: Neutral

Text: Supply chain disruptions have delayed product deliveries, frustrating customers.
Sentiment: Negative

Text: The company's stock value dropped sharply after missing earnings forecasts.
Sentiment: Negative

