In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 把載入資料集改成 RoBERTa 的格式

In [None]:
from datasets import load_dataset 
# Load the Financial PhraseBank dataset using its "sentences_allagree" configuration.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run locally — confirm this is acceptable for the execution environment.
dataset = load_dataset(
    "financial_phrasebank",
    "sentences_allagree",
    trust_remote_code=True,
)

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load the pretrained "roberta-base" tokenizer and a sequence-classification
# model built on the same checkpoint, configured with 3 output labels.
tokenizer = RobertaTokenizer.from_pretrained(
    "roberta-base",
)
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=3,
)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into an 80% training set and a 20% test set.
# The seed is fixed so the shuffle produces the same partition on every
# "Restart & Run All"; without it each run would train and evaluate on a
# different split and results could not be compared between runs.
train_test_data = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Show the resulting "train" and "test" splits.
print(train_test_data)


In [None]:
# Display the train/test split object again (it was already printed right after the split was created).
print(train_test_data)

In [None]:

# Tokenization helper
def tokenize_function(example):
    """Tokenize the "sentence" entry of ``example`` with the module-level tokenizer.

    The tokenizer is asked to truncate and to pad to a fixed length of 128.

    :param example: mapping that provides the text under the "sentence" key
    :return: the tokenizer's output for those sentences
    """
    sentences = example["sentence"]
    return tokenizer(sentences, truncation=True, padding="max_length", max_length=128)

# Tokenize both splits in batches, then rename the target column
# from "label" to "labels".
encoded_dataset = (
    train_test_data
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Expose only the token ids, attention mask and labels, returned as torch tensors.
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# 設定訓練參數 fine-tuning

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration.
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir="./results",             # directory for model outputs
    logging_dir="./logs",               # directory for logs
    logging_steps=10,                   # logging frequency, in steps
    report_to=[],                       # no reporting integrations (disables W&B)
    # --- evaluation and checkpointing: once per epoch ---
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # --- optimisation ---
    num_train_epochs=3,                 # number of training epochs
    learning_rate=2e-5,                 # learning rate
    weight_decay=0.01,                  # weight decay
    per_device_train_batch_size=16,     # training batch size per device
    per_device_eval_batch_size=16,      # evaluation batch size per device
)

# Wire the model, the arguments and the two encoded splits into a Trainer.
# The "test" split is what gets evaluated at the end of each epoch.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
)

# Start fine-tuning.
trainer.train()


In [None]:
# Save the fine-tuned model and its tokenizer into the same directory so they can be reloaded together.
model.save_pretrained("./finetuned_roberta")
tokenizer.save_pretrained("./finetuned_roberta")

In [None]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Reload the fine-tuned model and its tokenizer from the "./finetuned_roberta" directory.
model = RobertaForSequenceClassification.from_pretrained(
    "./finetuned_roberta",
)
tokenizer = RobertaTokenizer.from_pretrained(
    "./finetuned_roberta",
)

# 測試分類

In [None]:
# Sentiment test data: hand-written example sentences used to sanity-check the
# classifier, grouped by the sentiment the author intended (five per group).
test_data = [
    # Positive
    "The company's recent financial report exceeded market expectations.",
    "Investors are confident in this new technology, leading to a significant stock price increase.",
    "The management's decisive actions have resulted in substantial profits.",
    "This deal will significantly expand the company's market share.",
    "Customer feedback on the product has been overwhelmingly positive, driving sales upward.",

    # Neutral
    "The company plans to expand into new markets next year, but details are not yet disclosed.",
    "The stock market showed little movement today as investors remained cautious.",
    "The board meeting proceeded as scheduled, discussing future growth strategies.",
    "The management announced a change in CEO, with a calm market response.",
    "The company reported revenue growth in the quarterly report but fell short of its target.",

    # Negative
    "The product's market performance fell below expectations, leading to a decline in quarterly earnings.",
    "Investors were disappointed with this policy, causing the stock price to drop.",
    "The company is embroiled in a legal dispute and may face significant penalties.",
    "A competitor's new product has created substantial pressure on the company.",
    "Customer complaints about poor after-sales service have damaged the brand's reputation."
]


In [None]:
import torch

def classify_text(texts, model, tokenizer):
    """
    Classify texts with a fine-tuned RoBERTa sequence-classification model.

    :param texts: list of texts to classify
    :param model: fine-tuned model; it is called with the tokenizer output as
        keyword arguments and must return an object with a ``logits`` attribute
    :param tokenizer: tokenizer matching the model
    :return: NumPy array holding the predicted class index for each text
        (an empty int64 array when ``texts`` is an empty list/tuple)
    """
    # Nothing to classify: return early instead of pushing an empty batch
    # through the tokenizer and the model.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return torch.empty(0, dtype=torch.long).numpy()

    model.eval()  # switch to evaluation mode
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

    # The tokenizer output may live on a different device than the model.
    # Move every input tensor to the model's device to avoid a device-mismatch
    # error when the model sits on a GPU.
    device = getattr(model, "device", None)
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)

    # Bring the result back to the CPU before converting to NumPy.
    return predictions.cpu().numpy()


In [None]:
# Run the classifier over the sample sentences.
predictions = classify_text(test_data, model, tokenizer)

# Translate each predicted class index into a sentiment name.
# The index-to-name mapping assumes the model uses these three labels in this order.
label_mapping = {0: "Negative", 1: "Neutral", 2: "Positive"}
results = list(map(label_mapping.__getitem__, predictions))

# Print every sentence together with its predicted sentiment.
for text, sentiment in zip(test_data, results):
    print(f"Text: {text}\nSentiment: {sentiment}\n")
