# 前置動作

In [None]:
pip install transformers datasets accelerate torch

In [None]:
pip install evaluate rouge-score bert-score

In [None]:
pip install sentencepiece

In [None]:
pip install peft==0.5.0

In [None]:
pip uninstall peft triton -y

數據前處理。分割訓練和驗證集

In [None]:
import json
from datasets import Dataset

# Load train.json / test.json (JSON Lines: one JSON object per line).
# The explicit UTF-8 encoding keeps the read independent of the platform's default codec,
# and blank lines are skipped instead of crashing json.loads.
with open('train.json', 'r', encoding='utf-8') as f:
    train_data = [json.loads(line) for line in f if line.strip()]

with open('test.json', 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# Convert to a Hugging Face Dataset
train_dataset = Dataset.from_list(train_data)

# Split into 80% training / 20% validation with a fixed seed
split_dataset = train_dataset.train_test_split(test_size=0.2, seed=42)
train_split = split_dataset['train']  # training part
valid_split = split_dataset['test']   # validation part

print(f"訓練集大小: {len(train_split)}")
print(f"驗證集大小: {len(valid_split)}")
print(f"測試集大小: {len(test_data)}")

In [None]:
import json
from transformers import PegasusTokenizer
import numpy as np

# Load the tokenizer
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load the data (JSON Lines). The explicit UTF-8 encoding keeps the read independent of the
# platform's default codec; blank lines are skipped instead of crashing json.loads.
with open('train.json', 'r', encoding='utf-8') as f:
    train_data = [json.loads(line) for line in f if line.strip()]

with open('test.json', 'r', encoding='utf-8') as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# 函數：計算 token 長度
def get_token_lengths(data, field):
    """Return the token count of ``item[field]`` for every item in ``data``.

    Uses the module-level ``tokenizer``; special tokens are included in each count.
    """
    token_counts = []
    for record in data:
        encoded = tokenizer.encode(record[field], add_special_tokens=True)
        token_counts.append(len(encoded))
    return token_counts

# Token lengths of the introduction and abstract fields in train.json
train_intro_lengths, train_abs_lengths = (
    get_token_lengths(train_data, field) for field in ('introduction', 'abstract')
)

# Token lengths of the introduction field in test.json
test_intro_lengths = get_token_lengths(test_data, 'introduction')

# 統計信息
def print_stats(lengths, name):
    """Print mean, median, max, min and 90th percentile of ``lengths`` under the heading ``name``."""
    print(f"{name} 長度統計：")
    # NumPy statistics are shown with one decimal place.
    for label, statistic in (("平均長度", np.mean), ("中位數", np.median)):
        print(f"{label}: {statistic(lengths):.1f}")
    # Extremes are printed as-is.
    for label, statistic in (("最大長度", max), ("最小長度", min)):
        print(f"{label}: {statistic(lengths)}")
    print(f"90% 分位數: {np.percentile(lengths, 90):.1f}")
    print("---")

# Report the statistics of each length list
for _lengths, _title in (
    (train_intro_lengths, "Train Introduction"),
    (train_abs_lengths, "Train Abstract"),
    (test_intro_lengths, "Test Introduction"),
):
    print_stats(_lengths, _title)

from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Rebind model_name / tokenizer to the Pegasus-X checkpoint; the second length analysis
# below passes this tokenizer to analyze_token_lengths.
model_name = "google/pegasus-x-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# 分析 token 長度分佈
def analyze_token_lengths(data, tokenizer, field="introduction"):
    """Return, for each item in ``data``, the number of tokens in ``item[field]``.

    Counts come from ``tokenizer.encode`` with special tokens included.
    """
    return [
        len(tokenizer.encode(record[field], add_special_tokens=True))
        for record in data
    ]

# Assumes train_data and test_data are already defined.
# Measure the train and test fields with the current tokenizer.
train_intro_lengths = analyze_token_lengths(train_data, tokenizer, "introduction")
train_abs_lengths = analyze_token_lengths(train_data, tokenizer, "abstract")
test_intro_lengths = analyze_token_lengths(test_data, tokenizer, "introduction")

# Print the statistics
import numpy as np

# (heading, lengths, token limit whose exceedance rate is reported)
for _title, _lengths, _limit in (
    ("Train Introduction", train_intro_lengths, 1024),
    ("Train Abstract", train_abs_lengths, 660),
    ("Test Introduction", test_intro_lengths, 1024),
):
    print(f"{_title} Token Lengths:")
    print(f"Mean: {np.mean(_lengths):.2f}, Max: {max(_lengths)}, Min: {min(_lengths)}")
    print(f"Percentage > {_limit}: {sum(n > _limit for n in _lengths) / len(_lengths) * 100:.2f}%")
    if _title != "Test Introduction":
        print()

# 方向1、Traditional  Language Model

Pegasus、T5 (FLAN-T5)、或 BART 都是最常見的「摘要三巨頭」

打算使用 PEGASUS-Large、PEGASUS-ArXiv、LED 、LongT5

# Pegasus-x-large 

In [None]:
pip install spacy summa nlpaug nltk

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
pip install googletrans==4.0.0-rc1

In [None]:
import nltk
# Fetch the NLTK resources one after the other.
for _resource in ('wordnet', 'omw-1.4'):
    nltk.download(_resource)

設定超參數、評估指標、訓練階段

In [None]:
import json
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import evaluate
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import unicodedata

# Load the spaCy model; `nlp` is used below to split introductions into sentences.
nlp = spacy.load("en_core_web_sm")

# Non-ASCII symbols that text cleaning keeps verbatim.
allowed_unicode = "∑∂∇∞θπ𝒟𝒫𝒩αβγδελμσφωℝ𝔽𝓛"
def is_allowed_char(c):
    """Return True when the single character ``c`` should survive cleaning.

    A character is kept when it is ASCII, listed in ``allowed_unicode``, or its Unicode
    name contains "MATHEMATICAL" (unnamed characters are treated as not matching).
    """
    if ord(c) < 128:
        return True
    if c in allowed_unicode:
        return True
    return "MATHEMATICAL" in unicodedata.name(c, "")

def clean_text(text):
    """Normalize raw paper text: strip markup tags, unwrap LaTeX commands, drop odd characters.

    Characters rejected by ``is_allowed_char`` become spaces and whitespace runs collapse
    to a single space. Returns the cleaned, stripped string.
    """
    # Remove markup tags such as <cit.>, <br/> or </p>. A tag must start with a letter right
    # after '<' (or '</'); the former pattern '<[^>]+>' also deleted everything between an
    # inequality sign and a later '>' (e.g. "p < 0.05 ... n > 10").
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)  # keep the argument of \emph{...}-style commands
    text = ''.join(c if is_allowed_char(c) else ' ' for c in text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Set device: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the Pegasus-X model and tokenizer (pretrained weights, moved to `device`)
model_name = "google/pegasus-x-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# 2. Load the datasets with cleaning
def load_json(file_path):
    """Load a JSON Lines file and clean its text fields.

    Each non-blank line is parsed as one JSON object. ``introduction`` is always cleaned
    with ``clean_text``; ``abstract`` is cleaned when present. Returns the list of dicts.
    """
    # Explicit UTF-8 keeps the read independent of the platform's default codec;
    # blank lines are skipped instead of crashing json.loads.
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Clean introduction and abstract in place
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

# Load and clean both JSON Lines files.
train_data = load_json("train.json")  
test_data = load_json("test.json")    

# 3. 從兩端截斷 + 語義重要性選擇
def truncate_from_ends_with_importance(introduction, tokenizer, max_length=1024):
    """Fit an introduction into ``max_length`` tokens, favouring its opening and closing sentences.

    The text is split into sentences with the module-level spaCy pipeline ``nlp``. The first
    and last 45% of the sentences (at least one each) are always kept. Remaining room is
    filled with middle sentences in descending order of summed TF-IDF weight, stopping at the
    first one that does not fit. The result is tokenized with truncation and padding to
    ``max_length``.

    Returns:
        (input_ids, attention_mask) tensors with the batch dimension squeezed away.
    """
    def _encode_padded(text):
        # Shared final step: truncate/pad to max_length and squeeze the tensors.
        encoded = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze()

    sentences = [span.text for span in nlp(introduction).sents]

    # Zero or one sentence: nothing to select, tokenize the raw text.
    if len(sentences) <= 1:
        return _encode_padded(introduction)

    # Score each sentence by the sum of its TF-IDF weights.
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        sentence_scores = vectorizer.fit_transform(sentences).sum(axis=1).A1
    except ValueError:
        # Vectorizing failed (presumably an empty vocabulary); fall back to scores that
        # decrease with sentence position.
        sentence_scores = np.arange(len(sentences), 0, -1)

    # Keep 45% of the sentences from each end, never fewer than one.
    edge = max(1, int(len(sentences) * 0.45))
    head, tail = sentences[:edge], sentences[-edge:]
    middle = sentences[edge:-edge]
    middle_scores = sentence_scores[edge:-edge]

    # Two tokens of headroom are left below max_length (presumably for special tokens).
    budget = max_length - 2
    used = sum(
        len(tokenizer.encode(" ".join(part), add_special_tokens=False))
        for part in (head, tail)
    )

    # Both ends already fill the budget: tokenize them alone and let truncation cut.
    if used >= budget:
        return _encode_padded(" ".join(head + tail))

    # Greedily add the best-scoring middle sentences until one does not fit.
    # NOTE(review): picked sentences are inserted in score order, not document order — confirm intended.
    picked = []
    if middle:
        for idx in np.argsort(middle_scores)[::-1]:
            candidate = middle[idx]
            cost = len(tokenizer.encode(candidate, add_special_tokens=False))
            if used + cost > budget:
                break
            picked.append(candidate)
            used += cost

    return _encode_padded(" ".join(head + picked + tail))

# 訓練數據集
class TrainPaperDataset(Dataset):
    """Training dataset yielding tokenized introductions with abstracts as labels."""

    def __init__(self, data, tokenizer, max_input_length=1024, max_target_length=660):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length, self.max_target_length = max_input_length, max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask, labels, paper id and abstract text for one sample."""
        record = self.data[idx]
        source_text, target_text = record["introduction"], record["abstract"]

        # Input: ends-first truncation plus importance-based selection of middle sentences
        input_ids, attention_mask = truncate_from_ends_with_importance(
            source_text, self.tokenizer, self.max_input_length
        )

        # Target: tokenize the abstract, padded/truncated to max_target_length
        encoded_target = self.tokenizer(
            target_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_target_length,
            return_tensors="pt",
        )
        labels = encoded_target["input_ids"].squeeze()
        # Padding positions are set to -100 in the labels
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "paper_id": record["paper_id"],
            "abstract": target_text,
        }
        return sample

# 測試數據集
class TestPaperDataset(Dataset):
    """Inference dataset yielding tokenized introductions together with their paper id."""

    def __init__(self, data, tokenizer, max_input_length=1024):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask and paper id for one sample."""
        record = self.data[idx]
        source_text = record["introduction"]

        # Input: ends-first truncation plus importance-based selection of middle sentences
        input_ids, attention_mask = truncate_from_ends_with_importance(
            source_text, self.tokenizer, self.max_input_length
        )

        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "paper_id": record["paper_id"],
        }
        return sample

# Build the datasets and DataLoaders.
train_dataset = TrainPaperDataset(train_data, tokenizer, max_input_length=1024, max_target_length=660)
test_dataset = TestPaperDataset(test_data, tokenizer, max_input_length=1024)

# Training batches are shuffled; test batches keep the file order.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# 4. Fine-tune the model with checkpoint saving
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
num_epochs = 300

# Directory for the periodic and best-model checkpoints
checkpoint_dir = "pegasus-x-large_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

model.train()
best_loss = float('inf')  # lowest average training loss seen so far
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_loader)
    print(f"Average Loss: {avg_loss:.4f}")

    # Periodic checkpoint after every 10 completed epochs (10, 20, ..., num_epochs).
    # The former `epoch % 10 == 0` test fired after epochs 1, 11, 21, ..., so round-numbered
    # checkpoint files never existed and the final epoch was never saved.
    if (epoch + 1) % 10 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch + 1}.pth")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

    # Keep the model with the lowest average training loss
    if avg_loss < best_loss:
        best_loss = avg_loss
        best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, best_model_path)
        print(f"Best model saved at {best_model_path} with loss {best_loss:.4f}")

# 5. 後處理函數：清理生成的摘要
def clean_abstract(abstract):
    """Post-process a generated abstract.

    Applies ``clean_text``, collapses a repeated leading "In", removes repeated sentences
    (keeping the first occurrence) and makes sure the result ends with a period.
    """
    # Remove characters that clean_text does not allow
    abstract = clean_text(abstract)

    # Collapse a repeated leading "In In ..."
    words = abstract.split()
    while len(words) > 1 and words[0] == "In" and words[1] == "In":
        words.pop(0)
    abstract = " ".join(words)

    # Drop repeated sentences. Sentences are compared without their trailing period:
    # after splitting on ". " only the last piece still carries its ".", which previously
    # made a repeated final sentence look unique.
    seen_keys = set()
    unique_sentences = []
    for sentence in abstract.split(". "):
        key = sentence.rstrip(".").strip()
        if key and key not in seen_keys:
            seen_keys.add(key)
            unique_sentences.append(sentence)
    abstract = ". ".join(unique_sentences)
    if abstract and not abstract.endswith("."):
        abstract += "."

    return abstract

推理階段

In [None]:
import json
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import evaluate
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import unicodedata

# Load the spaCy model; `nlp` is used below to split introductions into sentences.
nlp = spacy.load("en_core_web_sm")

# Checkpoint directory (created if missing); the inference code reads best_model.pth from it.
checkpoint_dir = "pegasus-x-large_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the pretrained weights; fine-tuned weights are loaded from the checkpoint further down.
model_name = "google/pegasus-x-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# Non-ASCII symbols that text cleaning keeps verbatim.
allowed_unicode = "∑∂∇∞θπ𝒟𝒫𝒩αβγδελμσφωℝ𝔽𝓛"
def is_allowed_char(c):
    """Return True when the single character ``c`` should survive cleaning.

    A character is kept when it is ASCII, listed in ``allowed_unicode``, or its Unicode
    name contains "MATHEMATICAL" (unnamed characters are treated as not matching).
    """
    if ord(c) < 128:
        return True
    if c in allowed_unicode:
        return True
    return "MATHEMATICAL" in unicodedata.name(c, "")

def clean_text(text):
    """Normalize raw paper text: strip markup tags, unwrap LaTeX commands, drop odd characters.

    Characters rejected by ``is_allowed_char`` become spaces and whitespace runs collapse
    to a single space. Returns the cleaned, stripped string.
    """
    # Remove markup tags such as <cit.>, <br/> or </p>. A tag must start with a letter right
    # after '<' (or '</'); the former pattern '<[^>]+>' also deleted everything between an
    # inequality sign and a later '>' (e.g. "p < 0.05 ... n > 10").
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)  # keep the argument of \emph{...}-style commands
    text = ''.join(c if is_allowed_char(c) else ' ' for c in text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def load_json(file_path):
    """Load a JSON Lines file and clean its text fields.

    Each non-blank line is parsed as one JSON object. ``introduction`` is always cleaned
    with ``clean_text``; ``abstract`` is cleaned when present. Returns the list of dicts.
    """
    # Explicit UTF-8 keeps the read independent of the platform's default codec;
    # blank lines are skipped instead of crashing json.loads.
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    # Clean introduction and abstract in place
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

def clean_abstract(abstract):
    """Post-process a generated abstract.

    Applies ``clean_text``, collapses a repeated leading "In", removes repeated sentences
    (keeping the first occurrence) and makes sure the result ends with a period.
    """
    # Remove characters that clean_text does not allow
    abstract = clean_text(abstract)

    # Collapse a repeated leading "In In ..."
    words = abstract.split()
    while len(words) > 1 and words[0] == "In" and words[1] == "In":
        words.pop(0)
    abstract = " ".join(words)

    # Drop repeated sentences. Sentences are compared without their trailing period:
    # after splitting on ". " only the last piece still carries its ".", which previously
    # made a repeated final sentence look unique.
    seen_keys = set()
    unique_sentences = []
    for sentence in abstract.split(". "):
        key = sentence.rstrip(".").strip()
        if key and key not in seen_keys:
            seen_keys.add(key)
            unique_sentences.append(sentence)
    abstract = ". ".join(unique_sentences)
    if abstract and not abstract.endswith("."):
        abstract += "."

    return abstract

class TrainPaperDataset(Dataset):
    """Training dataset yielding tokenized introductions with abstracts as labels."""

    def __init__(self, data, tokenizer, max_input_length=1024, max_target_length=660):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length, self.max_target_length = max_input_length, max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask, labels, paper id and abstract text for one sample."""
        record = self.data[idx]
        source_text, target_text = record["introduction"], record["abstract"]

        # Input: ends-first truncation plus importance-based selection of middle sentences
        input_ids, attention_mask = truncate_from_ends_with_importance(
            source_text, self.tokenizer, self.max_input_length
        )

        # Target: tokenize the abstract, padded/truncated to max_target_length
        encoded_target = self.tokenizer(
            target_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_target_length,
            return_tensors="pt",
        )
        labels = encoded_target["input_ids"].squeeze()
        # Padding positions are set to -100 in the labels
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "paper_id": record["paper_id"],
            "abstract": target_text,
        }
        return sample

# 測試數據集
class TestPaperDataset(Dataset):
    """Inference dataset yielding tokenized introductions together with their paper id."""

    def __init__(self, data, tokenizer, max_input_length=1024):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask and paper id for one sample."""
        record = self.data[idx]
        source_text = record["introduction"]

        # Input: ends-first truncation plus importance-based selection of middle sentences
        input_ids, attention_mask = truncate_from_ends_with_importance(
            source_text, self.tokenizer, self.max_input_length
        )

        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "paper_id": record["paper_id"],
        }
        return sample

def truncate_from_ends_with_importance(introduction, tokenizer, max_length=1024):
    """Fit an introduction into ``max_length`` tokens, favouring its opening and closing sentences.

    The text is split into sentences with the module-level spaCy pipeline ``nlp``. The first
    and last 45% of the sentences (at least one each) are always kept. Remaining room is
    filled with middle sentences in descending order of summed TF-IDF weight, stopping at the
    first one that does not fit. The result is tokenized with truncation and padding to
    ``max_length``.

    Returns:
        (input_ids, attention_mask) tensors with the batch dimension squeezed away.
    """
    def _encode_padded(text):
        # Shared final step: truncate/pad to max_length and squeeze the tensors.
        encoded = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze()

    sentences = [span.text for span in nlp(introduction).sents]

    # Zero or one sentence: nothing to select, tokenize the raw text.
    if len(sentences) <= 1:
        return _encode_padded(introduction)

    # Score each sentence by the sum of its TF-IDF weights.
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        sentence_scores = vectorizer.fit_transform(sentences).sum(axis=1).A1
    except ValueError:
        # Vectorizing failed (presumably an empty vocabulary); fall back to scores that
        # decrease with sentence position.
        sentence_scores = np.arange(len(sentences), 0, -1)

    # Keep 45% of the sentences from each end, never fewer than one.
    edge = max(1, int(len(sentences) * 0.45))
    head, tail = sentences[:edge], sentences[-edge:]
    middle = sentences[edge:-edge]
    middle_scores = sentence_scores[edge:-edge]

    # Two tokens of headroom are left below max_length (presumably for special tokens).
    budget = max_length - 2
    used = sum(
        len(tokenizer.encode(" ".join(part), add_special_tokens=False))
        for part in (head, tail)
    )

    # Both ends already fill the budget: tokenize them alone and let truncation cut.
    if used >= budget:
        return _encode_padded(" ".join(head + tail))

    # Greedily add the best-scoring middle sentences until one does not fit.
    # NOTE(review): picked sentences are inserted in score order, not document order — confirm intended.
    picked = []
    if middle:
        for idx in np.argsort(middle_scores)[::-1]:
            candidate = middle[idx]
            cost = len(tokenizer.encode(candidate, add_special_tokens=False))
            if used + cost > budget:
                break
            picked.append(candidate)
            used += cost

    return _encode_padded(" ".join(head + picked + tail))

train_data = load_json("train.json")  # 408 samples
test_data = load_json("test.json")    # 103 samples
train_dataset = TrainPaperDataset(train_data, tokenizer, max_input_length=1024, max_target_length=660)
test_dataset = TestPaperDataset(test_data, tokenizer, max_input_length=1024)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# 6. Load the fine-tuned weights for inference
# Loads the best model; point checkpoint_path at a specific epoch file if needed.
checkpoint_path = os.path.join(checkpoint_dir, "best_model.pth")
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded checkpoint from {checkpoint_path} (epoch {checkpoint['epoch']}, loss {checkpoint['loss']:.4f})")
else:
    # `model` was re-created from the pretrained weights in this cell, so without a
    # checkpoint no fine-tuned weights are in use.
    print(f"Checkpoint {checkpoint_path} not found, using the pretrained {model_name} weights without fine-tuning.")

# Switch the model to evaluation mode
model.eval()
predictions = []
predicted_abstracts = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        paper_ids = batch["paper_id"]

        # Plain beam search. top_k / top_p were dropped: they only apply when sampling
        # is enabled, which it is not here.
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=1024,
            min_length=50,
            num_beams=8,
            length_penalty=1.8,
            early_stopping=True
        )

        for i, generated in enumerate(generated_ids):
            abstract = clean_abstract(tokenizer.decode(generated, skip_special_tokens=True))
            # The collated paper id may be a tensor/array scalar or a plain value.
            raw_id = paper_ids[i]
            if isinstance(raw_id, (torch.Tensor, np.ndarray)):
                raw_id = raw_id.item()
            paper_id = str(raw_id)
            predictions.append({
                "paper_id": paper_id,
                "abstract": abstract
            })
            predicted_abstracts.append(abstract)

# 7. Evaluate the model
# The predictions belong to test_data, so they can only be scored against the abstracts of
# those same papers. The previous code compared them with train_data[:103], i.e. abstracts
# of unrelated papers, which produced meaningless scores.
if test_data and all("abstract" in item for item in test_data):
    reference_abstracts = [item["abstract"] for item in test_data]

    metric_rouge = evaluate.load("rouge")
    metric_bertscore = evaluate.load("bertscore")

    # rouge_types is an argument of compute(), not of load().
    rouge_scores = metric_rouge.compute(
        predictions=predicted_abstracts,
        references=reference_abstracts,
        rouge_types=["rouge1", "rouge2", "rougeL"],
        use_stemmer=True
    )

    bert_scores = metric_bertscore.compute(
        predictions=predicted_abstracts,
        references=reference_abstracts,
        lang="en"
    )

    print("\n=== Evaluation Results ===")
    print("ROUGE Scores:")
    print(f"ROUGE-1: {rouge_scores['rouge1']:.4f}")
    print(f"ROUGE-2: {rouge_scores['rouge2']:.4f}")
    print(f"ROUGE-L: {rouge_scores['rougeL']:.4f}")

    print("\nBERTScore:")
    print(f"Precision: {np.mean(bert_scores['precision']):.4f}")
    print(f"Recall: {np.mean(bert_scores['recall']):.4f}")
    print(f"F1: {np.mean(bert_scores['f1']):.4f}")
else:
    rouge_scores = bert_scores = None
    print("\ntest.json has no reference abstracts; skipping ROUGE/BERTScore evaluation.")

# 8. Save predictions
with open("submission.json", "w", encoding="utf-8") as f:
    for pred in predictions:
        f.write(json.dumps(pred) + "\n")

print("Predictions saved to submission.json")

# Pegasus-arxiv

In [None]:
pip install sentence-transformers

In [None]:
pip install sentencepiece

In [None]:
pip install rouge

微調訓練階段

In [None]:
import json
import torch
import re
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import spacy
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge
import evaluate

# Load the spaCy model; `nlp` is used below to split introductions into sentences.
nlp = spacy.load("en_core_web_sm")

# 清理 LaTeX 和亂碼的函數
def clean_text(text):
    """Normalize raw paper text: strip markup tags, unwrap LaTeX commands, drop odd characters.

    Non-ASCII characters outside the explicit allow-list become spaces and whitespace runs
    collapse to a single space. Returns the cleaned, stripped string.
    """
    # Remove markup tags such as <cit.>, <br/> or </p>. A tag must start with a letter right
    # after '<' (or '</'); the former pattern '<[^>]+>' also deleted everything between an
    # inequality sign and a later '>' (e.g. "p < 0.05 ... n > 10").
    text = re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    # Keep only the argument of \command{...} constructs
    text = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', text)
    # Replace everything that is neither ASCII nor an allow-listed symbol with a space
    text = re.sub(r'[^\x00-\x7F’‘–—∑∂∇∞θπ𝒟𝒫𝒩αβγδελμσφωℝ𝔽𝓛()]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Set the device: use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the Pegasus-ArXiv model and tokenizer (pretrained weights, moved to `device`)
model_name = "google/pegasus-arxiv"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# 2. 加載數據集並清理文本
def load_json(file_path):
    """Load a JSON Lines file and clean its text fields.

    Each non-blank line is parsed as one JSON object. ``introduction`` is always cleaned
    with ``clean_text``; ``abstract`` is cleaned when present. Returns the list of dicts.
    """
    # Explicit UTF-8 keeps the read independent of the platform's default codec;
    # blank lines are skipped instead of crashing json.loads.
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

train_data = load_json("train.json")  # 408 samples
test_data = load_json("test.json")    # 103 samples

# Hold out the last 20% of the training records for validation.
# Slicing by an explicit split index avoids the `[-0:]` / `[:-0]` pitfall: with a val_size
# of 0 (fewer than 5 records) the old slices put every record into validation and none
# into training.
val_size = int(0.2 * len(train_data))
split_index = len(train_data) - val_size
val_data = train_data[split_index:]
train_data = train_data[:split_index]

print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")

# 3. Sentence-transformers embedder used by the sentence-selection step below
embedder = SentenceTransformer('all-MiniLM-L6-v2')

def select_important_sentences(introduction, tokenizer, max_length=1024):
    """Pick the highest-scoring sentences of an introduction that fit into ``max_length`` tokens.

    Sentences come from the module-level spaCy pipeline ``nlp`` and are embedded with the
    module-level ``embedder``. A sentence's score is its cosine similarity to the mean
    sentence embedding, multiplied by 1.5 when it contains one of the academic keywords.
    Sentences are taken in descending score order until the first one that does not fit,
    then tokenized with truncation and padding to ``max_length``.

    Returns:
        (input_ids, attention_mask) tensors with the batch dimension squeezed away.
    """
    def _encode_padded(text):
        # Shared final step: truncate/pad to max_length and squeeze the tensors.
        encoded = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze()

    sentences = [span.text for span in nlp(introduction).sents]
    # Zero or one sentence: nothing to rank, tokenize the raw text.
    if len(sentences) <= 1:
        return _encode_padded(introduction)

    # Similarity of every sentence to the mean sentence embedding
    sentence_vectors = embedder.encode(sentences)
    centroid = np.mean(sentence_vectors, axis=0)
    similarities = cosine_similarity(sentence_vectors, centroid.reshape(1, -1)).flatten()

    # Domain knowledge: terms that are common in academic writing
    academic_keywords = [
        "propose", "method", "approach", "result", "finding", "conclusion",
        "demonstrate", "show", "achieve", "contribution", "investigate", "study",
        "analysis", "evaluate", "performance", "improve", "novel", "framework"
    ]

    # Boost sentences containing any keyword (case-insensitive substring match)
    scores = similarities.copy()
    for position, sentence in enumerate(sentences):
        lowered = sentence.lower()
        if any(keyword in lowered for keyword in academic_keywords):
            scores[position] *= 1.5

    # Two tokens of headroom are left below max_length (presumably for special tokens).
    budget = max_length - 2
    chosen, used = [], 0
    # NOTE(review): chosen sentences are joined in score order, not document order — confirm intended.
    for position in np.argsort(scores)[::-1]:
        candidate = sentences[position]
        cost = len(tokenizer.encode(candidate, add_special_tokens=False))
        if used + cost > budget:
            break
        chosen.append(candidate)
        used += cost

    return _encode_padded(" ".join(chosen))

# 4. 定義數據集
class TrainPaperDataset(Dataset):
    """Training/validation dataset yielding tokenized introductions with abstracts as labels."""

    def __init__(self, data, tokenizer, max_input_length=1024, max_target_length=660):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length, self.max_target_length = max_input_length, max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask, labels, paper id and abstract text for one sample."""
        record = self.data[idx]
        source_text, target_text = record["introduction"], record["abstract"]

        # Input: importance-ranked sentence selection
        input_ids, attention_mask = select_important_sentences(
            source_text, self.tokenizer, self.max_input_length
        )

        # Target: tokenize the abstract, padded/truncated to max_target_length
        encoded_target = self.tokenizer(
            target_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_target_length,
            return_tensors="pt",
        )
        labels = encoded_target["input_ids"].squeeze()
        # Padding positions are set to -100 in the labels
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "paper_id": record["paper_id"],
            "abstract": target_text,
        }
        return sample

class TestPaperDataset(Dataset):
    """Inference dataset yielding tokenized introductions together with their paper id."""

    def __init__(self, data, tokenizer, max_input_length=1024):
        self.data, self.tokenizer = data, tokenizer
        self.max_input_length = max_input_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input ids, attention mask and paper id for one sample."""
        record = self.data[idx]
        source_text = record["introduction"]
        # Input: importance-ranked sentence selection
        input_ids, attention_mask = select_important_sentences(
            source_text, self.tokenizer, self.max_input_length
        )
        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "paper_id": record["paper_id"],
        }
        return sample

# Build the datasets and DataLoaders.
train_dataset = TrainPaperDataset(train_data, tokenizer, max_input_length=1024, max_target_length=660)
val_dataset = TrainPaperDataset(val_data, tokenizer, max_input_length=1024, max_target_length=660)
test_dataset = TestPaperDataset(test_data, tokenizer, max_input_length=1024)

# Only the training loader shuffles; validation and test keep their order.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# 5. Metric objects shared by the ROUGE / BERTScore helper functions below
rouge = Rouge()
bertscore_metric = evaluate.load("bertscore")

def compute_rouge_loss(model, input_ids, attention_mask, reference_abstracts):
    """Generate abstracts with beam search and return their averaged ROUGE-L F-score.

    Despite the name, the result is a plain score computed under ``torch.no_grad()`` from
    decoded text. The model is put in eval mode for generation and switched to train mode
    before returning. Uses the module-level ``tokenizer`` and ``rouge``.
    """
    generation_options = dict(max_length=660, num_beams=5, early_stopping=True)

    model.eval()
    with torch.no_grad():
        sequences = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_options,
        )
        hypotheses = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]
        averaged = rouge.get_scores(hypotheses, reference_abstracts, avg=True)
    model.train()
    return averaged['rouge-l']['f']

def compute_bertscore_loss(model, input_ids, attention_mask, reference_abstracts):
    """Generate abstracts with beam search and return their mean BERTScore F1.

    Despite the name, the result is a plain score computed under ``torch.no_grad()`` from
    decoded text. The model is put in eval mode for generation and switched to train mode
    before returning. Uses the module-level ``tokenizer`` and ``bertscore_metric``.
    """
    generation_options = dict(max_length=660, num_beams=5, early_stopping=True)

    model.eval()
    with torch.no_grad():
        sequences = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **generation_options,
        )
        hypotheses = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]
        result = bertscore_metric.compute(
            predictions=hypotheses,
            references=reference_abstracts,
            lang="en",
        )
        mean_f1 = np.mean(result['f1'])
    model.train()
    return mean_f1

# 新增 ROUGE-1 計算函數
def compute_rouge_scores(model, input_ids, attention_mask, reference_abstracts):
    """Generate abstracts with beam search and return ``(rouge1_f, rougeL_f)`` averaged over the batch.

    The model is put in eval mode for generation and switched to train mode before
    returning. Uses the module-level ``tokenizer`` and ``rouge``.
    """
    model.eval()
    with torch.no_grad():
        sequences = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=660,
            num_beams=5,
        )
        hypotheses = [tokenizer.decode(seq, skip_special_tokens=True) for seq in sequences]
        averaged = rouge.get_scores(hypotheses, reference_abstracts, avg=True)
    model.train()
    return averaged['rouge-1']['f'], averaged['rouge-l']['f']

# 6. Train the model: early stopping, learning-rate scheduling and periodic evaluation
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.05)  # raised initial learning rate

# 定義溫暖啟動調度器
def warm_up_lambda(epoch):
    """Return the learning-rate multiplier for ``epoch``.

    The factor rises linearly from 1/10 to 1.0 over the first 10 epochs and stays at 1.0 afterwards.
    """
    warm_up_epochs = 10  # length of the warm-up phase
    return (epoch + 1) / warm_up_epochs if epoch < warm_up_epochs else 1.0

def train_segmented_input(model, input_ids, attention_mask, labels, max_segment_length=1024):
    """Run a forward pass, splitting over-long inputs into fixed-size segments.

    When the input fits into ``max_segment_length`` the model is called once
    and its output is returned unchanged. Otherwise the input is cut into
    consecutive, non-overlapping segments, the model is called once per
    segment, and the segment losses are averaged.

    Both paths return an object exposing ``.loss`` so callers can treat the
    result uniformly (previously the segmented path returned a bare tensor,
    which broke callers reading ``outputs.loss``).

    Args:
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``labels`` keywords and returning an object with a ``loss``.
        input_ids: Tensor indexed as ``[batch, sequence]``.
        attention_mask: Tensor sliced alongside ``input_ids``.
        labels: Target tensor. It is sliced with the input only when its
            second dimension equals the input length; otherwise the full
            labels are reused for every segment.
        max_segment_length: Maximum number of positions per forward pass.

    Returns:
        The model output (short inputs) or a ``SimpleNamespace`` whose
        ``loss`` is the mean of the per-segment losses (long inputs).
    """
    from types import SimpleNamespace

    total_length = input_ids.shape[1]
    if total_length <= max_segment_length:
        return model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    labels_follow_input = labels.shape[1] == total_length
    losses = []
    for start in range(0, total_length, max_segment_length):
        end = min(start + max_segment_length, total_length)
        segment_labels = labels[:, start:end] if labels_follow_input else labels
        output = model(
            input_ids=input_ids[:, start:end],
            attention_mask=attention_mask[:, start:end],
            labels=segment_labels,
        )
        losses.append(output.loss)
    return SimpleNamespace(loss=torch.mean(torch.stack(losses)))

# Two schedulers are attached to the same optimizer: a linear warm-up (LambdaLR)
# and a cosine annealing schedule.
# NOTE(review): T_max=290 is much larger than num_epochs (50) below — confirm this is intended.
warm_up_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=warm_up_lambda)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=290, eta_min=1e-6)

num_epochs = 50  # increased total number of training epochs
accumulation_steps = 5

checkpoint_dir = "pegasus-arxiv_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

best_val_loss = float('inf')

# Baseline scores (used for comparison in the evaluation printout)
baseline_scores = {
    "rouge1": 0.47,
    "rouge2": 0.12,
    "rougeL": 0.22,
    "bertscore_f1": 0.85
}

eval_frequency = 5  # run the generation-based evaluation every 5 epochs

# Best ROUGE-1 seen so far (drives best-model saving and early stopping)
# NOTE(review): patience (100) is larger than num_epochs (50), so the patience
# limit cannot be reached within a single run of this loop.
best_rouge1 = 0.0
patience = 100
patience_counter = 0

# Best evaluation scores seen so far (for the final report)
best_rouge2 = 0.0
best_rougeL = 0.0
best_bertscore_f1 = 0.0

# Load the evaluation metrics once instead of re-loading them on every evaluation epoch.
metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bertscore = evaluate.load("bertscore")

# Number of epochs driven by the warm-up scheduler before cosine annealing takes over.
# Keep in sync with `warm_up_epochs` inside warm_up_lambda.
lr_warm_up_epochs = 10

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # The validation phase leaves the model in eval mode, so training mode
    # has to be re-enabled at the start of every epoch.
    model.train()
    total_loss = 0
    total_ce_loss = 0  # running sum of the cross-entropy loss
    num_batches = len(train_loader)
    optimizer.zero_grad()
    for i, batch in enumerate(tqdm(train_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        reference_abstracts = batch["abstract"]

        outputs = train_segmented_input(model, input_ids, attention_mask, labels)
        # The helper may return a model output (with .loss) or a bare loss tensor.
        ce_loss = getattr(outputs, "loss", outputs)
        total_ce_loss += ce_loss.item()

        # NOTE: the rewards below are constants computed under no_grad; they shift
        # the reported loss but contribute no gradient to the parameters.
        # ROUGE reward every 5 batches
        if i % 5 == 0:
            rouge1, rougeL = compute_rouge_scores(model, input_ids, attention_mask, reference_abstracts)
            reward_rouge1 = torch.tensor(rouge1, device=device)
            reward_rougeL = torch.tensor(rougeL, device=device)
        else:
            reward_rouge1 = 0.0
            reward_rougeL = 0.0

        # BERTScore reward every 10 batches
        if i % 10 == 0:
            bertscore_f1 = compute_bertscore_loss(model, input_ids, attention_mask, reference_abstracts)
            reward_bertscore = torch.tensor(bertscore_f1, device=device)
        else:
            reward_bertscore = 0.0

        loss = ce_loss - 0.3 * reward_rouge1 - 0.1 * reward_rougeL - 0.05 * reward_bertscore
        total_loss += loss.item()

        loss = loss / accumulation_steps
        loss.backward()

        # Step once per accumulation window, and once more for a trailing partial window.
        if (i + 1) % accumulation_steps == 0 or (i + 1) == num_batches:
            # Clip the fully accumulated gradient right before the update,
            # not the partial gradient after every micro-batch.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    avg_ce_loss = total_ce_loss / len(train_loader)
    print(f"Training Loss: {avg_loss:.4f}, CE Loss: {avg_ce_loss:.4f}")

    # Validation: always compute the loss; generate summaries only on evaluation epochs.
    model.eval()
    val_loss = 0
    val_ce_loss = 0
    predicted_abstracts = []
    reference_abstracts = []
    is_eval_epoch = (epoch + 1) % eval_frequency == 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            reference_abstract = batch["abstract"]

            # Validation loss
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            val_ce_loss += outputs.loss.item()

            # Generation is expensive, so it only runs when the metrics are needed.
            if is_eval_epoch:
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=660,
                    min_length=50,
                    num_beams=15,
                    length_penalty=1.5,
                    repetition_penalty=1.2,
                    early_stopping=True,
                    no_repeat_ngram_size=3
                )
                # Score every sample of the batch, not just the first one.
                predicted_abstracts.extend(
                    tokenizer.decode(g, skip_special_tokens=True) for g in generated_ids
                )
                reference_abstracts.extend(reference_abstract)

    val_ce_loss /= len(val_loader)
    val_loss = val_ce_loss
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Current Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")

    if is_eval_epoch:
        # ROUGE and BERTScore on the generated validation summaries
        rouge_scores = metric_rouge.compute(
            predictions=predicted_abstracts,
            references=reference_abstracts,
            use_stemmer=True
        )

        bert_scores = metric_bertscore.compute(
            predictions=predicted_abstracts,
            references=reference_abstracts,
            lang="en"
        )

        current_rouge1 = rouge_scores['rouge1']
        current_rouge2 = rouge_scores['rouge2']
        current_rougeL = rouge_scores['rougeL']
        current_bertscore_f1 = np.mean(bert_scores['f1'])

        # Print the results next to the baseline
        print("\n=== Validation Evaluation Results ===")
        print("ROUGE Scores:")
        print(f"ROUGE-1: {current_rouge1:.4f} (Baseline: {baseline_scores['rouge1']:.4f}, Diff: {current_rouge1 - baseline_scores['rouge1']:.4f})")
        print(f"ROUGE-2: {current_rouge2:.4f} (Baseline: {baseline_scores['rouge2']:.4f}, Diff: {current_rouge2 - baseline_scores['rouge2']:.4f})")
        print(f"ROUGE-L: {current_rougeL:.4f} (Baseline: {baseline_scores['rougeL']:.4f}, Diff: {current_rougeL - baseline_scores['rougeL']:.4f})")
        print("\nBERTScore:")
        print(f"Precision: {np.mean(bert_scores['precision']):.4f}")
        print(f"Recall: {np.mean(bert_scores['recall']):.4f}")
        print(f"F1: {current_bertscore_f1:.4f} (Baseline: {baseline_scores['bertscore_f1']:.4f}, Diff: {current_bertscore_f1 - baseline_scores['bertscore_f1']:.4f})")

    # Learning-rate schedule: linear warm-up first, cosine annealing afterwards.
    # Only one scheduler is stepped per epoch so they do not overwrite each other,
    # and CosineAnnealingLR.step() is called without arguments (its optional
    # argument is an epoch index, not a validation loss).
    if epoch + 1 < lr_warm_up_epochs:
        warm_up_scheduler.step()
    else:
        scheduler.step()

    if is_eval_epoch:
        # Best-model saving and early stopping are driven by ROUGE-1, which only
        # exists on evaluation epochs; other epochs must not consume patience.
        if current_rouge1 > best_rouge1:
            best_rouge1 = current_rouge1
            patience_counter = 0
            best_model_path = os.path.join(checkpoint_dir, "best_model.pth")
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
                'val_loss': val_loss,
            }, best_model_path)
            print(f"Best model saved at {best_model_path} with ROUGE-1 {best_rouge1:.4f}")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping at epoch {epoch + 1} due to no improvement in ROUGE-1")
                break

        # Track the best scores for the final report (triggered by ROUGE-L or BERTScore F1)
        if current_rougeL > best_rougeL or current_bertscore_f1 > best_bertscore_f1:
            best_rouge2 = max(best_rouge2, current_rouge2)
            best_rougeL = max(best_rougeL, current_rougeL)
            best_bertscore_f1 = max(best_bertscore_f1, current_bertscore_f1)
            print(f"New best evaluation scores: ROUGE-L = {best_rougeL:.4f}, BERTScore F1 = {best_bertscore_f1:.4f}")

    # NOTE(review): with 0-based `epoch` this fires on epochs 1, 51, 101, ...;
    # confirm whether `(epoch + 1) % 50 == 0` was intended.
    if epoch % 50 == 0:
        checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch + 1}.pth")
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
        }, checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")

    torch.cuda.empty_cache()

# Print the best evaluation scores observed during training
print("\n=== Final Best Evaluation Scores ===")
print(f"Best ROUGE-1: {best_rouge1:.4f} (Baseline: {baseline_scores['rouge1']:.4f})")
print(f"Best ROUGE-2: {best_rouge2:.4f} (Baseline: {baseline_scores['rouge2']:.4f})")
print(f"Best ROUGE-L: {best_rougeL:.4f} (Baseline: {baseline_scores['rougeL']:.4f})")
print(f"Best BERTScore F1: {best_bertscore_f1:.4f} (Baseline: {baseline_scores['bertscore_f1']:.4f})")

推理階段(後處理可加BART輔助)

In [None]:
import json
import torch
import re
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, BartForConditionalGeneration, BartTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import spacy
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy English pipeline (used below to split text into sentences)
nlp = spacy.load("en_core_web_sm")

# Strip markup, LaTeX command wrappers and unexpected characters
def clean_text(text):
    """Return ``text`` with tags, LaTeX wrappers and stray characters removed.

    The substitutions run in order:
      1. drop ``<...>`` tags;
      2. replace ``\\command{content}`` with ``content``;
      3. replace every character that is neither ASCII nor in the explicit
         keep-list (quotes, dashes, selected math/Greek symbols) with a space;
      4. collapse whitespace runs to a single space.
    The result is stripped of leading and trailing whitespace.
    """
    substitutions = (
        (r'<[^>]+>', ''),
        (r'\\[a-zA-Z]+\{([^}]*)\}', r'\1'),
        (r'[^\x00-\x7F’‘–—∑∂∇∞θπ𝒟𝒫𝒩αβγδελμσφωℝ𝔽𝓛()]', ' '),  # parentheses are kept
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Select the compute device (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the Pegasus-ArXiv model and tokenizer
model_name = "google/pegasus-arxiv"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# 2. Load a dataset file and clean its text fields
def load_json(file_path):
    """Load a JSON-Lines file and clean its text fields.

    Each non-blank line is parsed as one JSON object. The ``introduction``
    field of every record is passed through ``clean_text``; ``abstract`` is
    cleaned too when present.

    The file is read as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding, and blank lines (for example a trailing
    newline at the end of the file) are skipped instead of raising a decode
    error.

    Args:
        file_path: Path of the JSON-Lines file.

    Returns:
        List of record dicts with cleaned text fields.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

test_data = load_json("test.json")  # 103 samples

# 3. Sentence-transformers embedder used by the relevance-based truncation below
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def select_important_sentences(introduction, tokenizer, max_length=1024):
    """Pick the highest-scoring sentences that fit into ``max_length`` tokens.

    Sentences come from the notebook-level spaCy pipeline ``nlp``. Bare
    citation markers, short "<section word> <year>" fragments and sentences
    of three words or fewer are discarded. Each remaining sentence is scored
    by cosine similarity between its embedding (notebook-level ``embedder``)
    and the mean sentence embedding; sentences containing an academic
    keyword get a 1.5x multiplier. Sentences are taken in descending score
    order until the next one would exceed the budget of ``max_length - 2``
    tokens, and the selection (in score order) is tokenized with padding.

    If at most one sentence survives filtering, the whole introduction is
    tokenized with truncation instead.

    Returns:
        Tuple ``(input_ids, attention_mask)``, each squeezed.
    """
    def _encode_padded(text):
        encoded = tokenizer(text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze()

    citation_only = re.compile(r'^\[\d+\]$')  # e.g. "[12]"
    year_fragment = re.compile(r'^\s*(features|methods|results)?\s*\d{4}\s*$', re.IGNORECASE)  # e.g. "features 2021"

    def _is_informative(raw_sentence):
        stripped = raw_sentence.strip()
        if citation_only.match(stripped):
            return False
        if year_fragment.search(stripped):
            return False
        # Very short sentences rarely carry information.
        return len(stripped.split()) > 3

    sentences = [sent.text for sent in nlp(introduction).sents if _is_informative(sent.text)]

    if len(sentences) <= 1:
        return _encode_padded(introduction)

    embeddings = embedder.encode(sentences)
    centroid = np.mean(embeddings, axis=0).reshape(1, -1)
    similarities = cosine_similarity(embeddings, centroid).flatten()

    academic_keywords = ["propose", "method", "approach", "result", "finding", "conclusion",
                        "demonstrate", "show", "achieve", "contribution", "investigate", "study",
                        "analysis", "evaluate", "performance", "improve", "novel", "framework"]
    has_keyword = np.array(
        [any(keyword in sentence.lower() for keyword in academic_keywords) for sentence in sentences],
        dtype=bool,
    )
    scores = similarities.copy()
    scores[has_keyword] *= 1.5

    token_budget = max_length - 2
    chosen = []
    used_tokens = 0
    for idx in np.argsort(scores)[::-1]:
        candidate = sentences[idx]
        cost = len(tokenizer.encode(candidate, add_special_tokens=False))
        if used_tokens + cost > token_budget:
            break
        chosen.append(candidate)
        used_tokens += cost

    return _encode_padded(" ".join(chosen))

# 4. Dataset wrapper for the test set (inference only)
class TestPaperDataset(Dataset):
    """Map-style dataset yielding model inputs for each test record.

    Each item is a dict with the tokenized ``input_ids`` and
    ``attention_mask`` produced by ``select_important_sentences``, plus the
    record's ``paper_id`` and its ``introduction`` text for post-processing.
    """

    def __init__(self, data, tokenizer, max_input_length=1024):
        self.data, self.tokenizer, self.max_input_length = data, tokenizer, max_input_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        intro_text = record["introduction"]
        ids, mask = select_important_sentences(intro_text, self.tokenizer, self.max_input_length)
        sample = {
            "input_ids": ids,
            "attention_mask": mask,
            "paper_id": record["paper_id"],
            "introduction": intro_text,  # kept for the keyword post-processing step
        }
        return sample

# Build the test DataLoader (order preserved: shuffle=False)
test_dataset = TestPaperDataset(test_data, tokenizer, max_input_length=1024)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# 5. Post-processing helpers: basic cleanup and keyword retention
def clean_abstract(abstract):
    """Clean a generated abstract and drop repeated sentences.

    Steps:
      1. run ``clean_text``;
      2. collapse a leading run of repeated "In" tokens to a single "In";
      3. split on ". " and keep only the first occurrence of each sentence;
      4. make sure a non-empty result ends with a period.

    Sentences are compared without trailing periods, so a duplicate that is
    the final sentence (and therefore still carries the closing ".") is
    removed as well; previously it slipped through the exact-match check.

    Returns:
        The cleaned abstract string.
    """
    words = clean_text(abstract).split()
    # Skip all but the last of a leading "In In ..." run.
    first = 0
    while first + 1 < len(words) and words[first] == "In" and words[first + 1] == "In":
        first += 1
    abstract = " ".join(words[first:])

    seen_sentences = set()
    cleaned_sentences = []
    for sentence in abstract.split(". "):
        key = sentence.rstrip(".")  # the last piece keeps the abstract's final "."
        if key and key not in seen_sentences:
            seen_sentences.add(key)
            cleaned_sentences.append(sentence)
    abstract = ". ".join(cleaned_sentences)
    if abstract and not abstract.endswith("."):
        abstract += "."
    return abstract

def extract_keywords(text, top_n=10):
    """Return up to ``top_n`` terms of ``text`` ranked by TF-IDF weight.

    A ``TfidfVectorizer`` with English stop words is fitted on ``text`` as a
    single document and the highest-weighted terms are returned, best first.

    Degenerate inputs return an empty list instead of failing: a non-positive
    ``top_n`` (``scores.argsort()[-0:]`` would otherwise select every term),
    empty or whitespace-only text, and text with no usable vocabulary, for
    which the vectorizer raises ``ValueError``.

    Args:
        text: Source text.
        top_n: Maximum number of keywords to return.

    Returns:
        List of keyword strings (possibly empty).
    """
    if top_n <= 0 or not text or not text.strip():
        return []
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # Raised when the text yields an empty vocabulary (e.g. only stop words).
        return []
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    keyword_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[idx] for idx in keyword_indices]

def ensure_keywords_in_abstract(abstract, original_text):
    """Append the source text's top keywords that the abstract is missing.

    The abstract is first capped at 660 whitespace-separated words. Each of
    the ten keywords from ``extract_keywords(original_text)`` that does not
    occur (case-insensitive substring match) in the capped abstract is then
    appended, separated by a space. Presence is checked against the abstract
    as it was before any keyword was appended.
    """
    keywords = extract_keywords(original_text, top_n=10)
    word_list = abstract.split()
    if len(word_list) > 660:
        abstract = " ".join(word_list[:660])
    lowered = abstract.lower()
    missing = [keyword for keyword in keywords if keyword.lower() not in lowered]
    return abstract + "".join(f" {keyword}" for keyword in missing)

# 6. BART model and tokenizer used for post-processing
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large").to(device)

def enhance_abstract_with_bart(abstract):
    """Rewrite ``abstract`` with the notebook-level BART model and clean it.

    The text is tokenized (truncated to 660 tokens), passed through
    ``bart_model.generate`` with beam search, decoded with
    ``bart_tokenizer`` and finally run through ``clean_abstract``.
    """
    encoded = bart_tokenizer(abstract, return_tensors="pt", max_length=660, truncation=True)
    generation_kwargs = {
        "input_ids": encoded["input_ids"].to(device),
        "attention_mask": encoded["attention_mask"].to(device),
        "max_length": 660,
        "min_length": 100,
        "num_beams": 12,
        "length_penalty": 1.0,
        "repetition_penalty": 1.2,
        "early_stopping": True,
    }
    beams = bart_model.generate(**generation_kwargs)
    rewritten = bart_tokenizer.decode(beams[0], skip_special_tokens=True)
    return clean_abstract(rewritten)

# 7. Inference: load the best fine-tuned checkpoint (if present) before generating for test_data
checkpoint_dir = "pegasus-arxiv_checkpoints"
checkpoint_path = os.path.join(checkpoint_dir, "best_model.pth")
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint saved on a GPU be loaded on whatever
    # device this session runs on (including CPU-only machines).
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded checkpoint from {checkpoint_path} (epoch {checkpoint['epoch']}, loss {checkpoint['loss']:.4f})")
else:
    # `model` was re-created from the pretrained weights earlier in this cell,
    # so without a checkpoint inference runs on those weights.
    print(f"Checkpoint {checkpoint_path} not found, using the pretrained {model_name} weights.")

# Put the model into evaluation mode
model.eval()
predictions = []

def generate_segmented_abstract(input_ids, attention_mask, segment_length=1024, overlap=128):
    """Generate an abstract, summarizing over-long inputs window by window.

    Inputs no longer than ``segment_length`` are summarized in one call.
    Longer inputs are cut into windows of ``segment_length`` positions that
    overlap by ``overlap`` positions; each window is summarized separately
    and the partial summaries are joined with spaces.

    The window loop stops as soon as a window reaches the end of the input.
    (Previously the next start was always ``end - overlap``, which never
    reaches ``total_length``, so any input longer than ``segment_length``
    looped forever.)

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        input_ids: Tensor indexed as ``[batch, sequence]``; only the first
            generated sequence is decoded.
        attention_mask: Mask sliced alongside ``input_ids``.
        segment_length: Window size in positions.
        overlap: Number of positions shared by consecutive windows.

    Returns:
        The decoded abstract string.

    Raises:
        ValueError: If windows are needed and ``overlap`` is not in
            ``[0, segment_length)``, since the windows could not advance.
    """
    def _summarize(ids, mask):
        # One beam-search generation call, decoded to text.
        generated_ids = model.generate(
            input_ids=ids,
            attention_mask=mask,
            max_length=660,
            min_length=50,
            num_beams=6,
            repetition_penalty=1.0,
            length_penalty=1.5,
            early_stopping=False
        )
        return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    total_length = input_ids.shape[1]
    if total_length <= segment_length:
        return _summarize(input_ids, attention_mask)

    if not 0 <= overlap < segment_length:
        raise ValueError("overlap must satisfy 0 <= overlap < segment_length")

    stride = segment_length - overlap
    segments = []
    start = 0
    while True:
        end = min(start + segment_length, total_length)
        segments.append(_summarize(input_ids[:, start:end], attention_mask[:, start:end]))
        if end >= total_length:
            break
        # Advance to the next window, keeping `overlap` positions of context.
        start += stride

    return " ".join(segments)


test_paper_ids = [item["paper_id"] for item in test_data]
prediction_dict = {}

# Generate, clean, rewrite and keyword-patch one abstract per test paper.
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(test_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        paper_ids = batch["paper_id"]
        introductions = batch["introduction"]

        for i in range(len(paper_ids)):
            # Pipeline: segmented generation -> basic cleanup -> BART rewrite -> keyword retention
            abstract = generate_segmented_abstract(input_ids[i:i+1], attention_mask[i:i+1], segment_length=1024)
            abstract = clean_abstract(abstract)
            abstract = enhance_abstract_with_bart(abstract)
            abstract = ensure_keywords_in_abstract(abstract, introductions[i])

            # Normalize the id to a plain string (tensor / array ids are unwrapped first)
            paper_id = paper_ids[i]
            if isinstance(paper_id, (torch.Tensor, np.ndarray)):
                paper_id = paper_id.item()
            paper_id = str(paper_id)

            # Store the prediction as a string
            if not isinstance(abstract, str):
                abstract = str(abstract)
            prediction_dict[paper_id] = abstract

# Emit predictions in the paper_id order of test.json; missing ids get an empty abstract
predictions = [
    {"paper_id": str(pid), "abstract": prediction_dict.get(str(pid), "")}
    for pid in test_paper_ids
]

# 9. Save the predictions as JSON Lines
with open("submission_arxiv.json", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(pred, ensure_ascii=False) + "\n" for pred in predictions)

print(f"Predictions saved to submission_arxiv.json, total predictions: {len(predictions)}")

In [None]:
import json
import torch
import re
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, BartForConditionalGeneration, BartTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import spacy
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the spaCy English pipeline (used below to split text into sentences)
nlp = spacy.load("en_core_web_sm")

# Strip markup, LaTeX command wrappers and unexpected characters
def clean_text(text):
    """Return ``text`` with tags, LaTeX wrappers and stray characters removed.

    The substitutions run in order:
      1. drop ``<...>`` tags;
      2. replace ``\\command{content}`` with ``content``;
      3. replace every character that is neither ASCII nor in the explicit
         keep-list (quotes, dashes, selected math/Greek symbols) with a space;
      4. collapse whitespace runs to a single space.
    The result is stripped of leading and trailing whitespace.
    """
    substitutions = (
        (r'<[^>]+>', ''),
        (r'\\[a-zA-Z]+\{([^}]*)\}', r'\1'),
        (r'[^\x00-\x7F’‘–—∑∂∇∞θπ𝒟𝒫𝒩αβγδελμσφωℝ𝔽𝓛()]', ' '),  # parentheses are kept
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Select the compute device (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load the Pegasus-ArXiv model and tokenizer
model_name = "google/pegasus-arxiv"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

# 2. Load a dataset file and clean its text fields
def load_json(file_path):
    """Load a JSON-Lines file and clean its text fields.

    Each non-blank line is parsed as one JSON object. The ``introduction``
    field of every record is passed through ``clean_text``; ``abstract`` is
    cleaned too when present.

    The file is read as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding, and blank lines (for example a trailing
    newline at the end of the file) are skipped instead of raising a decode
    error.

    Args:
        file_path: Path of the JSON-Lines file.

    Returns:
        List of record dicts with cleaned text fields.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

test_data = load_json("test.json")  # 103 samples

# 3. Sentence-transformers embedder used by the relevance-based truncation below
embedder = SentenceTransformer("all-MiniLM-L6-v2")

def select_important_sentences(introduction, tokenizer, max_length=1024):
    """Pick the highest-scoring sentences that fit into ``max_length`` tokens.

    Sentences come from the notebook-level spaCy pipeline ``nlp``. Each one
    is scored by cosine similarity between its embedding (notebook-level
    ``embedder``) and the mean sentence embedding; sentences containing an
    academic keyword get a 1.5x multiplier. Sentences are taken in
    descending score order until the next one would exceed the budget of
    ``max_length - 2`` tokens, and the selection (in score order) is
    tokenized with padding.

    If the text has at most one sentence, the whole introduction is
    tokenized with truncation instead.

    Returns:
        Tuple ``(input_ids, attention_mask)``, each squeezed.
    """
    def _encode_padded(text):
        encoded = tokenizer(text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze()

    sentences = [sent.text for sent in nlp(introduction).sents]
    if len(sentences) <= 1:
        return _encode_padded(introduction)

    embeddings = embedder.encode(sentences)
    centroid = np.mean(embeddings, axis=0).reshape(1, -1)
    similarities = cosine_similarity(embeddings, centroid).flatten()

    academic_keywords = ["propose", "method", "approach", "result", "finding", "conclusion",
                        "demonstrate", "show", "achieve", "contribution", "investigate", "study",
                        "analysis", "evaluate", "performance", "improve", "novel", "framework"]
    has_keyword = np.array(
        [any(keyword in sentence.lower() for keyword in academic_keywords) for sentence in sentences],
        dtype=bool,
    )
    scores = similarities.copy()
    scores[has_keyword] *= 1.5

    token_budget = max_length - 2
    chosen = []
    used_tokens = 0
    for idx in np.argsort(scores)[::-1]:
        candidate = sentences[idx]
        cost = len(tokenizer.encode(candidate, add_special_tokens=False))
        if used_tokens + cost > token_budget:
            break
        chosen.append(candidate)
        used_tokens += cost

    return _encode_padded(" ".join(chosen))

# 4. Dataset wrapper for the test set (inference only)
class TestPaperDataset(Dataset):
    """Map-style dataset yielding model inputs for each test record.

    Each item is a dict with the tokenized ``input_ids`` and
    ``attention_mask`` produced by ``select_important_sentences``, plus the
    record's ``paper_id`` and its ``introduction`` text for post-processing.
    """

    def __init__(self, data, tokenizer, max_input_length=1024):
        self.data, self.tokenizer, self.max_input_length = data, tokenizer, max_input_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        intro_text = record["introduction"]
        ids, mask = select_important_sentences(intro_text, self.tokenizer, self.max_input_length)
        sample = {
            "input_ids": ids,
            "attention_mask": mask,
            "paper_id": record["paper_id"],
            "introduction": intro_text,  # kept for the keyword post-processing step
        }
        return sample

# Build the test DataLoader (order preserved: shuffle=False)
test_dataset = TestPaperDataset(test_data, tokenizer, max_input_length=1024)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# 5. Post-processing helpers: basic cleanup and keyword retention
def clean_abstract(abstract):
    """Clean a generated abstract and drop repeated sentences.

    Steps:
      1. run ``clean_text``;
      2. collapse a leading run of repeated "In" tokens to a single "In";
      3. split on ". " and keep only the first occurrence of each sentence;
      4. make sure a non-empty result ends with a period.

    Sentences are compared without trailing periods, so a duplicate that is
    the final sentence (and therefore still carries the closing ".") is
    removed as well; previously it slipped through the exact-match check.

    Returns:
        The cleaned abstract string.
    """
    words = clean_text(abstract).split()
    # Skip all but the last of a leading "In In ..." run.
    first = 0
    while first + 1 < len(words) and words[first] == "In" and words[first + 1] == "In":
        first += 1
    abstract = " ".join(words[first:])

    seen_sentences = set()
    cleaned_sentences = []
    for sentence in abstract.split(". "):
        key = sentence.rstrip(".")  # the last piece keeps the abstract's final "."
        if key and key not in seen_sentences:
            seen_sentences.add(key)
            cleaned_sentences.append(sentence)
    abstract = ". ".join(cleaned_sentences)
    if abstract and not abstract.endswith("."):
        abstract += "."
    return abstract

def extract_keywords(text, top_n=10):
    """Return up to ``top_n`` terms of ``text`` ranked by TF-IDF weight.

    A ``TfidfVectorizer`` with English stop words is fitted on ``text`` as a
    single document and the highest-weighted terms are returned, best first.

    Degenerate inputs return an empty list instead of failing: a non-positive
    ``top_n`` (``scores.argsort()[-0:]`` would otherwise select every term),
    empty or whitespace-only text, and text with no usable vocabulary, for
    which the vectorizer raises ``ValueError``.

    Args:
        text: Source text.
        top_n: Maximum number of keywords to return.

    Returns:
        List of keyword strings (possibly empty).
    """
    if top_n <= 0 or not text or not text.strip():
        return []
    vectorizer = TfidfVectorizer(stop_words="english")
    try:
        tfidf_matrix = vectorizer.fit_transform([text])
    except ValueError:
        # Raised when the text yields an empty vocabulary (e.g. only stop words).
        return []
    feature_names = vectorizer.get_feature_names_out()
    scores = tfidf_matrix.toarray()[0]
    keyword_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[idx] for idx in keyword_indices]

def ensure_keywords_in_abstract(abstract, original_text):
    """Append the source text's top keywords that the abstract is missing.

    The abstract is first capped at 660 whitespace-separated words. Each of
    the ten keywords from ``extract_keywords(original_text)`` that does not
    occur (case-insensitive substring match) in the capped abstract is then
    appended, separated by a space. Presence is checked against the abstract
    as it was before any keyword was appended.
    """
    keywords = extract_keywords(original_text, top_n=10)
    word_list = abstract.split()
    if len(word_list) > 660:
        abstract = " ".join(word_list[:660])
    lowered = abstract.lower()
    missing = [keyword for keyword in keywords if keyword.lower() not in lowered]
    return abstract + "".join(f" {keyword}" for keyword in missing)

# 6. BART model and tokenizer used for post-processing
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large").to(device)

def enhance_abstract_with_bart(abstract):
    """Rewrite ``abstract`` with the notebook-level BART model and clean it.

    The text is tokenized (truncated to 660 tokens), passed through
    ``bart_model.generate`` with beam search, decoded with
    ``bart_tokenizer`` and finally run through ``clean_abstract``.
    """
    encoded = bart_tokenizer(abstract, return_tensors="pt", max_length=660, truncation=True)
    generation_kwargs = {
        "input_ids": encoded["input_ids"].to(device),
        "attention_mask": encoded["attention_mask"].to(device),
        "max_length": 660,
        "min_length": 100,
        "num_beams": 12,
        "length_penalty": 1.0,
        "repetition_penalty": 1.2,
        "early_stopping": True,
    }
    beams = bart_model.generate(**generation_kwargs)
    rewritten = bart_tokenizer.decode(beams[0], skip_special_tokens=True)
    return clean_abstract(rewritten)

# 7. Inference: load the best fine-tuned checkpoint (if present) before generating for test_data
checkpoint_dir = "pegasus-arxiv_checkpoints"
checkpoint_path = os.path.join(checkpoint_dir, "best_model.pth")
if os.path.exists(checkpoint_path):
    # map_location lets a checkpoint saved on a GPU be loaded on whatever
    # device this session runs on (including CPU-only machines).
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded checkpoint from {checkpoint_path} (epoch {checkpoint['epoch']}, loss {checkpoint['loss']:.4f})")
else:
    # `model` was re-created from the pretrained weights earlier in this cell,
    # so without a checkpoint inference runs on those weights.
    print(f"Checkpoint {checkpoint_path} not found, using the pretrained {model_name} weights.")

# Put the model into evaluation mode
model.eval()
predictions = []

def generate_segmented_abstract(input_ids, attention_mask, segment_length=1024, overlap=128):
    """Generate an abstract, summarizing over-long inputs window by window.

    Inputs no longer than ``segment_length`` are summarized in one call.
    Longer inputs are cut into windows of ``segment_length`` positions that
    overlap by ``overlap`` positions; each window is summarized separately
    and the partial summaries are joined with spaces.

    The window loop stops as soon as a window reaches the end of the input.
    (Previously the next start was always ``end - overlap``, which never
    reaches ``total_length``, so any input longer than ``segment_length``
    looped forever.)

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        input_ids: Tensor indexed as ``[batch, sequence]``; only the first
            generated sequence is decoded.
        attention_mask: Mask sliced alongside ``input_ids``.
        segment_length: Window size in positions.
        overlap: Number of positions shared by consecutive windows.

    Returns:
        The decoded abstract string.

    Raises:
        ValueError: If windows are needed and ``overlap`` is not in
            ``[0, segment_length)``, since the windows could not advance.
    """
    def _summarize(ids, mask):
        # One beam-search generation call, decoded to text.
        generated_ids = model.generate(
            input_ids=ids,
            attention_mask=mask,
            max_length=660,
            min_length=50,
            num_beams=6,
            repetition_penalty=1.0,
            length_penalty=1.5,
            early_stopping=False
        )
        return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    total_length = input_ids.shape[1]
    if total_length <= segment_length:
        return _summarize(input_ids, attention_mask)

    if not 0 <= overlap < segment_length:
        raise ValueError("overlap must satisfy 0 <= overlap < segment_length")

    stride = segment_length - overlap
    segments = []
    start = 0
    while True:
        end = min(start + segment_length, total_length)
        segments.append(_summarize(input_ids[:, start:end], attention_mask[:, start:end]))
        if end >= total_length:
            break
        # Advance to the next window, keeping `overlap` positions of context.
        start += stride

    return " ".join(segments)


test_paper_ids = [item["paper_id"] for item in test_data]
prediction_dict = {}

# Generate, clean, rewrite and keyword-patch one abstract per test paper.
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(test_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        paper_ids = batch["paper_id"]
        introductions = batch["introduction"]

        for i in range(len(paper_ids)):
            # Pipeline: segmented generation -> basic cleanup -> BART rewrite -> keyword retention
            abstract = generate_segmented_abstract(input_ids[i:i+1], attention_mask[i:i+1], segment_length=1024)
            abstract = clean_abstract(abstract)
            abstract = enhance_abstract_with_bart(abstract)
            abstract = ensure_keywords_in_abstract(abstract, introductions[i])

            # Normalize the id to a plain string (tensor / array ids are unwrapped first)
            paper_id = paper_ids[i]
            if isinstance(paper_id, (torch.Tensor, np.ndarray)):
                paper_id = paper_id.item()
            paper_id = str(paper_id)

            # Store the prediction as a string
            if not isinstance(abstract, str):
                abstract = str(abstract)
            prediction_dict[paper_id] = abstract

# Emit predictions in the paper_id order of test.json; missing ids get an empty abstract
predictions = [
    {"paper_id": str(pid), "abstract": prediction_dict.get(str(pid), "")}
    for pid in test_paper_ids
]

# 9. Save the predictions as JSON Lines
with open("submission_arxiv.json", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(pred, ensure_ascii=False) + "\n" for pred in predictions)

print(f"Predictions saved to submission_arxiv.json, total predictions: {len(predictions)}")

# LED

微調訓練階段

In [None]:
import json
import torch
import re
import os
import numpy as np
import spacy
from torch.utils.data import Dataset, DataLoader
from transformers import LEDForConditionalGeneration, LEDTokenizer
from torch.nn import CrossEntropyLoss
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import autocast, GradScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import evaluate

# Load the spaCy English pipeline; it is used below to split introductions into sentences
nlp = spacy.load("en_core_web_sm")

# Remove LaTeX, citations, figure/table references and other noise
def clean_text(text):
    """Return ``text`` reduced to plain prose.

    The substitutions run in order: LaTeX commands with a braced argument,
    ``$$...$$`` and ``$...$`` math, numeric citations such as ``[3]`` and
    ``[3, 4]``, "Fig. N ..." and "Table N ..." references up to the end of
    their sentence, then every character outside letters, digits,
    whitespace and ``.,!?()-`` is replaced by a space. Finally whitespace
    runs are collapsed and the result is stripped.
    """
    removals = (
        r'\\[\w]+\{.*?\}',
        r'\$\$.*?\$\$',
        r'\$.*?\$',
        r'\[\d+\]',
        r'\[\d+,\s*\d+\]',
        r'Fig\.\s*\d+.*?(?=\.\s|$)',
        r'Table\s*\d+.*?(?=\.\s|$)',
    )
    for pattern in removals:
        text = re.sub(pattern, '', text)
    for pattern in (r'[^a-zA-Z0-9\s.,!?()-]', r'\s+'):
        text = re.sub(pattern, ' ', text)
    return text.strip()

# Select the compute device (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the LED base model and its tokenizer from the same checkpoint name
model_name = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)

# Wrap the model with LoRA adapters on the attention projections
# (q/k/v/out) and the feed-forward layers (fc1/fc2)
lora_config = LoraConfig(
    r=128,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_config)
# Move the wrapped model to the device, then report the trainable parameters
model = model.to(device)
model.print_trainable_parameters()

# Load dataset
def load_json(file_path):
    """Load a JSON-Lines file and clean its text fields.

    Each non-blank line is parsed as one JSON object. The ``introduction``
    field of every record is passed through ``clean_text``; ``abstract`` is
    cleaned too when present.

    The file is read as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding, and blank lines (for example a trailing
    newline at the end of the file) are skipped instead of raising a decode
    error.

    Args:
        file_path: Path of the JSON-Lines file.

    Returns:
        List of record dicts with cleaned text fields.
    """
    data = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

train_data = load_json("train.json")
test_data = load_json("test.json")
# Hold out the last 20% of the training records for validation.
# Slicing by an explicit split index (instead of negative indices) stays
# correct when val_size is 0: `[-0:]` would otherwise put every record into
# validation and `[:-0]` would leave the training set empty.
val_size = int(0.2 * len(train_data))
split_index = len(train_data) - val_size
val_data = train_data[split_index:]
train_data = train_data[:split_index]

# Truncation function
def _score_middle_sentences(middle_sentences, abstract=None):
    """Return one importance score per middle sentence (higher = keep first)."""
    if abstract:
        # Fraction of the sentence's distinct words that also occur in the abstract.
        abstract_words = set(abstract.lower().split())
        scores = []
        for sent in middle_sentences:
            sent_words = set(sent.lower().split())
            scores.append(len(sent_words & abstract_words) / len(sent_words) if sent_words else 0)
        return scores
    try:
        tfidf_matrix = TfidfVectorizer(stop_words="english").fit_transform(middle_sentences)
    except ValueError:
        # scikit-learn raises when the vocabulary is empty (e.g. the sentences
        # contain only stop words); treat every sentence as equally important.
        return [0.0] * len(middle_sentences)
    return tfidf_matrix.sum(axis=1).A1


def truncate_from_ends_with_importance(introduction, abstract=None, tokenizer=tokenizer, max_model_length=16384, attention_window=1024):
    """Tokenize an introduction, keeping both ends and the best-scoring middle sentences.

    The first and last 20-40% of sentences (depending on token length) are
    always kept. Middle sentences are added in descending score order until
    the token budget is exhausted, and the kept ones are then emitted in
    their original document order.

    Args:
        introduction: Cleaned introduction text.
        abstract: Optional reference abstract. When given, middle sentences are
            scored by word overlap with it; otherwise by summed TF-IDF weight.
            NOTE(review): using the target abstract to pick input sentences is
            only possible when a reference exists - confirm this matches the
            selection used at inference time.
        tokenizer: Hugging Face tokenizer used for length counting and encoding.
        max_model_length: Upper bound on the input length in tokens.
        attention_window: The padded length is rounded up to a multiple of this.

    Returns:
        Tuple (input_ids, attention_mask, max_input_length); both tensors are
        padded to max_input_length.
    """
    token_length = len(tokenizer.encode(introduction, add_special_tokens=True))
    min_length = 128
    # Clamp the text length to [min_length, max_model_length], then round up
    # to a whole number of attention windows.
    max_input_length = min(max_model_length, max(min_length, token_length))
    max_input_length = ((max_input_length + attention_window - 1) // attention_window) * attention_window

    def _encode(text):
        encoded = tokenizer(text, truncation=True, max_length=max_input_length, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze(), max_input_length

    sentences = [sent.text for sent in nlp(introduction).sents]
    if len(sentences) <= 1:
        # Nothing to select from: let the tokenizer truncate the raw text.
        return _encode(introduction)

    num_sentences = len(sentences)
    proportion = 0.3 if 1024 <= token_length <= 4096 else (0.4 if token_length < 1024 else 0.2)
    num_end_sentences = max(1, int(num_sentences * proportion))
    start_sentences = sentences[:num_end_sentences]
    end_sentences = sentences[-num_end_sentences:]
    middle_sentences = sentences[num_end_sentences:-num_end_sentences]

    current_token_count = (
        len(tokenizer.encode(" ".join(start_sentences), add_special_tokens=False))
        + len(tokenizer.encode(" ".join(end_sentences), add_special_tokens=False))
    )

    kept_indices = []
    if middle_sentences:
        scores = _score_middle_sentences(middle_sentences, abstract)
        for idx in np.argsort(scores)[::-1]:
            sentence_len = len(tokenizer.encode(middle_sentences[idx], add_special_tokens=False))
            # Two tokens are held back, presumably for the special tokens
            # added at encoding time.
            if current_token_count + sentence_len > max_input_length - 2:
                break
            kept_indices.append(int(idx))
            current_token_count += sentence_len

    # Re-sort the survivors by position so the model reads the text in document
    # order instead of score order.
    selected_middle_sentences = [middle_sentences[i] for i in sorted(kept_indices)]
    truncated_introduction = " ".join(start_sentences + selected_middle_sentences + end_sentences)
    return _encode(truncated_introduction)

# Dataset definitions
class TrainPaperDataset(Dataset):
    """Map-style dataset that turns (introduction, abstract) records into model inputs.

    Each record is expected to provide the keys "introduction", "abstract"
    and "paper_id".
    """

    def __init__(self, data, tokenizer, max_target_length=800):
        self.data, self.tokenizer, self.max_target_length = data, tokenizer, max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded source, masked labels and metadata for record `idx`."""
        record = self.data[idx]
        reference = record["abstract"]

        source_ids, source_mask, padded_length = truncate_from_ends_with_importance(
            record["introduction"], reference, self.tokenizer
        )

        encoded_target = self.tokenizer(
            reference,
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        label_ids = encoded_target["input_ids"].squeeze()
        # Padding positions are set to -100 so they can be ignored by the loss.
        is_padding = label_ids == self.tokenizer.pad_token_id
        label_ids[is_padding] = -100

        sample = {}
        sample["input_ids"] = source_ids
        sample["attention_mask"] = source_mask
        sample["labels"] = label_ids
        sample["paper_id"] = record["paper_id"]
        sample["abstract"] = reference
        sample["max_input_length"] = padded_length
        return sample

# Collate function

def custom_collate_fn(batch):
    """Combine dataset samples into one batch, right-padding inputs with zeros.

    Args:
        batch: List of sample dicts with the keys "input_ids",
            "attention_mask", "labels", "paper_id", "abstract" and
            "max_input_length".

    Returns:
        Dict with 2-D long tensors "input_ids" and "attention_mask" (width =
        largest "max_input_length" in the batch), stacked "labels", and plain
        lists for "paper_id", "abstract" and "max_input_length".
    """
    batch_size = len(batch)
    padded_lengths = [sample["max_input_length"] for sample in batch]
    width = max(padded_lengths)

    # Zero-filled canvases; cells beyond a sample's own length stay 0.
    ids_canvas = torch.zeros((batch_size, width), dtype=torch.long)
    mask_canvas = torch.zeros((batch_size, width), dtype=torch.long)
    for row, sample in enumerate(batch):
        sample_ids = sample["input_ids"]
        used = sample_ids.size(0)
        ids_canvas[row, :used] = sample_ids
        mask_canvas[row, :used] = sample["attention_mask"]

    stacked_labels = torch.stack([sample["labels"] for sample in batch])

    return {
        "input_ids": ids_canvas,
        "attention_mask": mask_canvas,
        "labels": stacked_labels,
        "paper_id": [sample["paper_id"] for sample in batch],
        "abstract": [sample["abstract"] for sample in batch],
        "max_input_length": padded_lengths,
    }

# Dataloader
# Both loaders use batch_size=1; only the training loader is shuffled.
train_dataset = TrainPaperDataset(train_data, tokenizer)
val_dataset = TrainPaperDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)

# Training settings
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
num_epochs = 300
eval_frequency = 5  # generation-based metrics are computed every 5th epoch
scaler = GradScaler()  # loss scaler used with the autocast forward passes

# Warm-up multiplier that rises linearly from 1/20 to 1.0 over 20 scheduler steps.
# NOTE(review): both schedulers below are attached to the same optimizer and are
# each stepped once per epoch by the training loop. LambdaLR recomputes the lr
# from the base lr on every step, so it is worth confirming that the cosine
# decay actually accumulates; torch.optim.lr_scheduler.SequentialLR is the usual
# way to chain warm-up and decay. Also note T_max=150 while num_epochs=300.
warm_up_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: min((epoch + 1) / 20, 1.0))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=150, eta_min=1e-6)

checkpoint_dir = "led-arxiv_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Early-stopping state: the validation loss is averaged over the last
# `window_size` epochs, and training stops after `patience` epochs without a
# new best average.
best_val_loss = float('inf')
val_loss_window = []
window_size = 3
patience_counter = 0
patience = 100

# Reference scores that are printed next to each evaluation result.
baseline_scores = {"rouge1": 0.47, "rouge2": 0.12, "rougeL": 0.22, "bertscore_f1": 0.85}
best_rouge1 = 0.0
best_rouge2 = 0.0
best_rougeL = 0.0
best_bertscore_f1 = 0.0

metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bertscore = evaluate.load("bertscore")

# Training loop
# TrainPaperDataset replaces padded label positions with -100, so the
# label-smoothed loss must ignore -100. Ignoring pad_token_id instead would
# leave -100 in the targets as an out-of-range class index.
loss_fct = CrossEntropyLoss(ignore_index=-100, label_smoothing=0.1)

model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    optimizer.zero_grad()
    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        with autocast():
            # `labels` is still passed so the model derives its decoder inputs from it;
            # the loss used for the update is the label-smoothed one computed here.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = loss_fct(outputs.logits.view(-1, outputs.logits.size(-1)), labels.view(-1))
        total_loss += loss.item()
        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale at this point;
        # unscale them first so max_norm applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f"Average Training Loss: {avg_loss:.4f}")

    # Validation (uses the model's built-in loss, i.e. without label smoothing)
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss:.4f}")
    # Smooth the validation loss over the last `window_size` epochs before
    # comparing it with the best value seen so far.
    val_loss_window.append(val_loss)
    if len(val_loss_window) > window_size:
        val_loss_window.pop(0)
    smoothed_val_loss = sum(val_loss_window) / len(val_loss_window)

    if smoothed_val_loss < best_val_loss:
        best_val_loss = smoothed_val_loss
        patience_counter = 0
        model.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora"))
        tokenizer.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora"))
        print("Best model saved.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # === ROUGE/BERTScore Evaluation every `eval_frequency` ===
    if (epoch + 1) % eval_frequency == 0:
        model.eval()
        predicted_abstracts = []
        reference_abstracts = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=800,
                    min_length=10,
                    num_beams=15,
                    length_penalty=1.0,
                    repetition_penalty=1.1,
                    early_stopping=True,
                    no_repeat_ngram_size=3
                )
                # Decode every sequence in the batch so predictions and
                # references stay aligned for any validation batch size.
                predicted_abstracts.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
                reference_abstracts.extend(batch["abstract"])

        rouge_scores = metric_rouge.compute(predictions=predicted_abstracts, references=reference_abstracts, use_stemmer=True)
        bert_scores = metric_bertscore.compute(predictions=predicted_abstracts, references=reference_abstracts, lang="en")

        current_rouge1 = rouge_scores['rouge1']
        current_rouge2 = rouge_scores['rouge2']
        current_rougeL = rouge_scores['rougeL']
        current_bertscore_f1 = np.mean(bert_scores['f1'])

        print("\n=== Validation Evaluation Results ===")
        print(f"ROUGE-1: {current_rouge1:.4f} (Baseline: {baseline_scores['rouge1']:.4f})")
        print(f"ROUGE-2: {current_rouge2:.4f} (Baseline: {baseline_scores['rouge2']:.4f})")
        print(f"ROUGE-L: {current_rougeL:.4f} (Baseline: {baseline_scores['rougeL']:.4f})")
        print(f"BERTScore F1: {current_bertscore_f1:.4f} (Baseline: {baseline_scores['bertscore_f1']:.4f})")

        # The "best" metrics are tracked together, keyed on ROUGE-1.
        if current_rouge1 > best_rouge1:
            best_rouge1 = current_rouge1
            best_rouge2 = current_rouge2
            best_rougeL = current_rougeL
            best_bertscore_f1 = current_bertscore_f1
            print(f"New best ROUGE-1: {best_rouge1:.4f}")

    warm_up_scheduler.step()
    scheduler.step()
    model.train()

# Save final model (the adapter state at the last completed epoch, which is
# not necessarily the best checkpoint saved above).
model.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora_final"))
tokenizer.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora_final"))
print("Final model saved.")

In [None]:
import json
import torch
import re
import html
import numpy as np
import os
import spacy
from tqdm import tqdm
from transformers import LEDForConditionalGeneration, LEDTokenizer, PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset, DataLoader
from peft import PeftModel

# === Settings ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load models ===
# LED base weights plus the LoRA adapter stored under the training checkpoint directory.
base_model = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(base_model)
model = LEDForConditionalGeneration.from_pretrained(base_model)
model = PeftModel.from_pretrained(model, "led-arxiv_checkpoints/led-arxiv_lora")
model = model.to(device).eval()

# PEGASUS checkpoint used by enhance_with_pegasus below.
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large").to(device).eval()

# Flan-T5 checkpoint used by refine_with_flan_t5 below.
# NOTE(review): three generation models are moved to the same device here -
# confirm the available memory is sufficient.
flan_t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
flan_t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl").to(device).eval()

# spaCy pipeline for sentence splitting; sentence-embedding model for scoring sentences.
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# === 工具函數 ===
def clean_text(text):
    """Normalise raw paper text into plain ASCII-friendly prose.

    Decodes HTML entities, folds typographic dashes/quotes to ASCII, removes
    display math, unwraps inline math, drops one- and two-number citations and
    figure/table references, blanks out characters outside a whitelist, and
    collapses whitespace.

    Args:
        text: Raw string.

    Returns:
        The cleaned, stripped string.
    """
    # Decode HTML escape sequences such as &amp;.
    text = html.unescape(text)

    # Fold common Unicode punctuation into ASCII (dashes, smart quotes).
    ascii_equivalents = (
        ('\u2014', '-'),   # em dash
        ('\u2013', '-'),   # en dash
        ('\u201c', '"'),   # left double quote
        ('\u201d', '"'),   # right double quote
        ('\u2018', "'"),   # left single quote
        ('\u2019', "'"),   # right single quote / apostrophe
    )
    for fancy, plain in ascii_equivalents:
        text = text.replace(fancy, plain)

    # Regex passes, applied strictly in this order.
    substitutions = (
        # Drop display math entirely.
        (r'\$\$.*?\$\$', ''),
        # Inline math: keep the formula text, drop the dollar signs.
        (r'\$([^\$]*?)\$', r'\1'),
        # Citations such as [1] or [2, 3].
        (r'\[(\d+|\d+,\s*\d+)\]', ''),
        # Figure and table references.
        (r'(Fig\.|Figure)\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', ''),
        (r'Table\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', ''),
        # Blank out symbols outside the whitelist; hyphens are kept so terms
        # like DP-SGD or self-supervised survive.
        (r'[^a-zA-Z0-9\s.,!?()/%=:\-+<>_\[\]\"\'’]', ' '),
        # Collapse runs of whitespace.
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

def extract_keywords(text, top_n=15):
    """Return up to `top_n` of the highest-weighted TF-IDF terms of one text.

    Args:
        text: Document to extract keywords from. It is fitted as a
            single-document corpus with English stop words removed and at
            most 100 features.
        top_n: Maximum number of candidate terms to consider.

    Returns:
        A list of terms ordered by descending weight; terms of two characters
        or fewer are dropped. Returns an empty list when `top_n` is not
        positive or when the text yields no vocabulary.
    """
    # `scores.argsort()[-0:]` would select every term rather than none.
    if top_n <= 0:
        return []
    tfidf = TfidfVectorizer(stop_words="english", max_features=100)
    try:
        matrix = tfidf.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError for an empty vocabulary, e.g. when the
        # text is empty or consists only of stop words.
        return []
    scores = matrix.toarray()[0]
    feature_names = tfidf.get_feature_names_out()
    top_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[i] for i in top_indices if len(feature_names[i]) > 2]

def ensure_keywords_in_abstract(abstract, intro, top_n=15):
    """Append up to five introduction keywords that the abstract does not mention.

    Args:
        abstract: Generated abstract; it is cleaned with clean_text first.
        intro: Introduction text from which keywords are extracted.
        top_n: Number of keywords requested from extract_keywords.

    Returns:
        The cleaned abstract unchanged when no keyword is missing; otherwise
        the abstract's sentences re-joined with single spaces after the
        missing keywords were appended.
    """
    keywords = extract_keywords(intro, top_n)
    abstract = clean_text(abstract)
    sentences = [span.text for span in nlp(abstract).sents]

    # Case-insensitive substring test against the whole abstract.
    lowered_abstract = abstract.lower()
    missing = [kw for kw in keywords if kw.lower() not in lowered_abstract]
    if not missing:
        return abstract

    for kw in missing[:5]:
        # Prefer the first sentence with more than five words as the host.
        host = next(
            (pos for pos, sentence in enumerate(sentences) if len(sentence.split()) > 5),
            None,
        )
        if host is not None:
            sentences[host] = f"{sentences[host].rstrip('.')} including {kw}."
        elif sentences:
            # No sentence is long enough: tack a short clause onto the last one.
            sentences[-1] += f" {kw} is considered."
    return " ".join(sentences)

def enhance_with_pegasus(text):
    """Rewrite `text` with the PEGASUS model and return the cleaned output.

    Args:
        text: Text to rewrite; it is cleaned and truncated to 1024 tokens.

    Returns:
        The first generated sequence, decoded and passed through clean_text.
    """
    encoded = pegasus_tokenizer(
        clean_text(text),
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    # Beam-search settings for the rewrite.
    generation_options = {
        "max_length": 512,
        "min_length": 150,
        "num_beams": 8,
        "length_penalty": 1.0,
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 3,
        "early_stopping": True,
    }
    generated = pegasus_model.generate(**encoded, **generation_options)

    decoded = pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)
    return clean_text(decoded)

def refine_with_flan_t5(intro, abstract):
    """Ask Flan-T5 to condense and refine a generated abstract.

    Args:
        intro: Introduction text; only its first 1000 characters go into the prompt.
        abstract: Abstract produced by the earlier pipeline stages.

    Returns:
        The first generated sequence, decoded and passed through clean_text.
    """
    # The prompt combines the head of the introduction with the draft abstract.
    prompt = f"summarize and refine the following text to make it concise and accurate:\nIntroduction: {intro[:1000]}\nGenerated abstract: {abstract}"
    encoded = flan_t5_tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=600,
    ).to(device)

    generation_options = {
        "max_length": 600,
        "min_length": 150,
        "num_beams": 6,              # wider beam for quality
        "length_penalty": 0.8,       # lean slightly towards shorter output
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
    }
    generated = flan_t5_model.generate(**encoded, **generation_options)

    decoded = flan_t5_tokenizer.decode(generated[0], skip_special_tokens=True)
    return clean_text(decoded)

# === 載入資料 ===
def load_json(file_path):
    """Read a JSON-Lines file and clean every record's introduction.

    Args:
        file_path: Path to a file holding one JSON object per line; each
            object must contain an "introduction" key.

    Returns:
        A list of dicts whose "introduction" value has been passed through
        clean_text.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform
    # locale, and skip blank lines, which json.loads would reject.
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
    return data

def truncate_from_ends_with_importance(introduction, tokenizer, max_length=8192):
    """Encode an introduction, keeping the sentences most similar to the whole text.

    Sentences are scored by cosine similarity between their embedding and the
    embedding of the full introduction, with a positional weight that falls
    linearly from 1.5 (first sentence) to 1.0 (last). Sentences are admitted
    in descending score order until the token budget is used up, then emitted
    in their original document order behind a "Keywords: ..." prefix.

    Args:
        introduction: Cleaned introduction text.
        tokenizer: Hugging Face tokenizer used for length counting and encoding.
        max_length: Length in tokens to which the result is truncated/padded.

    Returns:
        Tuple (input_ids, attention_mask), each padded to `max_length`.
    """
    def _encode(text):
        encoded = tokenizer(text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].squeeze(0), encoded["attention_mask"].squeeze(0)

    # Fragments of ten characters or fewer are discarded.
    sentences = [sent.text.strip() for sent in nlp(introduction).sents if len(sent.text.strip()) > 10]
    if len(sentences) <= 1:
        return _encode(introduction)

    embeddings = embedder.encode(sentences, convert_to_tensor=True)
    intro_embedding = embedder.encode(introduction, convert_to_tensor=True)
    similarities = cosine_similarity(embeddings.cpu().numpy(), intro_embedding.cpu().numpy().reshape(1, -1)).flatten()
    weights = np.linspace(1.5, 1.0, len(sentences))
    scores = similarities * weights

    keywords = extract_keywords(introduction, top_n=5)
    prefix = f"Keywords: {', '.join(keywords)}. "
    # Token budget left for sentences: the prefix is tokenized once here, and
    # two tokens are held back, presumably for the special tokens.
    budget = max_length - len(tokenizer.encode(prefix, add_special_tokens=False)) - 2

    kept_indices = []
    cur_len = 0
    for i in np.argsort(scores)[::-1]:
        sentence_len = len(tokenizer.encode(sentences[i], add_special_tokens=False))
        if cur_len + sentence_len > budget:
            break
        kept_indices.append(int(i))
        cur_len += sentence_len

    # Restore document order so the model reads coherent text rather than
    # sentences shuffled by score.
    selected = [sentences[i] for i in sorted(kept_indices)]
    truncated = prefix + " ".join(selected)
    return _encode(truncated)

class TestPaperDataset(Dataset):
    """Map-style dataset that encodes test introductions for generation.

    Each record is expected to provide the keys "introduction" and "paper_id".
    """

    def __init__(self, data, tokenizer):
        self.data, self.tokenizer = data, tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return encoded inputs plus the id and cleaned introduction of record `idx`."""
        record = self.data[idx]
        introduction = record["introduction"]
        encoded_ids, encoded_mask = truncate_from_ends_with_importance(introduction, self.tokenizer)

        sample = {}
        sample["input_ids"] = encoded_ids
        sample["attention_mask"] = encoded_mask
        sample["paper_id"] = record["paper_id"]
        sample["introduction"] = introduction
        return sample

# === Run inference ===
test_data = load_json("test.json")
test_dataset = TestPaperDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1)

submission = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Defensive: make sure a batch dimension is present.
        input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids
        attention_mask = attention_mask.unsqueeze(0) if attention_mask.dim() == 1 else attention_mask

        paper_id_raw = batch["paper_id"][0]
        # Take the first run of digits from the id's string form. Fall back to
        # the raw string when there are no digits, instead of raising
        # AttributeError on `None.group()`.
        id_match = re.search(r'\d+', str(paper_id_raw))
        paper_id = id_match.group() if id_match else str(paper_id_raw)
        intro = batch["introduction"][0]

        # Step 1: draft abstract from the LoRA-tuned LED model.
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=10,
            max_length=1000,
            min_length=200,
            length_penalty=1.3,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        abstract = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        abstract = clean_text(abstract)

        # Step 2: PEGASUS rewrite.
        abstract = enhance_with_pegasus(abstract)

        # Step 3: append introduction keywords that are missing from the abstract.
        abstract = ensure_keywords_in_abstract(abstract, intro)

        # Step 4: Flan-T5 refinement.
        abstract = refine_with_flan_t5(intro, abstract)

        submission.append({
            "paper_id": paper_id,
            "abstract": abstract
        })

# One JSON object per line.
with open("submission_led.json", "w") as f:
    for item in submission:
        f.write(json.dumps(item) + "\n")

print(f"Saved {len(submission)} abstracts to submission_led.json")

第二版

In [None]:
import json
import torch
import re
import html
import os
import numpy as np
import spacy
from torch.utils.data import Dataset, DataLoader
from transformers import LEDForConditionalGeneration, LEDTokenizer
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import autocast, GradScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import evaluate

# Load the small English spaCy pipeline.
# NOTE(review): nothing else in this training cell appears to reference `nlp`
# (truncate_full_text_only below does no sentence splitting) - confirm whether
# this load is still needed here.
nlp = spacy.load("en_core_web_sm")

# Clean LaTeX and noise
def clean_text(text):
    """Normalise raw paper text into plain ASCII-friendly prose.

    Decodes HTML entities, folds typographic dashes/quotes to ASCII, removes
    display math, unwraps inline math, drops one- and two-number citations and
    figure/table references, blanks out characters outside a whitelist, and
    collapses whitespace.

    Args:
        text: Raw string.

    Returns:
        The cleaned, stripped string.
    """
    # Decode HTML escape sequences such as &amp;.
    text = html.unescape(text)

    # Fold common Unicode punctuation into ASCII (dashes, smart quotes).
    ascii_equivalents = (
        ('\u2014', '-'),   # em dash
        ('\u2013', '-'),   # en dash
        ('\u201c', '"'),   # left double quote
        ('\u201d', '"'),   # right double quote
        ('\u2018', "'"),   # left single quote
        ('\u2019', "'"),   # right single quote / apostrophe
    )
    for fancy, plain in ascii_equivalents:
        text = text.replace(fancy, plain)

    # Regex passes, applied strictly in this order.
    substitutions = (
        # Drop display math entirely.
        (r'\$\$.*?\$\$', ''),
        # Inline math: keep the formula text, drop the dollar signs.
        (r'\$([^\$]*?)\$', r'\1'),
        # Citations such as [1] or [2, 3].
        (r'\[(\d+|\d+,\s*\d+)\]', ''),
        # Figure and table references.
        (r'(Fig\.|Figure)\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', ''),
        (r'Table\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', ''),
        # Blank out symbols outside the whitelist; hyphens are kept so terms
        # like DP-SGD or self-supervised survive.
        (r'[^a-zA-Z0-9\s.,!?()/%=:\-+<>_\[\]\"\'’]', ' '),
        # Collapse runs of whitespace.
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

# Set device: use CUDA when it is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load LED base model and tokenizer from the Hugging Face hub.
model_name = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(model_name)
model = LEDForConditionalGeneration.from_pretrained(model_name)

# Apply LoRA: wrap the base model with adapters on the attention projections
# (q/k/v/out) and the feed-forward layers (fc1/fc2).
lora_config = LoraConfig(
    r=64,  # adapter rank
    lora_alpha=128,  # LoRA scaling parameter (twice r here)
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
model = get_peft_model(model, lora_config)
model = model.to(device)
# Prints how many parameters are trainable after wrapping.
model.print_trainable_parameters()

# Load dataset
def load_json(file_path):
    """Read a JSON-Lines file and clean its text fields.

    Args:
        file_path: Path to a file holding one JSON object per line. Every
            object must contain "introduction"; "abstract" is optional.

    Returns:
        A list of dicts whose "introduction" (and "abstract", when present)
        values have been passed through clean_text.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform
    # locale, and skip blank lines, which json.loads would reject.
    with open(file_path, "r", encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

# Load both files, then hold out the last 10% of the training file for validation.
train_data = load_json("train.json")
test_data = load_json("test.json")
val_size = int(0.1 * len(train_data))
# Slice with an explicit index: when val_size is 0, `train_data[-0:]` would be
# the whole list and `train_data[:-0]` would be empty.
split_index = len(train_data) - val_size
val_data = train_data[split_index:]
train_data = train_data[:split_index]

# Truncation function
def truncate_full_text_only(introduction, abstract=None, tokenizer=tokenizer, max_model_length=16384, attention_window=1024):
    """Encode the whole introduction, padded to a multiple of the attention window.

    Args:
        introduction: Cleaned introduction text.
        abstract: Accepted for signature compatibility; not used here.
        tokenizer: Hugging Face tokenizer used for length counting and encoding.
        max_model_length: Upper bound on the input length in tokens.
        attention_window: The padded length is rounded up to a multiple of this.

    Returns:
        Tuple (input_ids, attention_mask, max_input_length); both tensors are
        padded/truncated to max_input_length.
    """
    min_length = 128
    raw_token_count = len(tokenizer.encode(introduction, add_special_tokens=True))

    # Clamp to [min_length, max_model_length], then ceil to whole windows.
    clamped_length = min(max_model_length, max(min_length, raw_token_count))
    window_count = -(-clamped_length // attention_window)
    max_input_length = window_count * attention_window

    encoded = tokenizer(
        introduction,
        truncation=True,
        max_length=max_input_length,
        padding="max_length",
        return_tensors="pt",
    )
    return encoded["input_ids"].squeeze(), encoded["attention_mask"].squeeze(), max_input_length

# Dataset definitions
class TrainPaperDataset(Dataset):
    """Map-style dataset that turns (introduction, abstract) records into model inputs.

    Each record is expected to provide the keys "introduction", "abstract"
    and "paper_id".
    """

    def __init__(self, data, tokenizer, max_target_length=800):
        self.data, self.tokenizer, self.max_target_length = data, tokenizer, max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded source, masked labels and metadata for record `idx`."""
        record = self.data[idx]
        reference = record["abstract"]

        source_ids, source_mask, padded_length = truncate_full_text_only(
            record["introduction"], reference, self.tokenizer
        )

        encoded_target = self.tokenizer(
            reference,
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        label_ids = encoded_target["input_ids"].squeeze()
        # Padding positions are set to -100 so they can be ignored by the loss.
        is_padding = label_ids == self.tokenizer.pad_token_id
        label_ids[is_padding] = -100

        sample = {}
        sample["input_ids"] = source_ids
        sample["attention_mask"] = source_mask
        sample["labels"] = label_ids
        sample["paper_id"] = record["paper_id"]
        sample["abstract"] = reference
        sample["max_input_length"] = padded_length
        return sample

# Collate function

def custom_collate_fn(batch):
    """Combine dataset samples into one batch, right-padding inputs with zeros.

    Args:
        batch: List of sample dicts with the keys "input_ids",
            "attention_mask", "labels", "paper_id", "abstract" and
            "max_input_length".

    Returns:
        Dict with 2-D long tensors "input_ids" and "attention_mask" (width =
        largest "max_input_length" in the batch), stacked "labels", and plain
        lists for "paper_id", "abstract" and "max_input_length".
    """
    batch_size = len(batch)
    padded_lengths = [sample["max_input_length"] for sample in batch]
    width = max(padded_lengths)

    # Zero-filled canvases; cells beyond a sample's own length stay 0.
    ids_canvas = torch.zeros((batch_size, width), dtype=torch.long)
    mask_canvas = torch.zeros((batch_size, width), dtype=torch.long)
    for row, sample in enumerate(batch):
        sample_ids = sample["input_ids"]
        used = sample_ids.size(0)
        ids_canvas[row, :used] = sample_ids
        mask_canvas[row, :used] = sample["attention_mask"]

    stacked_labels = torch.stack([sample["labels"] for sample in batch])

    return {
        "input_ids": ids_canvas,
        "attention_mask": mask_canvas,
        "labels": stacked_labels,
        "paper_id": [sample["paper_id"] for sample in batch],
        "abstract": [sample["abstract"] for sample in batch],
        "max_input_length": padded_lengths,
    }

# Dataloader
# Both loaders use batch_size=2; only the training loader is shuffled.
train_dataset = TrainPaperDataset(train_data, tokenizer)
val_dataset = TrainPaperDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=custom_collate_fn)
gradient_accumulation_steps = 2  # intended number of batches per optimizer step

# Training settings
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 200
eval_frequency = 10  # generation-based metrics are computed every 10th epoch
scaler = GradScaler()  # loss scaler used with the autocast forward passes

# Warm-up multiplier that rises linearly from 1/20 to 1.0 over 20 scheduler steps.
# NOTE(review): both schedulers below are attached to the same optimizer and are
# each stepped once per epoch by the training loop. LambdaLR recomputes the lr
# from the base lr on every step, so it is worth confirming that the cosine
# decay actually accumulates; torch.optim.lr_scheduler.SequentialLR is the usual
# way to chain warm-up and decay. Also note T_max=280 exceeds num_epochs=200.
warm_up_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: min((epoch + 1) / 20, 1.0))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=280, eta_min=1e-6)

checkpoint_dir = "led-arxiv_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Early-stopping state: the validation loss is averaged over the last
# `window_size` epochs, and training stops after `patience` epochs without a
# new best average.
best_val_loss = float('inf')
val_loss_window = []
window_size = 3
patience_counter = 0
patience = 100

# Reference scores that are printed next to each evaluation result.
baseline_scores = {"rouge1": 0.47, "rouge2": 0.12, "rougeL": 0.22, "bertscore_f1": 0.85}
best_rouge1 = 0.0
best_rouge2 = 0.0
best_rougeL = 0.0
best_bertscore_f1 = 0.0

metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bertscore = evaluate.load("bertscore")

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0
    optimizer.zero_grad()
    num_batches = len(train_loader)
    for i, batch in enumerate(tqdm(train_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
        total_loss += loss.item()
        # Honour `gradient_accumulation_steps`: divide the loss so the summed
        # gradient is an average over the group, and only step the optimizer
        # once per group (or at the last batch of the epoch).
        scaler.scale(loss / gradient_accumulation_steps).backward()
        if (i + 1) % gradient_accumulation_steps == 0 or (i + 1) == num_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f"Average Training Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss:.4f}")
    # Smooth the validation loss over the last `window_size` epochs before
    # comparing it with the best value seen so far.
    val_loss_window.append(val_loss)
    if len(val_loss_window) > window_size:
        val_loss_window.pop(0)
    smoothed_val_loss = sum(val_loss_window) / len(val_loss_window)

    if smoothed_val_loss < best_val_loss:
        best_val_loss = smoothed_val_loss
        patience_counter = 0
        model.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora"))
        tokenizer.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora"))
        print("Best model saved.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch + 1}")
            break

    # === ROUGE/BERTScore Evaluation every `eval_frequency` ===
    if (epoch + 1) % eval_frequency == 0:
        model.eval()
        predicted_abstracts = []
        reference_abstracts = []

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)

                # NOTE(review): repetition_penalty below 1.0 rewards repeated
                # tokens rather than penalising them - confirm 0.9 is intended.
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=800,
                    min_length=100,
                    num_beams=15,
                    length_penalty=1.0,
                    repetition_penalty=0.9,
                    early_stopping=True,
                    no_repeat_ngram_size=3
                )
                # The validation loader yields more than one sample per batch;
                # decode every generated sequence and keep every reference so
                # no sample is silently dropped from the metrics.
                predicted_abstracts.extend(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
                reference_abstracts.extend(batch["abstract"])

        rouge_scores = metric_rouge.compute(predictions=predicted_abstracts, references=reference_abstracts, use_stemmer=True)
        bert_scores = metric_bertscore.compute(predictions=predicted_abstracts, references=reference_abstracts, lang="en")

        current_rouge1 = rouge_scores['rouge1']
        current_rouge2 = rouge_scores['rouge2']
        current_rougeL = rouge_scores['rougeL']
        current_bertscore_f1 = np.mean(bert_scores['f1'])

        print("\n=== Validation Evaluation Results ===")
        print(f"ROUGE-1: {current_rouge1:.4f} (Baseline: {baseline_scores['rouge1']:.4f})")
        print(f"ROUGE-2: {current_rouge2:.4f} (Baseline: {baseline_scores['rouge2']:.4f})")
        print(f"ROUGE-L: {current_rougeL:.4f} (Baseline: {baseline_scores['rougeL']:.4f})")
        print(f"BERTScore F1: {current_bertscore_f1:.4f} (Baseline: {baseline_scores['bertscore_f1']:.4f})")

        # The "best" metrics are tracked together, keyed on ROUGE-1.
        if current_rouge1 > best_rouge1:
            best_rouge1 = current_rouge1
            best_rouge2 = current_rouge2
            best_rougeL = current_rougeL
            best_bertscore_f1 = current_bertscore_f1
            print(f"New best ROUGE-1: {best_rouge1:.4f}")

    warm_up_scheduler.step()
    scheduler.step()
    model.train()

# Save final model (the adapter state at the last completed epoch, which is
# not necessarily the best checkpoint saved above).
model.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora_final"))
tokenizer.save_pretrained(os.path.join(checkpoint_dir, "led-arxiv_lora_final"))
print("Final model saved.")

In [None]:
import json
import torch
import re
import html
import numpy as np
import os
import spacy
from tqdm import tqdm
from transformers import LEDForConditionalGeneration, LEDTokenizer, PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from torch.utils.data import Dataset, DataLoader
from peft import PeftModel

# === Settings ===
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Load models ===
# LED base weights plus the LoRA adapter stored under the training checkpoint directory.
base_model = "allenai/led-base-16384"
tokenizer = LEDTokenizer.from_pretrained(base_model)
model = LEDForConditionalGeneration.from_pretrained(base_model)
model = PeftModel.from_pretrained(model, "led-arxiv_checkpoints/led-arxiv_lora")
model = model.to(device).eval()

# PEGASUS and Flan-T5 checkpoints, presumably for post-processing helpers
# defined later in this cell - verify against the rest of the cell.
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large").to(device).eval()

flan_t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
flan_t5_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl").to(device).eval()

# spaCy pipeline (used for sentence splitting in ensure_keywords_in_abstract)
# and a sentence-embedding model.
nlp = spacy.load("en_core_web_sm")
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# === 工具函數 ===
def clean_text(text):
    """Normalize raw paper text before tokenization.

    Steps, in order: decode HTML entities, map common Unicode punctuation to
    ASCII, strip LaTeX math delimiters, drop numeric citation markers, drop
    figure/table references, replace characters outside a small whitelist with
    spaces, and collapse whitespace.

    Args:
        text: Raw text (introduction or abstract).

    Returns:
        The cleaned, single-line string.
    """
    # Decode HTML entities such as "&amp;".
    text = html.unescape(text)

    # Replace common Unicode punctuation with ASCII equivalents.
    text = text.replace('\u2014', '-')      # em dash
    text = text.replace('\u2013', '-')      # en dash
    text = text.replace('\u201c', '"').replace('\u201d', '"')  # quotes
    text = text.replace('\u2018', "'").replace('\u2019', "'")  # apostrophes

    # Remove display math entirely; keep the inner text of inline math.
    text = re.sub(r'\$\$.*?\$\$', '', text)
    text = re.sub(r'\$([^\$]*?)\$', r'\1', text)

    # Remove numeric citations of any length: [1], [2, 3], [4, 5, 6], ...
    # (the previous pattern only matched one or two numbers).
    text = re.sub(r'\[\d+(?:\s*,\s*\d+)*\]', '', text)

    # Remove figure/table references.
    text = re.sub(r'(Fig\.|Figure)\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', '', text)
    text = re.sub(r'Table\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', '', text)

    # Keep "-" so terms like DP-SGD or self-supervised survive; replace every
    # character outside the whitelist with a space.
    text = re.sub(r'[^a-zA-Z0-9\s.,!?()/%=:\-+<>_\[\]\"\'’]', ' ', text)

    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def extract_keywords(text, top_n=15):
    """Return up to ``top_n`` highest-scoring TF-IDF terms of ``text``.

    The vectorizer is fitted on this single document with English stop words
    removed and at most 100 features. Terms of length <= 2 are dropped after
    ranking, so fewer than ``top_n`` terms may be returned.

    Args:
        text: Document to extract keywords from.
        top_n: Maximum number of ranked terms to consider.

    Returns:
        A list of keyword strings, best first. Empty when ``top_n`` is not
        positive or when the text yields no vocabulary.
    """
    # `scores.argsort()[-0:]` would return *every* term, so handle it up front.
    if top_n <= 0 or not text or not text.strip():
        return []

    tfidf = TfidfVectorizer(stop_words="english", max_features=100)
    try:
        matrix = tfidf.fit_transform([text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when the text
        # contains only stop words / non-token characters.
        return []

    scores = matrix.toarray()[0]
    feature_names = tfidf.get_feature_names_out()
    top_indices = scores.argsort()[-top_n:][::-1]
    return [feature_names[i] for i in top_indices if len(feature_names[i]) > 2]

def ensure_keywords_in_abstract(abstract, intro, top_n=15):
    """Append introduction keywords that the abstract does not mention.

    Keywords come from ``extract_keywords(intro, top_n)``. The abstract is
    cleaned and split into sentences; at most five missing keywords are then
    attached to the first sentence longer than five words, or to the final
    sentence when no such sentence exists.

    Returns:
        The cleaned abstract unchanged when nothing is missing, otherwise the
        sentences re-joined with single spaces.
    """
    candidate_terms = extract_keywords(intro, top_n)
    abstract = clean_text(abstract)
    parts = [span.text for span in nlp(abstract).sents]
    lowered_abstract = abstract.lower()
    absent = [term for term in candidate_terms if term.lower() not in lowered_abstract]
    if len(absent) == 0:
        return abstract

    last_pos = len(parts) - 1
    for term in absent[:5]:
        pos = 0
        while pos <= last_pos:
            current = parts[pos]
            if len(current.split()) > 5:
                # Long enough sentence: splice the keyword in before the period.
                parts[pos] = f"{current.rstrip('.')} including {term}."
                break
            if pos == last_pos:
                # No suitable sentence found: tack a short clause onto the end.
                parts[pos] += f" {term} is considered."
            pos += 1
    return " ".join(parts)

def enhance_with_pegasus(text):
    """Re-summarize ``text`` with the PEGASUS model and return the cleaned result.

    The input is cleaned, tokenized (truncated to 1024 tokens), run through
    beam search, decoded without special tokens and cleaned again.
    """
    generation_kwargs = dict(
        max_length=700,
        min_length=150,
        num_beams=8,
        length_penalty=1.0,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
        early_stopping=True,
    )
    encoded = pegasus_tokenizer(
        clean_text(text), return_tensors="pt", truncation=True, max_length=1024
    ).to(device)
    summary_ids = pegasus_model.generate(**encoded, **generation_kwargs)
    decoded = pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return clean_text(decoded)

def refine_with_flan_t5(intro, abstract):
    """Ask Flan-T5 to rewrite ``abstract`` using the start of ``intro`` as context.

    Only the first 1000 characters of the introduction go into the prompt, and
    the tokenized prompt is truncated to 600 tokens. The decoded output is
    passed through ``clean_text`` before being returned.
    """
    prompt = f"summarize and refine the following text to make it concise and accurate:\nIntroduction: {intro[:1000]}\nGenerated abstract: {abstract}"
    decoding = {
        "max_length": 600,
        "min_length": 150,
        "num_beams": 6,              # wider beam than the default
        "length_penalty": 0.8,       # values below 1.0 lean towards shorter output
        "repetition_penalty": 1.2,
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
    }
    encoded = flan_t5_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=600).to(device)
    result_ids = flan_t5_model.generate(**encoded, **decoding)
    return clean_text(flan_t5_tokenizer.decode(result_ids[0], skip_special_tokens=True))

# === 載入資料 ===
def load_json(file_path):
    """Load a JSON-Lines file and clean each record's ``introduction``.

    Args:
        file_path: Path to a file with one JSON object per line.

    Returns:
        A list of dicts whose ``"introduction"`` value has been passed through
        ``clean_text``.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
    return data

def truncate_from_ends_with_importance(introduction, tokenizer, max_length=8192):
    """Build a length-bounded model input from the most representative sentences.

    The introduction is split into sentences with spaCy (sentences whose
    stripped text is 10 characters or shorter are ignored). Each sentence is
    scored by cosine similarity between its embedding and the embedding of the
    whole introduction, multiplied by a position weight that decreases
    linearly from 1.5 (first sentence) to 1.0 (last sentence). Sentences are
    then taken in descending score order until the token budget is reached,
    and the result is prefixed with the top-5 TF-IDF keywords.

    Args:
        introduction: Cleaned introduction text.
        tokenizer: Callable tokenizer that also provides ``encode``.
        max_length: Length the encoded output is truncated/padded to.

    Returns:
        ``(input_ids, attention_mask)`` with the leading dimension squeezed
        away (assumes the tokenizer returns tensors shaped (1, max_length) --
        TODO confirm).
    """
    doc = nlp(introduction)
    sentences = [sent.text.strip() for sent in doc.sents if len(sent.text.strip()) > 10]
    # With zero or one usable sentence there is nothing to rank: encode as-is.
    if len(sentences) <= 1:
        encoded = tokenizer(introduction, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        return encoded["input_ids"].squeeze(0), encoded["attention_mask"].squeeze(0)

    embeddings = embedder.encode(sentences, convert_to_tensor=True)
    intro_embedding = embedder.encode(introduction, convert_to_tensor=True)

    # Position prior: earlier sentences get a larger multiplier.
    weights = np.linspace(1.5, 1.0, len(sentences))
    scores = torch.nn.functional.cosine_similarity(embeddings, intro_embedding.unsqueeze(0), dim=1).cpu().numpy() * weights

    keywords = extract_keywords(introduction, top_n=5)
    sorted_idx = np.argsort(scores)[::-1]
    selected = []
    cur_len = 0
    for i in sorted_idx:
        t = tokenizer.encode(sentences[i], add_special_tokens=False)
        if cur_len + len(t) <= max_length - 20:  # reserve room for the keywords prefix
            selected.append(sentences[i])
            cur_len += len(t)
        else:
            # Stops at the first sentence that does not fit; lower-ranked
            # sentences that might still fit are not considered.
            break

    # NOTE(review): `selected` is in score order, not original document order --
    # confirm this matches how training inputs were built.
    truncated = f"Keywords: {', '.join(keywords)}. {' '.join(selected)}"
    encoded = tokenizer(truncated, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
    return encoded["input_ids"].squeeze(0), encoded["attention_mask"].squeeze(0)


class TestPaperDataset(Dataset):
    """Dataset over test records that yields model-ready inputs per paper.

    Each sample carries the encoded introduction (via
    ``truncate_from_ends_with_importance``) together with the record's
    ``paper_id`` and the introduction text itself.
    """

    def __init__(self, data, tokenizer):
        self.data, self.tokenizer = data, tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        intro_text = record["introduction"]
        ids, mask = truncate_from_ends_with_importance(intro_text, self.tokenizer)
        sample = {"input_ids": ids, "attention_mask": mask}
        sample["paper_id"] = record["paper_id"]
        sample["introduction"] = intro_text
        return sample

# === Run inference ===
# Pipeline per paper: LED draft -> optional PEGASUS pass -> keyword injection
# -> optional Flan-T5 refinement -> append to the submission list.
test_data = load_json("test.json")
test_dataset = TestPaperDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1)

submission = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Defensive: add a batch dimension if a 1-D tensor ever arrives.
        input_ids = input_ids.unsqueeze(0) if input_ids.dim() == 1 else input_ids
        attention_mask = attention_mask.unsqueeze(0) if attention_mask.dim() == 1 else attention_mask

        paper_id_raw = batch["paper_id"][0]
        # Keep only the first run of digits of the collated id.
        # NOTE: re.search returns None when the id has no digits, in which case
        # .group() raises AttributeError.
        paper_id = re.search(r'\d+', str(paper_id_raw)).group()
        intro = batch["introduction"][0]

        # === LED generates the first draft ===
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=6,
            max_length=800,
            min_length=150,
            length_penalty=1.0,
            repetition_penalty=1.0,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
        abstract = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        abstract = clean_text(abstract)

        # === Conditionally trigger the PEGASUS pass ===
        # Runs when the draft has fewer than 150 words OR lacks the phrase "our method".
        if len(abstract.split()) < 150 or 'our method' not in abstract.lower():
            abstract = enhance_with_pegasus(abstract)

        # === Inject missing keywords ===
        abstract = ensure_keywords_in_abstract(abstract, intro)

        # === Decide on Flan-T5 refinement from semantic similarity / missing keywords ===
        intro_embed = embedder.encode(intro, convert_to_tensor=True)
        abs_embed = embedder.encode(abstract, convert_to_tensor=True)
        similarity = torch.nn.functional.cosine_similarity(intro_embed, abs_embed, dim=0).item()

        # Any of the top-5 introduction keywords still absent from the abstract.
        missing_keywords = [kw for kw in extract_keywords(intro)[:5] if kw not in abstract.lower()]

        if similarity < 0.6 or len(missing_keywords) > 0:
            abstract = refine_with_flan_t5(intro, abstract)

        submission.append({
            "paper_id": paper_id,
            "abstract": abstract
        })

# Write the submission as JSON Lines (one object per line).
with open("submission_led.json", "w") as f:
    for item in submission:
        f.write(json.dumps(item) + "\n")

print(f"Saved {len(submission)} abstracts to submission_led.json")

# LongT5

In [None]:
import os
import json
import torch
import re
import html
import numpy as np
import spacy
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, LongT5ForConditionalGeneration
from peft import LoraConfig, get_peft_model
from torch.cuda.amp import autocast, GradScaler
import evaluate

# === Basic settings ===
# Use the GPU when available; load the small English spaCy pipeline.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
nlp = spacy.load("en_core_web_sm")

# === 清理文字 ===
def clean_text(text):
    """Normalize raw paper text before tokenization.

    Decodes HTML entities, maps Unicode dashes/quotes to ASCII, strips LaTeX
    math delimiters, removes numeric citation markers and figure/table
    references, replaces characters outside a small whitelist with spaces and
    collapses whitespace.

    Args:
        text: Raw text (introduction or abstract).

    Returns:
        The cleaned, single-line string.
    """
    text = html.unescape(text)
    # Unicode punctuation -> ASCII.
    text = text.replace('\u2014', '-')
    text = text.replace('\u2013', '-')
    text = text.replace('\u201c', '"').replace('\u201d', '"')
    text = text.replace('\u2018', "'").replace('\u2019', "'")
    # Drop display math; keep the inner text of inline math.
    text = re.sub(r'\$\$.*?\$\$', '', text)
    text = re.sub(r'\$([^\$]*?)\$', r'\1', text)
    # Remove numeric citations of any length: [1], [2, 3], [4, 5, 6], ...
    # (the previous pattern only matched one or two numbers).
    text = re.sub(r'\[\d+(?:\s*,\s*\d+)*\]', '', text)
    # Remove figure/table references.
    text = re.sub(r'(Fig\.|Figure)\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', '', text)
    text = re.sub(r'Table\s*\d+[a-zA-Z]?(.*?)?(\.|\s|$)', '', text)
    # Replace everything outside the whitelist with a space.
    text = re.sub(r'[^a-zA-Z0-9\s.,!?()/%=:\-+<>_\[\]"\'’]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# === Load model and tokenizer ===
model_name = "google/long-t5-tglobal-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = LongT5ForConditionalGeneration.from_pretrained(model_name)

# LoRA adapter configuration: rank 64 with alpha 64, applied to the modules
# named q/k/v/o/wi/wo, with 5% dropout on the adapter path and no bias training.
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules=["q", "k", "v", "o", "wi", "wo"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
# Wrap the base model with the adapter and move it to the selected device.
model = get_peft_model(model, lora_config).to(device)
model.print_trainable_parameters()

# === 載入資料集 ===
def load_json(file_path):
    """Load a JSON-Lines file and clean its text fields.

    Args:
        file_path: Path to a file with one JSON object per line.

    Returns:
        A list of dicts. ``"introduction"`` is always cleaned with
        ``clean_text``; ``"abstract"`` is cleaned when the record has one.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

train_data = load_json("train.json")
test_data = load_json("test.json")
# Hold out the last 10% of the training records for validation.
val_size = int(0.1 * len(train_data))
# Slice by an explicit index: with val_size == 0, `train_data[-0:]` would put
# every record in validation and leave the training set empty.
split_index = len(train_data) - val_size
val_data = train_data[split_index:]
train_data = train_data[:split_index]

# === Tokenize function ===
def tokenize_inputs(intro, tokenizer, max_length=7000):
    """Encode ``intro`` to a fixed length and return ``(input_ids, attention_mask)``.

    The tokenizer is asked for PyTorch tensors padded/truncated to
    ``max_length``; the leading dimension of each tensor is squeezed away.
    """
    encoded = tokenizer(
        intro,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids, mask = (encoded[key].squeeze(0) for key in ("input_ids", "attention_mask"))
    return ids, mask

# === Dataset 定義 ===
class PaperDataset(Dataset):
    """Seq2seq training dataset mapping introductions to abstracts.

    Each sample holds the encoded introduction, label ids for the abstract
    (padding positions set to -100), plus the record's ``paper_id`` and the
    abstract text.
    """

    def __init__(self, data, tokenizer, max_target_length=712):
        self.data, self.tokenizer = data, tokenizer
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        source_ids, source_mask = tokenize_inputs(record["introduction"], self.tokenizer)
        encoded_target = self.tokenizer(
            record["abstract"],
            max_length=self.max_target_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        target_ids = encoded_target["input_ids"].squeeze(0)
        # Replace padding ids with -100 in the labels.
        target_ids = target_ids.masked_fill(target_ids == self.tokenizer.pad_token_id, -100)
        sample = {"input_ids": source_ids, "attention_mask": source_mask, "labels": target_ids}
        sample["paper_id"] = record["paper_id"]
        sample["abstract"] = record["abstract"]
        return sample

# === Collate function ===
def collate_fn(batch):
    """Collate dataset samples into one batch dict.

    Tensor fields (``input_ids``, ``attention_mask``, ``labels``) are stacked
    along a new first dimension; ``abstract`` and ``paper_id`` are gathered
    into plain Python lists.
    """
    def _stacked(key):
        return torch.stack([sample[key] for sample in batch])

    collated = {name: _stacked(name) for name in ("input_ids", "attention_mask", "labels")}
    collated["abstract"] = [sample["abstract"] for sample in batch]
    collated["paper_id"] = [sample["paper_id"] for sample in batch]
    return collated

# One sample per batch; only the training loader shuffles.
train_loader = DataLoader(PaperDataset(train_data, tokenizer), batch_size=1, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(PaperDataset(val_data, tokenizer), batch_size=1, shuffle=False, collate_fn=collate_fn)

# === Training settings ===
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scaler = GradScaler()  # loss scaling for mixed-precision training
num_epochs = 200
gradient_accumulation_steps = 10  # with batch_size=1 this gives an effective batch of 10

# Cosine learning-rate decay down to 1e-6 over `num_epochs` scheduler steps.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=1e-6)
metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bertscore = evaluate.load("bertscore")

# Checkpoints are written below this directory; best ROUGE-1 seen so far starts at 0.
checkpoint_dir = "longt5-arxiv_lora"
os.makedirs(checkpoint_dir, exist_ok=True)
best_rouge1 = 0.0

# === Training loop ===
# Each epoch: train with gradient accumulation under autocast, step the LR
# scheduler, generate abstracts for the validation set, score them with
# ROUGE / BERTScore, and keep the checkpoint with the best ROUGE-1.
model.train()
num_batches = len(train_loader)
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    optimizer.zero_grad()
    total_loss = 0

    for step, batch in enumerate(tqdm(train_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            # Scale so the accumulated gradient is an average over the window.
            loss = outputs.loss / gradient_accumulation_steps
        scaler.scale(loss).backward()
        # Track the un-divided loss so the reported epoch average is the real
        # per-batch loss rather than loss / gradient_accumulation_steps.
        total_loss += outputs.loss.item()

        # Step at the end of every accumulation window, and also on the last
        # batch so gradients of a final partial window are not thrown away by
        # the next epoch's zero_grad(). (A partial window is still divided by
        # the full accumulation count, i.e. it takes a slightly smaller step.)
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

    scheduler.step()
    print(f"Train Loss: {total_loss/max(num_batches, 1):.4f}")

    # Validation phase
    model.eval()
    predictions, references, paper_ids = [], [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            reference = batch["abstract"][0]
            pid = batch["paper_id"][0]

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=1024,
                min_length=200,
                num_beams=8, 
                length_penalty=1.2,
                no_repeat_ngram_size=3,
                early_stopping=True
            )
            pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
            predictions.append(pred)
            references.append(reference)
            paper_ids.append(pid)

    rouge = metric_rouge.compute(predictions=predictions, references=references, use_stemmer=True)
    bert = metric_bertscore.compute(predictions=predictions, references=references, lang="en")
    rouge1 = rouge["rouge1"]
    print(f"ROUGE-1: {rouge1:.4f}, BERTScore F1: {np.mean(bert['f1']):.4f}")

    # Keep the adapter/tokenizer with the best validation ROUGE-1 so far.
    if rouge1 > best_rouge1:
        best_rouge1 = rouge1
        model.save_pretrained(os.path.join(checkpoint_dir, "best"))
        tokenizer.save_pretrained(os.path.join(checkpoint_dir, "best"))
        print("Best model saved.")

    model.train()

# Final save after the last epoch
model.save_pretrained(os.path.join(checkpoint_dir, "final"))
tokenizer.save_pretrained(os.path.join(checkpoint_dir, "final"))
print("Training completed and model saved.")


# 方向2、Large Language Model

英文文本：LLaMA-2 13B + Axolotl + PEFT、Mamba、Gemma

中文文本：Qwen2 + QLoRA

# Gemma

In [None]:
pip install torch==2.1.2 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
pip install transformers peft accelerate evaluate

In [None]:
pip install bitsandbytes -f https://huggingface.github.io/bitsandbytes-packages/torch211_cu118.html

In [None]:
pip install spacy nltk rouge-score bert-score

In [None]:
import nltk
nltk.download('punkt')  # 用於斷句和 tokenize

In [None]:
pip install absl-py

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import json
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
from transformers import default_data_collator
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import spacy
import os
import evaluate
from torch.cuda.amp import autocast, GradScaler

# Load spacy model (small English pipeline).
# NOTE(review): `nlp` is not referenced anywhere else in this cell.
nlp = spacy.load("en_core_web_sm")

# Clean LaTeX and noise
def clean_text(text):
    """Strip LaTeX, citations, figure/table mentions and unusual symbols.

    The substitutions run in the listed order; the result has whitespace
    collapsed to single spaces and is stripped at both ends.
    """
    substitutions = (
        (r'\\[a-zA-Z]+\{.*?\}', ''),                   # LaTeX commands
        (r'\$\$.*?\$\$', ''),                          # $$ math $$
        (r'\$.*?\$', ''),                              # $math$
        (r'\[\d+(,\s*\d+)*\]', ''),                    # [1], [1, 2]
        (r'(Fig\.|Figure)\s*\d+.*?(?=\.|$)', ''),      # figure mention up to the next period
        (r'Table\s*\d+.*?(?=\.|$)', ''),               # table mention up to the next period
        (r'[^a-zA-Z0-9\s.,;:!?()\-]', ' '),            # anything outside the whitelist
        (r'\s+', ' '),                                 # collapse whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model/tokenizer with QLoRA
model_name = "google/gemma-7b-it"
# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# device_map="auto" lets the library decide where the quantized weights live,
# so the model is not moved with .to(device) afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Optimized LoRA config: rank 16 / alpha 64 on the attention projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the quantized base model with the trainable adapter.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load dataset
def load_json(file_path):
    """Load a JSON-Lines file and clean its text fields.

    Args:
        file_path: Path to a file with one JSON object per line.

    Returns:
        A list of dicts. ``"introduction"`` is always cleaned with
        ``clean_text``; ``"abstract"`` is cleaned when the record has one.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

# Custom Dataset
class TrainPaperDataset(Dataset):
    """Causal-LM training samples: instruction prompt + introduction -> abstract.

    Each sample is the tokenized ``prompt + " " + abstract + EOS`` sequence.
    Labels equal the input ids with the prompt positions set to -100 so the
    loss only covers the abstract (and the end-of-sequence token).

    Args:
        data: List of dicts with ``introduction``, ``abstract`` and ``paper_id``.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        max_length: Maximum total sequence length; longer sequences are truncated.
    """

    def __init__(self, data, tokenizer, max_length=8192):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        introduction = item["introduction"]
        abstract = item["abstract"]

        # Enhanced CoT-style prompt
        input_text = f"""
You are a scientific writing assistant trained to write high-quality research abstracts.
Your task is to analyze the given introduction of a computer science or artificial intelligence paper and generate a clear, structured, and academic abstract.

Use professional and academic language. Maintain coherence and conciseness. The abstract should be approximately 150–300 words.

[Introduction]
{introduction}

[Abstract]"""
        # Terminate the target with EOS so the model learns where an abstract
        # ends; without it generation has no trained stopping point.
        eos_text = getattr(self.tokenizer, "eos_token", None) or ""
        full_text = input_text + f" {abstract}" + eos_text

        tokens = self.tokenizer(full_text, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")
        # squeeze(0) removes only the batch dimension (a bare squeeze() would
        # also collapse a length-1 sequence to a scalar).
        input_ids = tokens["input_ids"].squeeze(0)
        attention_mask = tokens["attention_mask"].squeeze(0)

        # Mask the prompt positions so only the abstract contributes to the loss.
        prompt_len = self.tokenizer(input_text, return_tensors="pt")["input_ids"].size(1)
        labels = input_ids.clone()
        labels[:prompt_len] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "paper_id": item["paper_id"],
            "abstract": abstract,
        }

def custom_collate_fn(batch):
    """Collate samples while keeping the non-tensor fields.

    ``input_ids``, ``attention_mask`` and ``labels`` are stacked into batch
    tensors; ``paper_id`` and ``abstract`` are returned as Python lists.
    """
    def _stacked(key):
        return torch.stack([sample[key] for sample in batch])

    collated = {name: _stacked(name) for name in ("input_ids", "attention_mask", "labels")}
    reference_texts = [sample["abstract"] for sample in batch]
    collated["paper_id"] = [sample["paper_id"] for sample in batch]
    collated["abstract"] = reference_texts
    return collated
    
# Load the data and split it into training / validation sets
train_data = load_json("train.json")   # training records, one JSON object per line
val_size = int(0.2 * len(train_data))  # hold out the last 20% for validation
# Slice by an explicit index: with val_size == 0, `train_data[-0:]` would put
# every record in validation and leave the training set empty.
split_index = len(train_data) - val_size
val_data = train_data[split_index:]
train_data = train_data[:split_index]

test_data = load_json("test.json")


train_dataset = TrainPaperDataset(train_data, tokenizer)
val_dataset = TrainPaperDataset(val_data, tokenizer)
# Use custom_collate_fn: transformers' default_data_collator silently drops
# string fields, so batch["abstract"] would raise KeyError during validation.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)

# Generation config with stop token handling.
# Bound the number of *new* tokens: for a decoder-only model `max_length`
# counts the prompt as well, and the prompts here (instructions + full
# introduction) are far longer than 800 tokens.
generation_config = GenerationConfig(
    max_new_tokens=800,
    min_length=150,
    num_beams=5,
    length_penalty=1.0,
    repetition_penalty=1.1,
    no_repeat_ngram_size=2,
    early_stopping=True,
    eos_token_id=tokenizer.eos_token_id
)

# Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=1e-2)
scaler = GradScaler()

baseline_scores = {"rouge1": 0.47, "rouge2": 0.12, "rougeL": 0.22, "bertscore_f1": 0.85}
checkpoint_dir = "gemma3-12b_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Early stopping: stop after `patience` consecutive epochs without improvement.
patience = 30
patience_counter = 0
best_val_loss = float('inf')
best_rouge1 = 0.0

# Load the metrics once instead of on every epoch.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

for epoch in range(100):
    print(f"Epoch {epoch+1}")
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Training Loss: {avg_loss:.4f}")

    # Validation
    model.eval()
    val_loss = 0
    predicted_abstracts = []
    reference_abstracts = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

            # Generate from the prompt only. `input_ids` also contains the
            # reference abstract, so feeding it whole would leak the answer.
            # The dataset marks prompt positions with -100 in `labels`.
            prompt_len = max(int((labels[0] == -100).sum().item()), 1)
            gen_input = input_ids[:, :prompt_len]
            generated_ids = model.generate(
                input_ids=gen_input,
                attention_mask=torch.ones_like(gen_input),
                generation_config=generation_config,
            )
            # Decode only the continuation; the prompt uses "[Abstract]" as
            # its marker, so drop it if the model happens to repeat it.
            new_tokens = generated_ids[0][gen_input.shape[1]:]
            decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
            generated = decoded.split("[Abstract]")[-1].strip()
            predicted_abstracts.append(generated)

            if "abstract" in batch:
                reference = batch["abstract"][0]
            else:
                # Collators that drop string fields: rebuild the reference
                # text from the unmasked label ids.
                reference = tokenizer.decode(labels[0][labels[0] != -100], skip_special_tokens=True).strip()
            reference_abstracts.append(reference)

    val_loss /= len(val_loader)
    print(f"Validation Loss: {val_loss:.4f}")

    rouge_scores = rouge.compute(predictions=predicted_abstracts, references=reference_abstracts, use_stemmer=True)
    bert_scores = bertscore.compute(predictions=predicted_abstracts, references=reference_abstracts, lang="en")
    current_rouge1 = rouge_scores['rouge1']
    current_bertscore = np.mean(bert_scores['f1'])

    print(f"ROUGE-1: {current_rouge1:.4f}, BERTScore-F1: {current_bertscore:.4f}")

    if val_loss < best_val_loss or current_rouge1 > best_rouge1:
        # Track each best value independently so an improvement in one metric
        # cannot overwrite the other with a worse value.
        best_val_loss = min(best_val_loss, val_loss)
        best_rouge1 = max(best_rouge1, current_rouge1)
        patience_counter = 0
        model.save_pretrained(os.path.join(checkpoint_dir, "best_model"))
        print("Model saved.")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping.")
            break

改

In [None]:
import json
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
from transformers import default_data_collator
from peft import LoraConfig, get_peft_model
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import spacy
import os
import evaluate
from torch.cuda.amp import autocast, GradScaler

# Load spacy model (small English pipeline).
# NOTE(review): `nlp` is not referenced anywhere else in this cell.
nlp = spacy.load("en_core_web_sm")

# Clean LaTeX and noise
def clean_text(text):
    """Strip LaTeX, citations, figure/table mentions and unusual symbols.

    The substitutions run in the listed order; the result has whitespace
    collapsed to single spaces and is stripped at both ends.
    """
    substitutions = (
        (r'\\[a-zA-Z]+\{.*?\}', ''),                   # LaTeX commands
        (r'\$\$.*?\$\$', ''),                          # $$ math $$
        (r'\$.*?\$', ''),                              # $math$
        (r'\[\d+(,\s*\d+)*\]', ''),                    # [1], [1, 2]
        (r'(Fig\.|Figure)\s*\d+.*?(?=\.|$)', ''),      # figure mention up to the next period
        (r'Table\s*\d+.*?(?=\.|$)', ''),               # table mention up to the next period
        (r'[^a-zA-Z0-9\s.,;:!?()\-]', ' '),            # anything outside the whitelist
        (r'\s+', ' '),                                 # collapse whitespace
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model/tokenizer with QLoRA
model_name = "google/gemma-7b-it"
# 4-bit NF4 quantization with double quantization; compute runs in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# device_map="auto" lets the library decide where the quantized weights live,
# so the model is not moved with .to(device) afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Optimized LoRA config: rank 16 / alpha 64 on the attention projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
# Wrap the quantized base model with the trainable adapter.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Load dataset
def load_json(file_path):
    """Load a JSON-Lines file and clean its text fields.

    Args:
        file_path: Path to a file with one JSON object per line.

    Returns:
        A list of dicts. ``"introduction"`` is always cleaned with
        ``clean_text``; ``"abstract"`` is cleaned when the record has one.
    """
    data = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))
    for item in data:
        item["introduction"] = clean_text(item["introduction"])
        if "abstract" in item:
            item["abstract"] = clean_text(item["abstract"])
    return data

# Custom Dataset
class TrainPaperDataset(Dataset):
    """Causal-LM training samples: instruction prompt + introduction -> abstract.

    Every sample is exactly ``max_length`` tokens long and laid out as
    ``[prompt | "[Abstract]" marker | abstract + EOS | padding]``. The abstract
    is encoded first and the introduction is truncated to whatever room is
    left, so a long introduction can no longer push the abstract out of the
    sequence. Labels are -100 on the prompt and on padding, so the loss covers
    only the abstract tokens and the end-of-sequence token.

    Padding is appended on the right by this class, independent of the
    tokenizer's own ``padding_side``.

    Args:
        data: List of dicts with ``introduction``, ``abstract`` and ``paper_id``.
        tokenizer: Callable tokenizer; expected to return ``{"input_ids": [...]}``
            and to expose ``eos_token_id`` / ``pad_token_id``.
        max_length: Total sequence length of every sample.
        max_target_length: Upper bound on the tokens reserved for the abstract
            (including the EOS token).
    """

    def __init__(self, data, tokenizer, max_length=1024, max_target_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text, limit=None, add_special_tokens=True):
        """Return the token ids of ``text`` as a list, truncated to ``limit`` if given."""
        kwargs = {"add_special_tokens": add_special_tokens}
        if limit is not None:
            kwargs.update(truncation=True, max_length=limit)
        return list(self.tokenizer(text, **kwargs)["input_ids"])

    def __getitem__(self, idx):
        item = self.data[idx]
        intro = item["introduction"]
        abstract = item["abstract"]

        # CoT prompt
        # NOTE(review): adjacent literals below join without a space
        # ("paperand", "conciseness.The"). Left untouched so the prompt stays
        # identical to whatever existing checkpoints were trained with.
        prompt_prefix = (
            "You are a scientific writing assistant trained to write high-quality research abstracts.\n"
            "Your task is to analyze the given introduction of a computer science or artificial intelligence paper"
            "and generate a clear, structured, and academic abstract.\n\n"
            "Use professional and academic language. Maintain coherence and conciseness."
            "The abstract should be approximately 150–300 words.\n\n"
            "And learn how to start the first sentence of most abstracts.\n\n"
            "[Introduction]\n"
        )
        prompt_suffix = "\n\n[Abstract]"

        # 1) Target first: abstract tokens followed by EOS, within its budget.
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        target_budget = max(min(self.max_target_length, self.max_length - 1), 1)
        reserve_for_eos = 1 if eos_id is not None else 0
        target_ids = self._encode(
            " " + abstract,
            limit=max(target_budget - reserve_for_eos, 1),
            add_special_tokens=False,
        )
        if eos_id is not None:
            target_ids.append(eos_id)

        # 2) Prompt gets the remaining room; the "[Abstract]" marker is never truncated.
        suffix_ids = self._encode(prompt_suffix, add_special_tokens=False)
        prompt_budget = max(self.max_length - len(target_ids) - len(suffix_ids), 1)
        prompt_ids = self._encode(prompt_prefix + intro, limit=prompt_budget) + suffix_ids

        # 3) Assemble and right-pad to a fixed length.
        sequence = (prompt_ids + target_ids)[: self.max_length]
        prompt_len = min(len(prompt_ids), len(sequence))
        pad_count = self.max_length - len(sequence)
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0

        input_ids = torch.tensor(sequence + [pad_id] * pad_count, dtype=torch.long)
        attention_mask = torch.tensor([1] * len(sequence) + [0] * pad_count, dtype=torch.long)

        # 4) Only the abstract (+EOS) contributes to the loss.
        labels = input_ids.clone()
        labels[:prompt_len] = -100
        labels[len(sequence):] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "paper_id": item["paper_id"],
            "abstract": abstract,
        }

def custom_collate_fn(batch):
    """Collate samples while keeping the non-tensor fields.

    ``input_ids``, ``attention_mask`` and ``labels`` are stacked into batch
    tensors; ``paper_id`` and ``abstract`` are returned as Python lists.
    """
    def _stacked(key):
        return torch.stack([sample[key] for sample in batch])

    collated = {name: _stacked(name) for name in ("input_ids", "attention_mask", "labels")}
    reference_texts = [sample["abstract"] for sample in batch]
    collated["paper_id"] = [sample["paper_id"] for sample in batch]
    collated["abstract"] = reference_texts
    return collated
    
# Load the data and split it into training / validation sets
train_data = load_json("train.json")
val_size = int(0.1 * len(train_data))  # hold out the last 10% for validation
# Slice by an explicit index: with val_size == 0, `train_data[-0:]` would put
# every record in validation and leave the training set empty.
split_index = len(train_data) - val_size
val_data = train_data[split_index:]
train_data = train_data[:split_index]

test_data = load_json("test.json")


train_dataset = TrainPaperDataset(train_data, tokenizer)
val_dataset = TrainPaperDataset(val_data, tokenizer)
# custom_collate_fn keeps the string fields (paper_id, abstract) in the batch.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=custom_collate_fn)

# Generation config with stop token handling.
# Beam search (5 beams) producing at most 300 new tokens per call; generation
# stops early when the tokenizer's EOS token is produced.
# NOTE(review): `min_length` presumably counts prompt tokens too for a
# decoder-only model, in which case 150 is already met by the prompt -- confirm.
generation_config = GenerationConfig(
    max_new_tokens=300,
    min_length=150,
    num_beams=5,
    length_penalty=1.0,
    repetition_penalty=1.1,
    no_repeat_ngram_size=2,
    early_stopping=True,
    eos_token_id=tokenizer.eos_token_id
)

# Training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6, weight_decay=1e-2)
scaler = GradScaler()

baseline_scores = {"rouge1": 0.47, "rouge2": 0.12, "rougeL": 0.22, "bertscore_f1": 0.85}
checkpoint_dir = "gemma-7b_checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# NOTE(review): validation runs every 10 epochs, so at most 10 validation
# rounds happen in 100 epochs and a patience of 30 rounds can never trigger.
patience = 30
patience_counter = 0
best_val_loss = float('inf')
best_rouge1 = 0.0

# Load the metrics once instead of on every validation round.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

for epoch in range(100):
    print(f"Epoch {epoch+1}")
    model.train()
    total_loss = 0
    trained_steps = 0

    for batch in tqdm(train_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # A fully masked sample has no target tokens; its loss would be NaN.
        if torch.all(labels == -100):
            continue

        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        total_loss += loss.item()
        trained_steps += 1

    # Average over the batches that were actually trained on (skipped batches
    # previously deflated the reported loss).
    avg_loss = total_loss / max(trained_steps, 1)
    print(f"Training Loss: {avg_loss:.4f}")

    # Validation
    if (epoch + 1) % 10 == 0:
        print(f"Starting validation at Epoch {epoch+1}")
        model.eval()
        val_loss = 0
        scored_batches = 0
        predicted_abstracts = []
        reference_abstracts = []
        with torch.no_grad():
            for i, batch in enumerate(val_loader):
                print(f"[Validation] Processing batch {i+1}/{len(val_loader)}")  
                
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                # Skip the loss for fully masked samples: it would be NaN and
                # make every later comparison against best_val_loss False.
                if not torch.all(labels == -100):
                    with autocast():
                        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                    val_loss += outputs.loss.item()
                    scored_batches += 1

                # Generate the abstract from the prompt only. `input_ids` also
                # holds the reference abstract (and padding); feeding it whole
                # would leak the answer into the prediction. Prompt tokens are
                # the attended positions whose label is -100.
                prompt_mask = (labels[0] == -100) & (attention_mask[0] == 1)
                gen_input = input_ids[0][prompt_mask].unsqueeze(0)
                if gen_input.size(1) == 0:
                    # No recoverable prompt: fall back to all attended tokens.
                    gen_input = input_ids[0][attention_mask[0] == 1].unsqueeze(0)
                gen_input = gen_input[:, -1024:]  # keep at most the last 1024 tokens
                generated_ids = model.generate(
                    input_ids=gen_input,
                    attention_mask=torch.ones_like(gen_input),
                    generation_config=generation_config,
                )
                # Decode only the continuation so the prompt never ends up in
                # the prediction; drop the marker if the model repeats it.
                new_tokens = generated_ids[0][gen_input.shape[1]:]
                decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)
                generated = decoded.split("[Abstract]")[-1].strip()

                # Remove newlines and normalize whitespace
                generated = generated.replace("\n", " ").strip()
                generated = re.sub(r'\s+', ' ', generated)

                predicted_abstracts.append(generated)
                reference_abstracts.append(batch["abstract"][0])

        # Mean validation loss over the batches that had target tokens.
        val_loss = val_loss / scored_batches if scored_batches else float('inf')
        print(f"Validation Loss: {val_loss:.4f}")

        # Compute evaluation metrics
        rouge_scores = rouge.compute(predictions=predicted_abstracts, references=reference_abstracts, use_stemmer=True)
        bert_scores = bertscore.compute(predictions=predicted_abstracts, references=reference_abstracts, lang="en")
        current_rouge1 = rouge_scores['rouge1']
        current_rouge2 = rouge_scores['rouge2']
        current_rougeL = rouge_scores['rougeL']
        current_bertscore = np.mean(bert_scores['f1'])

        print(f"ROUGE-1: {current_rouge1:.4f}, ROUGE-2: {current_rouge2:.4f}, ROUGE-L: {current_rougeL:.4f}, BERTScore-F1: {current_bertscore:.4f}")

        # Save on improvement, otherwise count towards early stopping
        if val_loss < best_val_loss or current_rouge1 > best_rouge1:
            # Track each best value independently so an improvement in one
            # metric cannot overwrite the other with a worse value.
            best_val_loss = min(best_val_loss, val_loss)
            best_rouge1 = max(best_rouge1, current_rouge1)
            patience_counter = 0
            model.save_pretrained(os.path.join(checkpoint_dir, "best_model"))
            print("Model saved.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping.")
                break

In [None]:
import json
import torch
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
from peft import PeftModel
from tqdm import tqdm
import numpy as np

# ---------- 前處理 ----------
def clean_text(text):
    """Strip LaTeX, citations, figure/table mentions and unusual symbols from `text`.

    The deletions are applied in the order listed below; afterwards every
    character outside letters, digits, whitespace and ``.,;:!?()-`` is turned
    into a space and runs of whitespace are collapsed to one space.

    Args:
        text: Raw introduction text.

    Returns:
        The cleaned, single-spaced, stripped string.
    """
    deletion_patterns = (
        r'\\[a-zA-Z]+\{.*?\}',                # LaTeX command with one braced argument
        r'\$\$.*?\$\$',                        # display math between $$ ... $$
        r'\$.*?\$',                            # inline math between $ ... $
        r'\[\d+(,\s*\d+)*\]',                  # numeric citations such as [1] or [2, 3]
        r'(Fig\.|Figure)\s*\d+.*?(?=\.|$)',    # figure mention up to the next period / end of text
        r'Table\s*\d+.*?(?=\.|$)',             # table mention up to the next period / end of text
    )
    for pattern in deletion_patterns:
        text = re.sub(pattern, '', text)

    # Replace every remaining disallowed character with a space.
    text = re.sub(r'[^a-zA-Z0-9\s.,;:!?()\-]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# ---------- Reranking 分數 ----------
def heuristic_score(summary, intro):
    """Score a candidate summary against its introduction for reranking.

    The score is the sum of three parts, each capped at 1.0:
      * number of whitespace-separated words in `summary`, divided by 400;
      * number of '.', '?' and '!' characters in `summary`, divided by 8;
      * number of distinct lower-cased whitespace tokens shared by `summary`
        and `intro`, divided by 30.

    Args:
        summary: Candidate abstract text.
        intro: Introduction text the summary was generated from.

    Returns:
        A float between 0.0 and 3.0.
    """
    word_count = len(summary.split())
    terminator_count = sum(summary.count(mark) for mark in '.?!')
    shared_vocab = set(intro.lower().split()).intersection(summary.lower().split())

    length_part = min(word_count, 400) / 400
    sentence_part = min(terminator_count, 8) / 8
    overlap_part = min(len(shared_vocab), 30) / 30
    return length_part + sentence_part + overlap_part

# ---------- Load model ----------
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "google/gemma-7b-it"
# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)
# Wrap the quantized base model with the PEFT weights stored under
# gemma-7b_checkpoints/best_model, then switch to evaluation mode.
model = PeftModel.from_pretrained(base_model, "gemma-7b_checkpoints/best_model")
model.eval()

# ---------- Generation settings ----------
# Beam search with 5 beams, at most 900 newly generated tokens.
# NOTE(review): for a causal LM `min_length` presumably counts the prompt tokens
# too, so 200 is likely already satisfied by the prompt and has no effect;
# `min_new_tokens` may be what was intended — confirm against the transformers docs.
generation_config = GenerationConfig(
    max_new_tokens=900,
    min_length=200,
    num_beams=5,
    length_penalty=1.0,
    repetition_penalty=1.1,
    no_repeat_ngram_size=2,
    early_stopping=True,
    eos_token_id=tokenizer.eos_token_id
)

# ---------- Load test data ----------
# test.json is JSON Lines (one object per line). UTF-8 is requested explicitly so
# the result does not depend on the platform default encoding, and blank lines
# (e.g. a trailing newline at the end of the file) are skipped instead of
# crashing json.loads.
with open("test.json", "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

# ---------- Generate abstracts + self-reranking ----------
# Number of candidates sampled per paper. With a single candidate the reranking
# step below simply returns that candidate; raise this to make reranking matter.
NUM_CANDIDATES = 1
# Marker the prompt asks the model to emit before the abstract text.
ABSTRACT_MARKER = "[ABSTRACT]"

results = []
for item in tqdm(test_data):
    paper_id = item["paper_id"]
    intro = clean_text(item["introduction"])  # clean the introduction text

    # Prompt with an explicit marker separating instructions from the answer.
    prompt = f"""You are an AI expert research assistant.
Your job is to read the following introduction from a scientific paper in the field of computer science or AI and generate a structured, informative abstract.

Focus on the following:
1. Research background and motivation.
2. Methodology or proposed solution.
3. Key results or contributions.

Use academic language and maintain clarity. The abstract should be suitable for publication.

Introduction: {intro}

Generate the abstract below, starting with [ABSTRACT] followed by the content:
"""

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
    prompt_token_count = inputs["input_ids"].shape[1]
    candidates = []
    for _ in range(NUM_CANDIDATES):
        with torch.no_grad():
            gen_ids = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                generation_config=generation_config,
                do_sample=True,
                top_p=0.9,
                temperature=0.7
            )
        # The returned sequence starts with the prompt tokens. The prompt itself
        # contains the "[ABSTRACT]" marker, so splitting the full decoded text on
        # the marker would return prompt text rather than the model's answer.
        # Decode only the newly generated tokens instead.
        decoded = tokenizer.decode(gen_ids[0][prompt_token_count:], skip_special_tokens=True)
        if ABSTRACT_MARKER in decoded:
            # Keep what follows the marker the model was asked to emit.
            summary = decoded.split(ABSTRACT_MARKER, 1)[1]
        else:
            # The model skipped the marker: the whole continuation is the abstract.
            summary = decoded
        # Flatten to a single line.
        summary = summary.strip().replace("\n", " ")
        candidates.append(summary)

    # Reranking: keep the candidate with the highest heuristic score.
    scores = [heuristic_score(c, intro) for c in candidates]
    best_summary = candidates[np.argmax(scores)]

    results.append({"paper_id": str(paper_id), "abstract": best_summary})

# ---------- Save results ----------
# Write the predictions as JSON Lines: one serialized record per line.
with open("submission_gemma.json", "w") as f:
    f.writelines(json.dumps(item) + "\n" for item in results)

print("✅ 推理與重排序完成，已儲存為 submission_gemma.json")

先計算token數量

In [None]:
from transformers import AutoTokenizer
import json
import statistics

# Load the tokenizer used for all token counts in this cell.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-12b-it", trust_remote_code=True)

# Helper that measures a text in tokens.
def count_tokens(text):
    """Return the number of token ids `tokenizer.encode` produces for `text`."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

# Read the data. Both files are JSON Lines (one JSON object per line).
def _read_jsonl(path):
    """Parse a JSON Lines file into a list of objects.

    The file is opened as UTF-8 (matching the other loaders in this notebook)
    and blank lines are skipped so a trailing newline does not raise
    json.JSONDecodeError.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


train_data = _read_jsonl("train.json")
test_data = _read_jsonl("test.json")

# Token counts for the training set: one entry per paper, in file order.
train_intro_tokens = [count_tokens(paper["introduction"]) for paper in train_data]
train_abstract_tokens = [count_tokens(paper["abstract"]) for paper in train_data]

# Token counts for the test set (only introductions are read from test.json).
test_intro_tokens = [count_tokens(paper["introduction"]) for paper in test_data]

def print_token_stats(title, token_counts):
    """Print summary statistics for a list of token counts.

    Prints `title` followed by the mean, median, minimum, maximum and sample
    standard deviation of `token_counts`.

    Args:
        title: Heading line printed before the statistics.
        token_counts: List of integer token counts.
    """
    print(title)
    if not token_counts:
        # statistics.mean / min / max all fail on an empty list.
        print("  (無資料)")
        return
    print(f"  平均 token 數: {statistics.mean(token_counts):.2f}")
    print(f"  中位數 token 數: {statistics.median(token_counts)}")
    print(f"  最小 token 數: {min(token_counts)}")
    print(f"  最大 token 數: {max(token_counts)}")
    # statistics.stdev requires at least two data points; report nan otherwise.
    spread = statistics.stdev(token_counts) if len(token_counts) > 1 else float("nan")
    print(f"  標準差: {spread:.2f}")


# Statistics for the training data.
print_token_stats("訓練資料 (Introduction):", train_intro_tokens)
print()

print_token_stats("訓練資料 (Abstract):", train_abstract_tokens)
print()

# Statistics for the test data.
print_token_stats("測試資料 (Introduction):", test_intro_tokens)

# Qwen

In [None]:
pip install torch transformers datasets rouge_score scikit-learn numpy tiktoken

In [None]:
pip install einops transformers_stream_generator

訓練

In [None]:
import json
import random
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, BitsAndBytesConfig

from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

import spacy
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

import os
# Create the checkpoint directories up front; exist_ok=True makes re-running the cell safe.
os.makedirs("qwen_checkpoints/best_model", exist_ok=True)
os.makedirs("qwen_checkpoints/final_model", exist_ok=True)

# spaCy pipeline used by extract_important_sentences for sentence splitting.
nlp = spacy.load("en_core_web_sm")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


def extract_important_sentences(text, tokenizer, max_token_length=4096):
    """Shrink `text` to at most `max_token_length` tokens by keeping its most informative sentences.

    Sentences are split with spaCy and scored by the sum of their TF-IDF
    weights. The highest-scoring sentences are kept until the token budget is
    reached, and the kept sentences are returned in their ORIGINAL document
    order so the introduction still reads coherently.

    Text that already fits the budget is returned unchanged. Previously every
    multi-sentence text was re-ordered by TF-IDF rank, which scrambled the
    introduction even when no shortening was needed and differed from the
    inference-time preprocessing, which leaves short texts untouched.

    Args:
        text: Introduction text.
        tokenizer: Callable tokenizer returning a mapping with "input_ids".
        max_token_length: Token budget for the returned text (sum of the
            per-sentence token counts, without special tokens).

    Returns:
        `text` itself when it has at most one sentence or already fits the
        budget; otherwise the selected sentences joined by single spaces.
    """
    # Split into sentences with spaCy.
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
    if len(sentences) <= 1:
        return text

    token_counts = [
        len(tokenizer(sent, add_special_tokens=False)["input_ids"]) for sent in sentences
    ]
    if sum(token_counts) <= max_token_length:
        # Nothing to drop: keep the author's wording and ordering as is.
        return text

    # Rank sentence indices by TF-IDF importance (ties broken by sentence text,
    # matching the previous (score, sentence) ordering).
    try:
        scores = TfidfVectorizer().fit_transform(sentences).sum(axis=1).A1
        ranked = sorted(
            range(len(sentences)),
            key=lambda i: (scores[i], sentences[i]),
            reverse=True,
        )
    except ValueError:
        # TfidfVectorizer raises ValueError when the vocabulary is empty
        # (e.g. only stop words / punctuation); fall back to document order.
        ranked = list(range(len(sentences)))

    chosen = []
    total_tokens = 0
    for i in ranked:
        if total_tokens + token_counts[i] > max_token_length:
            break
        chosen.append(i)
        total_tokens += token_counts[i]

    # Emit the kept sentences in document order, not rank order.
    return " ".join(sentences[i] for i in sorted(chosen))


# 1. 資料處理
class CustomDataset(Dataset):
    """Causal-LM fine-tuning dataset mapping an introduction to its abstract.

    Each item is the tokenized text "<prompt> <abstract><eos>" padded/truncated
    to `max_length`, with labels that are -100 (ignored by the loss) on the
    prompt and on padding, so only abstract tokens and the closing EOS are
    trained on.

    Args:
        data: Sequence of dicts with "introduction" and "abstract" strings.
        tokenizer: Hugging Face style tokenizer (callable, with `eos_token`).
        max_length: Fixed sequence length of every returned tensor.
    """

    def __init__(self, data, tokenizer, max_length=4096):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _build_labels(input_ids, attention_mask, prompt_len, ignore_index=-100):
        """Return a copy of `input_ids` with prompt and padding positions set to `ignore_index`.

        Padding is identified through `attention_mask` rather than the pad token
        id, because the pad token is set to the EOS token and the real EOS must
        stay a training target. The prompt is taken to be the first `prompt_len`
        real (non-padding) tokens, which holds for left or right padding.
        """
        labels = input_ids.clone()
        labels[attention_mask == 0] = ignore_index
        # argmax on a 0/1 mask yields the index of the first real token.
        first_real = int(attention_mask.argmax())
        labels[first_real:first_real + prompt_len] = ignore_index
        return labels

    def __getitem__(self, idx):
        intro = self.data[idx]["introduction"]
        abstract = self.data[idx]["abstract"]

        # Shorten over-long introductions, reserving 512 tokens for prompt text and abstract.
        intro = extract_important_sentences(intro, self.tokenizer, max_token_length=self.max_length - 512)

        prompt = (
            "You are a Computer Science research assistant. Summarize the following introduction into a clear and concise academic abstract.\n\n"
            f"Introduction: {intro}\n\n"
            "Abstract:"
        )
        # Append EOS explicitly: once padding is masked out of the labels this is
        # the only place the model sees "stop here" as a target.
        eos = self.tokenizer.eos_token or ""
        full_text = f"{prompt} {abstract}{eos}"

        tokenized = self.tokenizer(full_text, max_length=self.max_length, truncation=True, padding="max_length", return_tensors="pt")
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)

        prompt_len = len(self.tokenizer(prompt, truncation=True, max_length=self.max_length)["input_ids"])
        # Previously only the prompt was masked, so the loss also covered every
        # padding position (thousands of pad/EOS tokens per sample).
        labels = self._build_labels(input_ids, attention_mask, prompt_len)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels
        }

# Read train.json as JSON Lines, tolerating blank and malformed lines.
data = []
with open("train.json", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():  # skip blank lines
            try:
                data.append(json.loads(line.strip()))
            except json.JSONDecodeError as e:
                # Report and skip lines that are not valid JSON instead of aborting.
                print(f"Skipping invalid JSON line: {e}")
                continue

# Hold out 10% of the papers for validation; the fixed random_state keeps the split reproducible.
train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
print(f"Training samples: {len(train_data)}, Validation samples: {len(val_data)}")

# 2. Load the Qwen model and tokenizer
model_name = "Qwen/Qwen2-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding reuses the EOS token, so pad positions carry the same id as EOS.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Enable gradient checkpointing, then prepare the quantized model for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# LoRA adapters (rank 64, alpha 128) on the q/k/v/o projection modules.
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Build the datasets and DataLoaders (batch size 1; only the training loader shuffles).
train_dataset = CustomDataset(train_data, tokenizer, max_length=4096)
val_dataset = CustomDataset(val_data, tokenizer, max_length=4096)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

# 3. Training configuration
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
gradient_accumulation_steps = 4
# Linear decay without warmup; the step count is the number of optimizer updates,
# i.e. batches * epochs divided by the accumulation window (floor division).
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * num_epochs // gradient_accumulation_steps)

# Early stopping: number of consecutive validations without improvement tolerated.
# NOTE(review): the training loop validates only when (epoch + 1) % 5 == 0, so with
# num_epochs = 10 there are at most two validations and this patience of 5 can
# never be reached — confirm whether that is intended.
patience = 5
early_stopping_counter = 0

# ROUGE 評估函數
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
def compute_rouge(predictions, references):
    """Average ROUGE-1/2/L F-measures over paired predictions and references.

    Args:
        predictions: Generated texts.
        references: Reference texts, aligned with `predictions`; pairs are
            formed with zip, so extra items on the longer side are ignored.

    Returns:
        Dict with keys "rouge1", "rouge2" and "rougeL", each the numpy mean of
        the per-pair F-measures.
    """
    metric_names = ("rouge1", "rouge2", "rougeL")
    # scorer.score takes the reference first and the prediction second.
    pair_scores = [scorer.score(ref, pred) for pred, ref in zip(predictions, references)]
    return {
        name: np.mean([pair[name].fmeasure for pair in pair_scores])
        for name in metric_names
    }

def compute_bestscore1(rouge_scores):
    """Return the "rouge1" entry of `rouge_scores` (logged as BestScore-1 during validation).

    Raises:
        KeyError: if `rouge_scores` has no "rouge1" key.
    """
    rouge1 = rouge_scores["rouge1"]
    return rouge1

# 4. Training and validation loop
#
# Every epoch runs one pass over train_loader with gradient accumulation.
# Every 5th epoch the model is validated by generating an abstract for each
# validation item and scoring it with ROUGE; the best ROUGE-1 checkpoint is
# saved, and a validation without improvement halves the learning rate.
best_rouge = 0.0
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, batch in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch+1}", leave=False)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Scale the loss so the accumulated gradient is the mean over the window.
        loss = outputs.loss / gradient_accumulation_steps
        # Undo the scaling for logging so avg_loss is the plain per-batch mean.
        total_loss += loss.item() * gradient_accumulation_steps

        loss.backward()
        if (i + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

    if (epoch + 1) % 5 == 0:
        model.eval()
        predictions, references = [], []
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validating"):
                rows = zip(batch["input_ids"], batch["attention_mask"], batch["labels"])
                for row_ids, row_mask, row_labels in rows:
                    # The dataset stores prompt + reference abstract (+ padding) in
                    # input_ids. Generating from that whole sequence would show the
                    # model the answer, so split the row first:
                    #   prompt    = real tokens whose label is -100
                    #   reference = real tokens whose label is kept
                    # Positions with attention_mask == 0 are padding and excluded.
                    is_real = row_mask.bool()
                    is_prompt = is_real & (row_labels == -100)
                    is_target = is_real & (row_labels != -100)

                    prompt_ids = row_ids[is_prompt].unsqueeze(0).to(device)
                    # Deterministic beam search; top_k / top_p / temperature were
                    # removed because they are ignored when do_sample=False.
                    outputs = model.generate(
                        input_ids=prompt_ids,
                        attention_mask=torch.ones_like(prompt_ids),
                        max_new_tokens=550,
                        num_beams=8,
                        do_sample=False,
                        no_repeat_ngram_size=3,
                        early_stopping=True,
                        pad_token_id=tokenizer.pad_token_id
                    )
                    # Keep only the continuation; the output starts with the prompt.
                    new_tokens = outputs[0][prompt_ids.shape[1]:]
                    predictions.append(tokenizer.decode(new_tokens, skip_special_tokens=True).strip())
                    # Decode only real target ids: -100 is not a valid token id and
                    # must never reach tokenizer.decode.
                    references.append(tokenizer.decode(row_labels[is_target], skip_special_tokens=True).strip())

        rouge_scores = compute_rouge(predictions, references)
        bestscore1 = compute_bestscore1(rouge_scores)
        print(f"Validation - ROUGE-1: {rouge_scores['rouge1']:.4f}, ROUGE-2: {rouge_scores['rouge2']:.4f}, ROUGE-L: {rouge_scores['rougeL']:.4f}, BestScore-1: {bestscore1:.4f}")

        # Early-stopping bookkeeping based on ROUGE-1.
        current_score = rouge_scores["rouge1"]
        if current_score > best_rouge:
            best_rouge = current_score
            early_stopping_counter = 0  # reset counter
            print("New best ROUGE-1, saving model...")
            model.save_pretrained("qwen_checkpoints/best_model")
            tokenizer.save_pretrained("qwen_checkpoints/best_model")
        else:
            early_stopping_counter += 1
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.5
            # The scheduler recomputes the learning rate from its stored base
            # rates on every step, which would silently undo the halving above;
            # scale the base rates too so the reduction persists.
            if hasattr(scheduler, "base_lrs"):
                scheduler.base_lrs = [base_lr * 0.5 for base_lr in scheduler.base_lrs]
            print(f"ROUGE-1 not improved, reducing LR to {optimizer.param_groups[0]['lr']:.6f}")

        if early_stopping_counter >= patience:
            print(f"Early stopping triggered after {patience} validations without improvement.")
            break

# 5. Save the final model
model.save_pretrained("qwen_checkpoints/final_model")
tokenizer.save_pretrained("qwen_checkpoints/final_model")
print("Training completed!")

推理

In [None]:
import json
import torch
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from nltk.tokenize import sent_tokenize
from peft import PeftModel
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import unicodedata

nlp = spacy.load("en_core_web_sm")

# === 資料清洗 ===
# Non-ASCII symbols that are always kept during cleaning.
allowed_unicode = "∑∂∇∞θπ𝒟𝒫𝒩αβγδελμσφωℝ𝔽𝓛"
def is_allowed_char(c):
    """Return True when character `c` should survive cleaning.

    A character is kept when it is ASCII (code point below 128), when it is
    listed in `allowed_unicode`, or when its Unicode name contains
    "MATHEMATICAL". Characters without a Unicode name count as not mathematical.
    """
    if ord(c) < 128:
        return True
    if c in allowed_unicode:
        return True
    char_name = unicodedata.name(c, "")
    return "MATHEMATICAL" in char_name

def clean_intro(text):
    """Normalize an introduction before it is placed into the prompt.

    Steps, in order: drop anything that looks like an ``<...>`` tag, unwrap
    LaTeX commands with one braced argument (``\\emph{word}`` becomes ``word``),
    replace every character rejected by `is_allowed_char` with a space, and
    collapse whitespace runs into single spaces.

    Returns:
        The cleaned, stripped string.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    # Keep the argument text of commands such as \emph{...}.
    unwrapped = re.sub(r'\\[a-zA-Z]+\{([^}]*)\}', r'\1', without_tags)
    filtered = ''.join(ch if is_allowed_char(ch) else ' ' for ch in unwrapped)
    return re.sub(r'\s+', ' ', filtered).strip()


def extract_important_sentences(text, tokenizer, max_token_length=4096, keep_head_sentences=3):
    """Shorten `text` by keeping its opening sentences plus the most informative later ones.

    The first `keep_head_sentences` sentences are always kept. The remaining
    sentences are ranked by the sum of their TF-IDF weights and added in rank
    order until the next one would exceed `max_token_length` tokens. The kept
    sentences are returned in their ORIGINAL document order; previously the
    ranked sentences were appended in score order, which scrambled the flow of
    the introduction handed to the model.

    Args:
        text: Introduction text.
        tokenizer: Callable tokenizer returning a mapping with "input_ids".
        max_token_length: Token budget (per-sentence counts, no special tokens).
        keep_head_sentences: Number of leading sentences that are always kept.

    Returns:
        `text` itself when it has at most `keep_head_sentences` sentences,
        otherwise the selected sentences joined by single spaces.
    """
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    if len(sentences) <= keep_head_sentences:
        return text

    def token_len(sentence):
        return len(tokenizer(sentence, add_special_tokens=False)["input_ids"])

    # Indices of the sentences to keep, starting with the head of the text.
    chosen = list(range(keep_head_sentences))
    total_tokens = sum(token_len(sentences[i]) for i in chosen)

    remaining = list(range(keep_head_sentences, len(sentences)))
    try:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([sentences[i] for i in remaining])
        scores = tfidf_matrix.sum(axis=1).A1
        # Highest score first; ties broken by sentence text as before.
        ranked = sorted(
            remaining,
            key=lambda i: (scores[i - keep_head_sentences], sentences[i]),
            reverse=True,
        )
    except Exception as e:
        # Best-effort fallback (e.g. empty TF-IDF vocabulary): take the next few
        # sentences in document order.
        print(f"[TF-IDF fallback] {e}")
        ranked = remaining[:5]

    for i in ranked:
        sentence_tokens = token_len(sentences[i])
        if total_tokens + sentence_tokens > max_token_length:
            break
        chosen.append(i)
        total_tokens += sentence_tokens

    # Restore document order before joining.
    return " ".join(sentences[i] for i in sorted(chosen))

def maybe_extract_important_sentences(text, tokenizer, max_token_threshold=3000, keep_head_sentences=3):
    """Return `text` unchanged when it is short enough, otherwise a shortened version.

    The text is tokenized without special tokens; when it has more than
    `max_token_threshold` tokens it is passed to `extract_important_sentences`
    with that threshold as the token budget.
    """
    token_count = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    if token_count > max_token_threshold:
        return extract_important_sentences(
            text,
            tokenizer,
            max_token_length=max_token_threshold,
            keep_head_sentences=keep_head_sentences,
        )
    return text

def clean_generated_abstract(text):
    """Remove exact duplicate sentences from a generated abstract.

    The text is split into sentences with spaCy; each sentence is stripped,
    empty ones are dropped, and only the first occurrence of every distinct
    sentence is kept. The survivors are joined with single spaces in their
    original order.
    """
    stripped_sentences = (sent.text.strip() for sent in nlp(text).sents)
    # dict.fromkeys keeps first-seen order while discarding repeats.
    unique_in_order = dict.fromkeys(s for s in stripped_sentences if s)
    return " ".join(unique_in_order)

# === Settings ===
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_input_length = 4096   # prompt token budget (the inference loop truncates prompts at 4096 tokens)
max_output_tokens = 650   # passed to generate() as max_new_tokens
# NOTE(review): the training cell in this notebook saves to qwen_checkpoints/best_model
# and qwen_checkpoints/final_model; "epoch_6" presumably comes from a different
# run — verify the directory exists before running.
model_dir = "qwen_checkpoints/epoch_6"

# === Load tokenizer and model ===
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
# Padding reuses the EOS token.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Wrap the quantized base model with the PEFT weights stored in model_dir.
model = PeftModel.from_pretrained(base_model, model_dir)
model.eval()

# === Load data ===
# test.json is JSON Lines; blank lines are skipped.
with open("test.json", "r", encoding="utf-8") as f:
    test_data = [json.loads(line.strip()) for line in f if line.strip()]

# === Inference ===
# For every test paper: clean and (if needed) shorten the introduction, build
# the prompt, generate with beam search, and keep only the generated abstract.
results = []
for item in tqdm(test_data, desc="Generating Abstracts"):
    paper_id = item["paper_id"]
    intro = clean_intro(item["introduction"])
    intro = maybe_extract_important_sentences(intro, tokenizer)

    prompt = (
        "You are an expert research assistant specialized in artificial intelligence. "
        "Your task is to read the following paper introduction and write a clear, formal, and concise abstract. "
        "Focus on the research background, motivation, methods, and key contributions.\n\n"
        f"Introduction:\n{intro}\n\n"
        "Abstract:"
    )

    tokenized = tokenizer(prompt, return_tensors="pt", max_length=max_input_length, truncation=True, padding=True).to(device)
    input_ids = tokenized["input_ids"]
    attention_mask = tokenized["attention_mask"]

    # Deterministic beam search. temperature / top_k / top_p were removed
    # because they are ignored when do_sample=False.
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_output_tokens,
        attention_mask=attention_mask,
        num_beams=8,
        do_sample=False,
        early_stopping=True
    )

    # The returned sequence begins with the prompt tokens. Decode only the newly
    # generated part instead of decoding everything and splitting on the last
    # "Abstract:", which cut off any abstract that itself contained that string
    # and relied on the prompt's final "Abstract:" surviving truncation.
    generated_ids = outputs[0][input_ids.shape[1]:]
    abstract = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    if abstract.startswith("Abstract:"):
        # The model occasionally repeats the cue; drop a leading copy only.
        abstract = abstract[len("Abstract:"):].strip()

    abstract = clean_generated_abstract(abstract)
    results.append({"paper_id": str(paper_id), "abstract": abstract})

# === Write results ===
# JSON Lines output: one record per line, non-ASCII characters written as-is.
with open("submission_qwen.json", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(item, ensure_ascii=False) + "\n" for item in results)

print("摘要已儲存到 submission_qwen.json")