In [1]:
import pandas as pd
from datasets import Dataset
import unicodedata
import os
import sys
import re

def clean_text_for_inference(text):
    def unicode_to_ascii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
    text = unicode_to_ascii(text.lower().strip())
    # convert ... and .. to <title_end> and <p>
    # however, we can not use "<>" since it will be removed by tokenizer in default

    text = re.sub(r"\.\.\.", " ", text)
    text = re.sub(r".*?\.\.", "", text)
    
    # Abbreviation Restoration & Stem Preservation
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)

    # Multiple spaces merge
    text = re.sub(r"\s+", " ", text).strip()
    return text

loadDataSize = 500
df = pd.read_csv("../data/bbc-news-summary.csv").dropna()
print(df.columns)
per_class_count = int(loadDataSize / 5)
selected_dfs = []

for category in df['File_path'].unique():
	category_df = df[df['File_path'] == category].head(per_class_count)
	selected_dfs.append(category_df)

selected_df = pd.concat(selected_dfs).sample(frac=1).reset_index(drop=True)  # shuffle the selected data

# 清理文本并添加 <sos> 和 <eos>
selected_df['input_text'] = selected_df['Articles'].apply(clean_text_for_inference)
selected_df['target_text'] = selected_df['Summaries'].apply(clean_text_for_inference)


dataset = Dataset.from_pandas(selected_df)


  from .autonotebook import tqdm as notebook_tqdm


Index(['File_path', 'Articles', 'Summaries'], dtype='object')


In [10]:
selected_df.head()

Unnamed: 0,File_path,Articles,Summaries,input_text,target_text
0,politics,Kilroy names election seat target..Ex-chat sho...,"UKIP's leader, Roger Knapman, has said he is g...",ex-chat show host robert kilroy-silk is to con...,"ukip's leader, roger knapman, has said he is g..."
1,business,Mitsubishi in Peugeot link talks..Trouble-hit ...,Trouble-hit Mitsubishi Motors is in talks with...,trouble-hit mitsubishi motors is in talks with...,trouble-hit mitsubishi motors is in talks with...
2,entertainment,Court halts Mark Morrison album..Premiership f...,Premiership footballer and record company boss...,but morrison is determined the album will be r...,premiership footballer and record company boss...
3,entertainment,US 'to raise TV indecency fines'..US politicia...,Last year's Janet Jackson 'wardrobe malfunctio...,us politicians are proposing a tough new law a...,last year's janet jackson 'wardrobe malfunctio...
4,business,Mild winter drives US oil down 6%..US oil pric...,"US oil prices have fallen by 6%, driven down b...","us oil prices have fallen by 6%, driven down b...","us oil prices have fallen by 6%, driven down b..."


In [12]:
print(selected_df.iloc[0]["input_text"])
print(selected_df.iloc[0]["target_text"])

ex-chat show host robert kilroy-silk is to contest the derbyshire seat of erewash at the next general election labour's elizabeth blackman won the seat in 1997 and has a 6,932 majority. she says she will fight on her record "as a hard-working constituency mp". mr kilroy-silk announced his plans a day after launching his new party, veritas, the latin for truth. the east midlands mep, who quit the uk independence party, wants his new group to "change the face" of uk politics. his choice of election constituency quashes speculation that he would stand against defence secretary geoff hoon in ashfield, nottinghamshire. ukip won 31% of the vote in erewash in last june's european elections - with mr kilroy-silk among their candidates for the region. until 1997, erewash had been held by the tories since 1970. ms blackman said she was proud of the government's achievements in the area. she declined to give her view of mr kilroy-silk at this point on thursday, he told a london news conference th

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# 加载模型和分词器
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# 设置设备（使用 GPU 加速）
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [3]:
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["input_text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=160, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# 对数据集进行预处理
tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map: 100%|██████████| 500/500 [00:00<00:00, 565.13 examples/s]


In [4]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   # 确保评估和保存策略一致
    save_strategy="epoch",         # 设置为 steps
	logging_steps=100,
    learning_rate=2e-5,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=45,
    weight_decay=0.01,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True
)


In [5]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# 开始训练
trainer.train()


  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,1.784433
2,No log,1.433337
3,2.229100,1.258903
4,2.229100,1.168095
5,1.400700,1.11773
6,1.400700,1.090694
7,1.400700,1.064648
8,1.262100,1.047797
9,1.262100,1.031112
10,1.200700,1.017734


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1890, training_loss=1.1596119552693038, metrics={'train_runtime': 705.8576, 'train_samples_per_second': 31.876, 'train_steps_per_second': 2.678, 'total_flos': 3041905322360832.0, 'train_loss': 1.1596119552693038, 'epoch': 45.0})

In [6]:
# 保存微调后的模型
model.save_pretrained("fine-tuned-t5-small-news-summarization")
tokenizer.save_pretrained("fine-tuned-t5-small-news-summarization")


('fine-tuned-t5-small-news-summarization\\tokenizer_config.json',
 'fine-tuned-t5-small-news-summarization\\special_tokens_map.json',
 'fine-tuned-t5-small-news-summarization\\spiece.model',
 'fine-tuned-t5-small-news-summarization\\added_tokens.json')

In [8]:
# 加载微调后的模型
from transformers import pipeline

test_input = """US ready to sign Ukraine minerals deal ‘this afternoon’, as Kyiv sends minister to Washington title_end The latest line from US Treasury Secretary Scott Bessent is that the US is ready to sign the deal if Ukraine is - but let's take a moment to look back at today's developments. p A senior source earlier this afternoon told the BBC that Ukraine was ready to sign the deal today and that economy minister Yulia Svyrdenko was en route to Washington. p Ukraine's Prime Minister Denys Shmyhal then said that the fine details were being worked on and he hoped it would be signed in the next 24 hours. p In Washington, the first we heard from the administration was at the end of a cabinet meeting marking the first 100 days of Trump's second term. Bessent responded to a question from the press and indicated that the US was ready to finalise the agreement after some "last minute changes".

"""

summarizer = pipeline("summarization", model="fine-tuned-t5-small-news-summarization")

summary = summarizer(test_input, max_length=100, min_length=30, do_sample=False)
print(summary)


Device set to use cuda:0


[{'summary_text': "p A senior source earlier this afternoon told the BBC that Ukraine was ready to sign the deal today and that economy minister Yulia Svyrdenko was en route to Washington.p Ukraine's Prime Minister Denys Shmyhal then said that the fine details were being worked on and he hoped it would be signed in the next 24 hours.p In Washington, the first we heard from the administration was at the end of a cabinet meeting marking"}]
