In [1]:
!pip install evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e974d72af74f803bfe1c2f3c31daf297a13a970d9a3e3251c619c8269edc96b7
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.3 rouge_score-0.1.2


In [2]:

import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    get_linear_schedule_with_warmup,
)
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. 加载教师模型和学生模型
# load student teacher model and student model 
#teacher model is bart-large model fine tuned on CNN dataset, with good performance in summary
teacher_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn").to(device)
# 教师模型只用于生成软标签，因此设为eval模式，不进行反向传播
teacher_model.eval()
# student model is distilled model from bart, also trained on CNN dataset.
# This project will distill this student model again on the Xsum dataset.
student_model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-6-6").to(device)

# 两个模型可以使用同一个分词器，也可以分别加载（此处为了方便，统一使用 BART 的分词器）
# they could share the tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/460M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:

# 2. 加载并预处理数据集
#load and preprocess the dataset
from datasets import load_dataset
# 加载 XSUM 的 100 个样本
xsum_dataset = load_dataset("EdinburghNLP/xsum")
# For saving the resources, random select 1000 samples from dataset as the trianset
dataset  = xsum_dataset["train"].shuffle(seed=42).select(range(1000))
# 定义预处理函数，将输入文本和摘要转为token id
#preprocess the dataset, to fit the train
def preprocess(example):
    # 输入为完整文章（字段名可能是 "document"），目标为摘要（字段名为 "summary"）
    inputs = tokenizer(example["document"], truncation=True, padding="max_length", max_length=1024)
    targets = tokenizer(example["summary"], truncation=True, padding="max_length", max_length=400)
    # 注意：生成任务需要 labels 字段
    labels = targets["input_ids"]
    labels = [ [t if t != tokenizer.pad_token_id else -100 for t in label] for label in labels]
    
    inputs["labels"] = labels
    return inputs
#get the trainset
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset.column_names)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
import evaluate
from tqdm import tqdm
# 加载 XSum 测试集前 20 个样本
# use 100 samples on Xsum, before do the distillation, to evalute the original student model's performance on Xsum
test_dataset = load_dataset("EdinburghNLP/xsum", split="test[:100]")

# ROUGE 计算器
rouge = evaluate.load("rouge")

# 生成摘要并收集结果
generated_summaries = []
reference_summaries = []

for sample in tqdm(test_dataset):
    article = sample["document"]
    reference_summary = sample["summary"]

    # 编码输入
    inputs = tokenizer(
        article,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding="max_length"
    ).to(device)

    # 生成摘要
    summary_ids = student_model.generate(
        **inputs,
        max_length=400,
        num_beams=4,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    # 解码生成的摘要
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    generated_summaries.append(generated_summary)
    reference_summaries.append(reference_summary)

# 计算 ROUGE 分数
results = rouge.compute(
    predictions=generated_summaries,
    references=reference_summaries,
    use_stemmer=True
)

# 打印结果
#Show the rouge results

for key in results:
    print(f"{key}: {results[key]*100:.2f}")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


rouge1: 20.74
rouge2: 4.12
rougeL: 13.72
rougeLsum: 13.78


In [5]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# define the trainer to do the distillation
class DistillationTrainer(Seq2SeqTrainer):
    def __init__(self, teacher_model, *args, alpha=0.5, temperature=1.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.alpha = alpha
        self.temperature = temperature
        self.kl_loss = KLDivLoss(reduction="batchmean")

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):  # 添加 **kwargs 吸收额外参数
        # 学生模型前向传播
        # forward
        labels = inputs.pop("labels")
        outputs = model(**inputs, labels=labels)
        # the outputs.loss could be the term of fine tune directly on the trainset
        student_loss = outputs.loss
        student_logits = outputs.logits

        # 教师模型前向传播（无梯度）
        #get the output logits of the teacher model from the same sample as the soft label.
        with torch.no_grad():
            teacher_outputs = self.teacher(
                input_ids=inputs["input_ids"].to(self.teacher.device),
                attention_mask=inputs["attention_mask"].to(self.teacher.device),
                labels=labels.to(self.teacher.device)
            )
            teacher_logits = teacher_outputs.logits

        # 计算KL散度损失（带mask处理）
        #calculate the KL divergence as the term of distillation.
        student_log_probs = F.log_softmax(student_logits / self.temperature, dim=-1)
        teacher_probs = F.softmax(teacher_logits / self.temperature, dim=-1)
        
        # 创建有效token的mask（忽略标签为-100的位置）
        mask = (labels != -100).unsqueeze(-1)
        valid_token_count = mask.sum()  # 统计有效token数量
        
        # 应用mask并计算KL散度
        kl_loss = self.kl_loss(
            (student_log_probs * mask).view(-1, student_log_probs.size(-1)),
            (teacher_probs * mask).view(-1, teacher_probs.size(-1))
        ) * (self.temperature ** 2)  # 根据温度系数调整损失尺度

        # 组合损失
        #get the final loss function, use alpha to balance two terms.
        total_loss = self.alpha * student_loss + (1 - self.alpha) * kl_loss

        return (total_loss, outputs) if return_outputs else total_loss

In [6]:
from torch.nn import KLDivLoss
# 4. 设置训练参数
#set args
training_args = Seq2SeqTrainingArguments(
    output_dir="./distillation_results",
    per_device_train_batch_size=4,
    num_train_epochs=7,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    report_to="none"
)

# 5. 初始化并运行训练

trainer = DistillationTrainer(
    teacher_model=teacher_model,
    model=student_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    alpha=0.3,
    temperature=1.0
)

# 开始训练
trainer.train()



Step,Training Loss
10,1.0443
20,0.9053
30,0.9132
40,0.8082
50,0.855
60,0.828
70,0.7927
80,0.8374
90,0.8396
100,0.8474




TrainOutput(global_step=875, training_loss=0.40073562036241805, metrics={'train_runtime': 2759.4773, 'train_samples_per_second': 2.537, 'train_steps_per_second': 0.317, 'total_flos': 7584954187776000.0, 'train_loss': 0.40073562036241805, 'epoch': 7.0})

In [7]:
import evaluate
from tqdm import tqdm
# 加载 XSum 测试集前 20 个样本
# use same testset to test the performance after the distillation
test_dataset = load_dataset("EdinburghNLP/xsum", split="test[:100]")

# ROUGE 计算器
rouge = evaluate.load("rouge")

# 生成摘要并收集结果
generated_summaries = []
reference_summaries = []

for sample in tqdm(test_dataset):
    article = sample["document"]
    reference_summary = sample["summary"]

    # 编码输入
    inputs = tokenizer(
        article,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding="max_length"
    ).to(device)

    # 生成摘要
    summary_ids = student_model.generate(
        **inputs,
        max_length=400,
        num_beams=4,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    # 解码生成的摘要
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    generated_summaries.append(generated_summary)
    reference_summaries.append(reference_summary)

# 计算 ROUGE 分数
results = rouge.compute(
    predictions=generated_summaries,
    references=reference_summaries,
    use_stemmer=True
)

# 打印结果
#the rouge score imporved
for key in results:
    print(f"{key}: {results[key]*100:.2f}")

100%|██████████| 100/100 [01:06<00:00,  1.50it/s]


rouge1: 26.78
rouge2: 7.99
rougeL: 19.18
rougeLsum: 19.13


In [8]:
from huggingface_hub import interpreter_login

interpreter_login()

# Upload Model & Tokenizer
student_model.push_to_hub("NuppuCat/distillBart-6-6-1000xsum-8epoche")
tokenizer.push_to_hub("NuppuCat/distillBart-6-6-1000xsum-8epoche")


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



Enter your token (input will not be visible):  ········
Add token as git credential? (Y/n)  Y


Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m


model.safetensors:   0%|          | 0.00/920M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NuppuCat/distillBart-6-6-1000xsum-8epoche/commit/7ad4161ccf92454659156c8c1eb128dd344158cb', commit_message='Upload tokenizer', commit_description='', oid='7ad4161ccf92454659156c8c1eb128dd344158cb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NuppuCat/distillBart-6-6-1000xsum-8epoche', endpoint='https://huggingface.co', repo_type='model', repo_id='NuppuCat/distillBart-6-6-1000xsum-8epoche'), pr_revision=None, pr_num=None)

In [9]:
# 测试文本（可以换成任何段落）
#test a text
text = """
Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. The term AI is often applied to machines that exhibit traits associated with human intelligence, such as learning, problem-solving, and decision-making. With advances in machine learning, AI systems are becoming more capable of performing complex tasks such as recognizing speech, translating languages, and driving autonomous vehicles. However, the development of AI raises ethical concerns regarding privacy, job displacement, and the potential for misuse of technology.
"""

# 编码输入
inputs = tokenizer(
    article,
    return_tensors="pt",
    truncation=True,
    max_length=1024,
    padding="max_length"
).to(device)

# 生成摘要
summary_ids = student_model.generate(
    **inputs,
    max_length=400,
    num_beams=4,
    length_penalty=2.0,
    no_repeat_ngram_size=3,
    early_stopping=True
)

# 解码生成的摘要
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 输出生成的摘要
print("Generated Summary:")
print(generated_summary)

Generated Summary:
A "loved" man who worked for a company that recovered cocaine has been jailed for six months after admitting a charge of conspiracy to supply the Class A drug. Omar Khan, 31, and his co-workers, Albert Dibra and Nazaquat Ali, have been remanded in custody.


In [10]:
#reload the distilled model from huggingface
m = AutoModelForSeq2SeqLM.from_pretrained("NuppuCat/distillBart-6-6-1000xsum-8epoche").to(device)

# 两个模型可以使用同一个分词器，也可以分别加载（此处为了方便，统一使用 BART 的分词器）
t = AutoTokenizer.from_pretrained("NuppuCat/distillBart-6-6-1000xsum-8epoche")

# 测试文本（可以换成任何段落）
text = """
Artificial intelligence (AI) refers to the simulation of human intelligence in machines that are programmed to think like humans and mimic their actions. The term AI is often applied to machines that exhibit traits associated with human intelligence, such as learning, problem-solving, and decision-making. With advances in machine learning, AI systems are becoming more capable of performing complex tasks such as recognizing speech, translating languages, and driving autonomous vehicles. However, the development of AI raises ethical concerns regarding privacy, job displacement, and the potential for misuse of technology.
"""

# 编码输入
inputs = t(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=1024,
    padding="max_length"
).to(device)

# 生成摘要
summary_ids = m.generate(
    **inputs,
    max_length=400,
    num_beams=4,
    length_penalty=2.0,
    no_repeat_ngram_size=3,
    early_stopping=True
)

# 解码生成的摘要
generated_summary = t.decode(summary_ids[0], skip_special_tokens=True)

# 输出生成的摘要
print("Generated Summary:")
print(generated_summary)

config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/920M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/358 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]



Generated Summary:
The term AI is often applied to machines that exhibit traits associated with human intelligence, such as learning.


In [11]:
import evaluate
from tqdm import tqdm
# 加载 XSum 测试集前 20 个样本
#test again
test_dataset = load_dataset("EdinburghNLP/xsum", split="test[:100]")

# ROUGE 计算器
rouge = evaluate.load("rouge")

# 生成摘要并收集结果
generated_summaries = []
reference_summaries = []

for sample in tqdm(test_dataset):
    article = sample["document"]
    reference_summary = sample["summary"]

    # 编码输入
    inputs = t(
        article,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
        padding="max_length"
    ).to(device)

    # 生成摘要
    summary_ids = m.generate(
        **inputs,
        max_length=400,
        num_beams=4,
        length_penalty=2.0,
        no_repeat_ngram_size=3,
        early_stopping=True
    )

    # 解码生成的摘要
    generated_summary = t.decode(summary_ids[0], skip_special_tokens=True)

    generated_summaries.append(generated_summary)
    reference_summaries.append(reference_summary)

# 计算 ROUGE 分数
results = rouge.compute(
    predictions=generated_summaries,
    references=reference_summaries,
    use_stemmer=True
)

# 打印结果
#still good result
for key in results:
    print(f"{key}: {results[key]*100:.2f}")

100%|██████████| 100/100 [00:31<00:00,  3.22it/s]


rouge1: 32.47
rouge2: 10.74
rougeL: 25.20
rougeLsum: 25.15
