In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, evaluate
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency confl

# 初始化学生模型

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DebertaV2Config

def create_student_from_teacher(teacher_model):
    # 创建学生配置
    teacher_config = teacher_model.config
    student_config = DebertaV2Config.from_dict(teacher_config.to_dict())
    student_config.num_hidden_layers = teacher_config.num_hidden_layers // 2

    # 创建学生模型
    student_model = type(teacher_model)(student_config)

    # 复制 embedding 层
    student_model.deberta.embeddings.load_state_dict(
        teacher_model.deberta.embeddings.state_dict()
    )

    # 复制 pooler
    if (
        hasattr(teacher_model.deberta, "pooler")
        and teacher_model.deberta.pooler is not None
        and student_model.deberta.pooler is not None
    ):
        student_model.deberta.pooler.load_state_dict(teacher_model.deberta.pooler.state_dict())

    # 复制 encoder 的前 N 层
    num_student_layers = student_config.num_hidden_layers
    for i in range(num_student_layers):
        student_model.deberta.encoder.layer[i].load_state_dict(
            teacher_model.deberta.encoder.layer[i].state_dict()
        )

    # 复制分类头
    student_model.classifier.load_state_dict(teacher_model.classifier.state_dict())

    return student_model

# 使用
teacher = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/deberta-v3-base-teacher")
student = create_student_from_teacher(teacher)

# 保存
output_dir = "deberta-v3-base-student"
student.save_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/deberta-v3-base-teacher")
tokenizer.save_pretrained(output_dir)

2025-11-26 09:24:17.894152: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764149058.048838      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764149058.092153      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

('deberta-v3-base-student/tokenizer_config.json',
 'deberta-v3-base-student/special_tokens_map.json',
 'deberta-v3-base-student/spm.model',
 'deberta-v3-base-student/added_tokens.json',
 'deberta-v3-base-student/tokenizer.json')

# 加载数据

In [3]:
import pandas as pd
import datasets
from sklearn.model_selection import train_test_split
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/deberta-v3-base-teacher")
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, max_length=512)

# 加载数据
train = pd.read_csv("/kaggle/input/corpus-imdb/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("/kaggle/input/corpus-imdb/testData.tsv", header=0, delimiter="\t", quoting=3)
train, val = train_test_split(train, test_size=.2)
    
train_dict = {'labels': train["sentiment"], 'text': train['review']}
val_dict = {'labels': val["sentiment"], 'text': val['review']}
test_dict = {"text": test['review']}

train_dataset = datasets.Dataset.from_dict(train_dict)
val_dataset = datasets.Dataset.from_dict(val_dict)
test_dataset = datasets.Dataset.from_dict(test_dict)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

# 蒸馏训练

In [4]:
from transformers import Trainer, TrainingArguments

class DistillationTrainingArguments(TrainingArguments): 
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs): 
        super().__init__(*args, **kwargs) 
        self.alpha = alpha 
        self.temperature = temperature 
        
class DistillationTrainer(Trainer): 
    def __init__(self, *args, teacher_model=None, **kwargs): 
        super().__init__(*args, **kwargs) 
        self.teacher = teacher_model 
        # place teacher on same device as student
        self._move_model_to_device(self.teacher, self.model.device) 
        self.teacher.eval() 
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None) : 
        labels = inputs.pop("labels")
        # compute student output
        outputs_student = model(**inputs)

        # compute teacher output
        with torch.no_grad(): 
            outputs_teacher = self.teacher(**inputs) 
            
        # assert size
        assert outputs_student.logits.size() == outputs_teacher.logits.size()
        
        # 计算硬标签损失
        student_loss = F.cross_entropy(outputs_student.logits, labels)
        
        # Soften probabilities and compute distillation loss
        # 计算 KL 散度
        loss_function = nn.KLDivLoss(reduction="batchmean") 
        
        # 计算学生模型和教师网络数值输出的交叉熵损失
        loss_logits = (loss_function(
            # logits首先除以 temperature, 以增加对错误分类的关注
            F.log_softmax(outputs_student.logits / self.args.temperature, dim=-1), 
            F.softmax(outputs_teacher.logits / self.args.temperature, dim=-1)) * (self.args.temperature ** 2)) 
        
        # Return weighted student loss
        loss = self.args.alpha * student_loss + (1. - self.args.alpha) * loss_logits 
        return (loss, outputs_student) if return_outputs else loss


# 加载模型

In [5]:
student_id = "/kaggle/working/deberta-v3-base-student" 
teacher_id = "/kaggle/input/deberta-v3-base-teacher" 
    
# load model
teacher_model = AutoModelForSequenceClassification.from_pretrained( 
    teacher_id, 
) 
# define student model
student_model = AutoModelForSequenceClassification.from_pretrained( 
    student_id, 
) 

if hasattr(student_model, "gradient_checkpointing_disable"):
    student_model.gradient_checkpointing_disable()
else:
    student_model.config.gradient_checkpointing = False

# 模型训练

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

os.makedirs('/kaggle/working/checkpoint', exist_ok=True)
os.makedirs('/kaggle/working/logs', exist_ok=True)
training_args = DistillationTrainingArguments( 
    output_dir="/kaggle/working/checkpoint", 
    num_train_epochs=3, 
    per_device_train_batch_size=2, 
    per_device_eval_batch_size=4, 
    gradient_accumulation_steps=8, 
    gradient_checkpointing=False, 
    warmup_steps=500,  
    # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  
    # strength of weight decay
    logging_dir='/kaggle/working/logs',  
    # directory for storing logs
    logging_steps=100, 
    save_strategy="no", 
    eval_strategy="epoch", 
    # distilation parameters
    alpha=0.5, 
    temperature=4.0,
    report_to="none"
) 
     
trainer = DistillationTrainer( 
    model=student_model, 
    args=training_args, 
    teacher_model=teacher_model, 
    train_dataset=tokenized_train,  # training dataset
    eval_dataset=tokenized_val,  # evaluation dataset
    processing_class=tokenizer, 
    data_collator=data_collator, 
    compute_metrics=compute_metrics, 
) 
trainer.train() 

Downloading builder script: 0.00B [00:00, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,1.8586,1.695111,0.8724
2,1.3163,1.467028,0.891
3,0.9648,1.559935,0.8902


TrainOutput(global_step=1875, training_loss=1.7555746053059895, metrics={'train_runtime': 7814.2969, 'train_samples_per_second': 7.678, 'train_steps_per_second': 0.24, 'total_flos': 6591775296477504.0, 'train_loss': 1.7555746053059895, 'epoch': 3.0})

In [7]:
prediction_outputs = trainer.predict(tokenized_test) 
test_pred = np.argmax(prediction_outputs[0], axis=-1).flatten() 
print(test_pred) 
result_output = pd.DataFrame(data={"id": test["id"], "sentiment": test_pred}) 
result_output.to_csv("/kaggle/working/deberta_base_student.csv", index=False, quoting=3) 

[1 0 0 ... 0 1 0]
