In [63]:
# pip install transformers[torch]
# pip install datasets
# Hugging Face提供的开源大模型：https://huggingface.co/models
# 中文GPT2模型：https://huggingface.co/uer/gpt2-chinese-cluecorpussmall
# Hugging Face提供的transformers课程: https://huggingface.co/learn/llm-course/chapter0/1
import torch
import evaluate
import accelerate
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [67]:
# 情感分析
classifier = pipeline("sentiment-analysis")
classifier(
    ["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"]
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [69]:
# 文本生成
classifier = pipeline("text-generation")
classifier("好久没来了，想不到")

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "好久没来了，想不到啊。\n\nThe whole thing is a mess, but the results are great. The only problem is that they have been made up in a very short time, and yet they still hold the impression that the government is not serious about the reforms. But the point of the whole thing is that the reform has not yet reached any serious stage.\n\nWhat about the reforms that have already been made?\n\nThe government is still planning to introduce them, but in some cases it can't even make them work. So they are still looking at other things.\n\nThere is a big question about how many people will join the reforms, or whether they will even be able to join them.\n\nWe can only point out one thing. If they decide to take other steps, they will go into a different space, but the government will not just go into the new space, but will also put in the same efforts. As soon as the government starts to talk about the reforms, it will have a lot of problems to solve. So it's much too early to say 

In [70]:
# 问答
classifier = pipeline("question-answering")
classifier(
    question="Where do I work?",
    context="My name is Sylvain and I work at Hugging Face in Brooklyn",
)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


{'score': 0.6949769854545593, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}

In [72]:
classifier = pipeline("summarization")
classifier(
    """
    America has changed dramatically during recent years. Not only has the number of 
    graduates in traditional engineering disciplines such as mechanical, civil, 
    electrical, chemical, and aeronautical engineering declined, but in most of 
    the premier American universities engineering curricula now concentrate on 
    and encourage largely the study of engineering science. As a result, there 
    are declining offerings in engineering subjects dealing with infrastructure, 
    the environment, and related issues, and greater concentration on high 
    technology subjects, largely supporting increasingly complex scientific 
    developments. While the latter is important, it should not be at the expense 
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other 
    industrial countries in Europe and Asia, continue to encourage and advance 
    the teaching of engineering. Both China and India, respectively, graduate 
    six and eight times as many traditional engineers as does the United States. 
    Other industrial countries at minimum maintain their output, while America 
    suffers an increasingly serious decline in the number of engineering graduates 
    and a lack of well-educated engineers.
"""
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Device set to use cpu


[{'summary_text': ' America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil,    electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India continue to encourage and advance the teaching of engineering .'}]

In [73]:
# 其他pipeline:
# feature-extraction特征提取：把一段文字用一个向量来表示
# fill-mask填词：把一段文字的某些部分mask住，然后让模型填空
# ner命名实体识别：识别文字中出现的人名地名的命名实体
# translation翻译：把一种语言的文字翻译成另一种语言

In [74]:
# 加载CLUE benchmark中的tnews数据集
dataset = load_dataset("clue", "tnews")

In [75]:
# 该数据集来自今日头条的新闻版块，共提取了15个类别的新闻，包括旅游，教育，金融，军事等
# 数据量：训练集(53,360)，验证集(10,000)，测试集(10,000)
dataset["train"][0]

{'sentence': '上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？',
 'label': 7,
 'idx': 0}

In [76]:
# 预处理成模型可接受格式
data = [
    {"text": item["sentence"], "label": item["label"]} for item in dataset["train"]
]

In [48]:
# 分训练与验证集
train_data, val_data = train_test_split(data, test_size=0.2)

In [77]:
# 加载中文GPT2分词器
tokenizer = AutoTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")

In [132]:
# 分词示例
print(tokenizer(data[0]["text"]))
tokens = tokenizer.tokenize(data[0]["text"])
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
decoded_string = tokenizer.decode([2514, 3369, 4253, 1200, 2443, 1110, 3014])
print(decoded_string)

{'input_ids': [101, 677, 6440, 3198, 2110, 4495, 2797, 3322, 1510, 702, 679, 977, 8024, 5439, 2360, 671, 2584, 722, 678, 2828, 2797, 3322, 3035, 749, 8024, 2157, 7270, 2897, 1355, 4873, 6375, 5439, 2360, 6608, 8024, 1920, 2157, 2582, 720, 4692, 2521, 6821, 4905, 752, 8043, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['上', '课', '时', '学', '生', '手', '机', '响', '个', '不', '停', '，', '老', '师', '一', '怒', '之', '下', '把', '手', '机', '摔', '了', '，', '家', '长', '拿', '发', '票', '让', '老', '师', '赔', '，', '大', '家', '怎', '么', '看', '待', '这', '种', '事', '？']
[677, 6440, 3198, 2110, 4495, 2797, 3322, 1510, 702, 679, 977, 8024, 5439, 2360, 671, 2584, 722, 678, 2828, 2797, 3322, 3035, 749, 8024, 2157, 7270, 2897, 1355, 4873, 6375, 5439, 2360

In [78]:
# 自定义数据集类
class ChinesePairDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        # return_tensors="pt"用于指定返回PyTorch格式的张量
        inputs = self.tokenizer(text, padding="max_length", truncation=True,
                                max_length=self.max_length, return_tensors="pt")
        # 构建了一个新的字典，保留所有 key（比如 input_ids, attention_mask 等），但将所有值都从二维压成一维
        # tokenizer(text, return_tensors="pt")默认生成batch size=1的二维张量（[1, seq_len]）
        # 但在__getitem__中，每个样本通常是一维的张量（[seq_len]），由DataLoader自动再堆成 [batch_size, seq_len]
        # 所以我们用 squeeze(0)去掉batch维度，保持每条样本的输入结构正确
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = torch.tensor(self.data[idx]["label"])
        return inputs

In [79]:
# 构建 Dataset实例
train_dataset = ChinesePairDataset(train_data, tokenizer)
val_dataset = ChinesePairDataset(val_data, tokenizer)

In [80]:
# 加载中文GPT2模型并设置为15分类
model = AutoModelForSequenceClassification.from_pretrained("uer/gpt2-chinese-cluecorpussmall", num_labels=15)
# 将tokenizer中设置的pad_tokenID同步到模型的配置中，以便模型在训练或推理时知道该使用哪个token表示padding（填充）的位置
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at uer/gpt2-chinese-cluecorpussmall and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
# 模型结构
print(model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(21128, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=15, bias=False)
)


In [82]:
# 模型参数
print(model.state_dict())

OrderedDict({'transformer.wte.weight': tensor([[-0.0172,  0.0128, -0.0813,  ..., -0.0712, -0.0818,  0.0191],
        [-0.0280, -0.0265, -0.0598,  ..., -0.0218, -0.0455,  0.0115],
        [-0.0042, -0.0209, -0.0773,  ..., -0.0051, -0.0600,  0.0239],
        ...,
        [-0.0356, -0.0217, -0.0358,  ..., -0.0810, -0.0605,  0.0245],
        [-0.0339,  0.0234, -0.0510,  ..., -0.0368, -0.0434, -0.0210],
        [-0.0182,  0.0087, -0.0576,  ..., -0.0460, -0.0876,  0.0202]]), 'transformer.wpe.weight': tensor([[-0.0318, -0.0247,  0.0271,  ..., -0.0044,  0.0469, -0.0052],
        [ 0.0132, -0.0161, -0.0487,  ..., -0.0017,  0.0070, -0.0112],
        [ 0.0275, -0.0199, -0.0097,  ...,  0.0034,  0.0226, -0.0014],
        ...,
        [ 0.0086, -0.0161, -0.0113,  ..., -0.0462, -0.0149,  0.0419],
        [ 0.0062, -0.0117,  0.0067,  ...,  0.0183, -0.0375, -0.0020],
        [ 0.0119, -0.0192,  0.0028,  ..., -0.0126, -0.0504, -0.0137]]), 'transformer.h.0.ln_1.weight': tensor([0.2965, 0.3110, 0.3093, 0.

In [136]:
# 第一篇文章在模型微调前的类别预测概率
input0 = tokenizer.tokenize(data[0]["text"])
print(input0)
ids0 = torch.tensor([tokenizer.convert_tokens_to_ids(input0)])
print(ids0)
output0 = model(ids0)
print("Logits:", output0.logits)
predictions = torch.nn.functional.softmax(output0.logits, dim=-1)
print(predictions)

['上', '课', '时', '学', '生', '手', '机', '响', '个', '不', '停', '，', '老', '师', '一', '怒', '之', '下', '把', '手', '机', '摔', '了', '，', '家', '长', '拿', '发', '票', '让', '老', '师', '赔', '，', '大', '家', '怎', '么', '看', '待', '这', '种', '事', '？']
tensor([[ 677, 6440, 3198, 2110, 4495, 2797, 3322, 1510,  702,  679,  977, 8024,
         5439, 2360,  671, 2584,  722,  678, 2828, 2797, 3322, 3035,  749, 8024,
         2157, 7270, 2897, 1355, 4873, 6375, 5439, 2360, 6608, 8024, 1920, 2157,
         2582,  720, 4692, 2521, 6821, 4905,  752, 8043]])
Logits: tensor([[-0.0813, -0.2289, -0.4375,  0.3203, -0.2410,  0.1756, -0.5549, -0.1747,
         -0.1870,  0.0860,  0.4876, -0.2719, -0.6054, -0.5370,  0.2106]],
       grad_fn=<IndexBackward0>)
tensor([[0.0668, 0.0576, 0.0468, 0.0998, 0.0569, 0.0863, 0.0416, 0.0608, 0.0601,
         0.0789, 0.1179, 0.0552, 0.0395, 0.0423, 0.0894]],
       grad_fn=<SoftmaxBackward0>)


In [115]:
# 设置训练参数
training_args = TrainingArguments(
    # 训练过程中模型权重、配置文件、日志等输出保存的位置
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

In [116]:
# 准备评估指标
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [117]:
# 创建Trainer实例
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [118]:
# 开始训练
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [62]:
# 最终评估
results = trainer.evaluate()
print("验证集准确率:", results["eval_accuracy"])

验证集准确率: 0.13746251874062967


In [None]:
# 保存模型
model.save_pretrained(r"G:/GPT")