In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, PromptTuningConfig, PeftType

# Load a subset of Natural Questions: the first 5% of the train split.
# NOTE(review): the recorded run of this cell stopped with a ConnectionError
# (the Hub could not be reached through the proxy), so the statements after
# this call did not execute in that run.
dataset = load_dataset("natural_questions", split='train[:5%]')

# Load the Qwen-0.5B tokenizer and model.
model_name = "Qwen-0.5B"  # replace with your local model path
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize one batch of question/answer pairs for ``dataset.map(batched=True)``.

    Args:
        examples: A batch mapping. ``examples['question']`` must be an iterable
            of strings and ``examples['answers']`` an iterable of mappings whose
            ``'text'`` entry is a non-empty list of answer strings (only the
            first answer is used).
            NOTE(review): confirm the loaded dataset really exposes these
            columns in this shape.

    Returns:
        The tokenizer output for the questions, padded/truncated to 512 tokens,
        with an extra ``"labels"`` entry holding the token ids of each answer,
        also padded/truncated to 512 tokens. Padding positions in ``"labels"``
        are set to -100.
    """
    inputs = [q.strip() for q in examples['question']]
    targets = [a['text'][0].strip() for a in examples['answers']]

    # Plain Python lists are returned (no return_tensors): the result is stored
    # by `dataset.map`, and this keeps inputs and labels the same type.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding="max_length")

    # Replace padding positions with -100 (the default ignore index of PyTorch's
    # cross-entropy loss) so padding does not contribute to the loss. The
    # attention mask is used rather than the pad id because the pad token may
    # share its id with a real token such as end-of-text.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(labels["input_ids"], labels["attention_mask"])
    ]
    return model_inputs

# Run preprocess_function over the dataset in batches (batched=True hands the
# function a dict of column lists instead of a single example).
tokenized_datasets = dataset.map(preprocess_function, batched=True)


  from .autonotebook import tqdm as notebook_tqdm


ConnectionError: Couldn't reach 'natural_questions' on the Hub (ProxyError)

In [8]:
import json

# Read the JSON dataset, decoding the file explicitly as UTF-8.
with open('./data/SQuAD1.1/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Peek at the first two records to see their structure.
print(data[:2])

[{'id': 0, 'answer': 'Saint Bernadette Soubirous', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'context': 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.', 'p_phrase': ['Lourdes', 'France', '1858', 'the Virgin Mary', 'appeared'], 'n_phrase': ['Saint Bernadette Soubirous'], 'full answer': 'The Virgin Mary allegedly appeared in 1858 in Lourdes France to Saint Bernadette Soubirous.'}, {'id': 1, 'answer': 'a copper statue of Christ', 'question': 'What is in front of the Notre Dame Main Building?', 'context': 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".', 'p_phrase': ['the Main Building', 'Christ', 'front'], 'n_phrase': ['a copper statue', 'facing'], 'full answer': 'A copper statue of Christ is in front of the Notre Dame Main Building.'}]


In [1]:
from transformers import AutoModel, AutoTokenizer

# Load the model and tokenizer from a local path.
model_path = "./model/qwen0.5b"

# Load the Qwen model through AutoModel.
# AutoModel is the generic loading interface for Transformer models; here it
# loads the Qwen-0.5B checkpoint.
model = AutoModel.from_pretrained(model_path)

# Load the tokenizer through AutoTokenizer.
# AutoTokenizer handles text input: it converts text into the tensor format
# the model can process.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Try out the tokenizer.
text = "我喜欢小狗"  # example input text

# Convert the text into model inputs (tensors): token ids plus attention mask.
inputs = tokenizer(text, return_tensors="pt")

# Run a forward pass through the model.
# The recorded output is a BaseModelOutputWithPast carrying last_hidden_state
# and past_key_values.
# NOTE(review): the recorded tensors carry a grad_fn, i.e. autograd was enabled
# for this call — consider disabling gradients if this is inference only.
outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

# Print the complete model output (hidden states and the cached key/value tensors).
print(outputs)

  from .autonotebook import tqdm as notebook_tqdm


BaseModelOutputWithPast(last_hidden_state=tensor([[[ 5.9717,  1.8740,  4.2277,  ..., -0.1168, -1.7965, -0.2775],
         [-2.5296, -0.5739, -6.1173,  ..., -6.6078, -5.1615, -2.7894]]],
       grad_fn=<MulBackward0>), past_key_values=((tensor([[[[-1.0833e+00, -2.4329e-01, -2.5547e-01,  ..., -2.2154e+00,
            2.2616e+00, -4.3567e+00],
          [-1.2909e+00, -3.6945e-02, -1.7132e-01,  ..., -2.2704e+00,
            2.2627e+00, -4.3512e+00]],

         [[-3.1599e+01,  2.4552e+01,  1.4655e+01,  ..., -2.2617e+01,
           -1.2975e+01, -1.4119e+01],
          [ 1.3911e+01,  2.9846e+01,  1.7154e+01,  ..., -2.2602e+01,
           -1.3038e+01, -1.4156e+01]],

         [[ 8.4961e+00,  1.2535e+01, -6.8833e-02,  ..., -1.2001e+00,
           -4.7996e+00, -8.3774e-01],
          [ 4.0529e+00,  8.6046e+00, -7.6709e-02,  ..., -1.1763e+00,
           -4.8314e+00, -8.0779e-01]],

         ...,

         [[ 5.0931e-02, -1.6246e-02,  8.1112e-02,  ..., -1.2140e+01,
           -1.1836e+01, -1.0581e

In [2]:
from peft import LoraConfig, get_peft_model

# LoRA fine-tuning configuration.
lora_config = LoraConfig(
    r=16,  # rank of the LoRA matrices (the printed lora_A/lora_B layers have an inner size of 16)
    lora_alpha=32,  # LoRA alpha: a scaling factor for the adapter update, not a learning rate — TODO confirm exact scaling in the PEFT docs
    target_modules=["q_proj", "v_proj"],  # layers that receive LoRA adapters
    lora_dropout=0.05,  # dropout probability on the LoRA branch
    bias="none",  # NOTE(review): presumably "do not train bias terms"; the printed base layers still have bias=True, so biases are not removed
    task_type="CAUSAL_LM"  # task type: causal language modelling
)

# Apply LoRA to the model.
# NOTE(review): the printed structure shows PeftModelForCausalLM wrapping a
# bare Qwen2Model — confirm whether a model with a causal-LM head is required
# before training or generating with this wrapper.
lora_model = get_peft_model(model, lora_config)

# Print the structure of the LoRA-wrapped model (no training has run at this point).
print(lora_model)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151936, 1024)
      (layers): ModuleList(
        (0-23): 24 x Qwen2DecoderLayer(
          (self_attn): Qwen2SdpaAttention(
            (q_proj): lora.Linear(
              (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
              (lora_dropout): ModuleDict(
                (default): Dropout(p=0.05, inplace=False)
              )
              (lora_A): ModuleDict(
                (default): Linear(in_features=1024, out_features=16, bias=False)
              )
              (lora_B): ModuleDict(
                (default): Linear(in_features=16, out_features=1024, bias=False)
              )
              (lora_embedding_A): ParameterDict()
              (lora_embedding_B): ParameterDict()
              (lora_magnitude_vector): ModuleDict()
            )
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): lora

In [3]:
import json
from torch.utils.data import Dataset, DataLoader

# Read the custom JSON question-answering dataset.
with open('./data/SQuAD1.1/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show the first two samples to inspect the record structure.
print(data[:2])

# Custom Dataset class used for PyTorch data loading
class QADataset(Dataset):
    """Map-style dataset that turns QA records into fixed-length token tensors.

    Every record must provide ``"context"``, ``"question"`` and ``"answer"``
    strings.

    Args:
        data: Indexable collection of QA records.
        tokenizer: Callable tokenizer accepting ``(text, text_pair)`` plus the
            ``return_tensors``, ``padding``, ``truncation`` and ``max_length``
            keyword arguments, and returning ``"input_ids"`` and
            ``"attention_mask"`` tensors with a leading batch dimension of 1.
        max_length: Length every returned tensor is padded/truncated to.
            Without an explicit value ``padding="max_length"`` falls back to the
            tokenizer's own maximum, which can be far larger than intended.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of QA records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one record.

        All three are 1-D tensors of length ``max_length``, so the default
        DataLoader collation can stack them into a batch. ``labels`` holds the
        answer token ids with padding positions set to -100.

        NOTE(review): ``labels`` contains only the answer tokens; a causal-LM
        loss expects labels aligned position by position with ``input_ids`` —
        confirm the intended training objective.
        """
        sample = self.data[idx]
        inputs = self.tokenizer(
            sample["context"],
            sample["question"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # Pad/truncate the answer as well: variable-length label tensors cannot
        # be stacked by the default collate function.
        answer = self.tokenizer(
            sample["answer"],
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # -100 is the default ignore index of PyTorch's cross-entropy loss.
        labels = answer["input_ids"].masked_fill(answer["attention_mask"] == 0, -100)
        return {
            # squeeze(0) drops only the batch dimension added by return_tensors="pt".
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels.squeeze(0),
        }

# Build the dataset and a DataLoader yielding shuffled batches of 8 samples.
dataset = QADataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


[{'id': 0, 'answer': 'Saint Bernadette Soubirous', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'context': 'It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858.', 'p_phrase': ['Lourdes', 'France', '1858', 'the Virgin Mary', 'appeared'], 'n_phrase': ['Saint Bernadette Soubirous'], 'full answer': 'The Virgin Mary allegedly appeared in 1858 in Lourdes France to Saint Bernadette Soubirous.'}, {'id': 1, 'answer': 'a copper statue of Christ', 'question': 'What is in front of the Notre Dame Main Building?', 'context': 'Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes".', 'p_phrase': ['the Main Building', 'Christ', 'front'], 'n_phrase': ['a copper statue', 'facing'], 'full answer': 'A copper statue of Christ is in front of the Notre Dame Main Building.'}]


In [5]:
from torch.utils.data import DataLoader, Dataset

# Custom Dataset class that converts the records into PyTorch tensors
class SQuADDataset(Dataset):
    """Map-style dataset over QA records with ``"question"``, ``"context"`` and ``"answer"``.

    Args:
        data: Indexable collection of QA records.
        tokenizer: Callable tokenizer accepting ``(text, text_pair)`` plus the
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``
            keyword arguments, and returning a mapping of tensors (including
            ``"input_ids"`` and ``"attention_mask"``) with a leading batch
            dimension of 1.
        max_length: Length every returned tensor is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of QA records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded question/context pair plus ``labels`` for one record.

        Every value is a 1-D tensor of length ``max_length``. ``labels`` holds
        the answer token ids with padding positions set to -100.

        NOTE(review): ``labels`` contains only the answer tokens; confirm this
        matches the loss the training loop applies.
        """
        item = self.data[idx]
        encoding = self.tokenizer(
            item["question"],
            item["context"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The answer is text, so it has to be tokenized; building an integer
        # tensor straight from the string raises.
        answer = self.tokenizer(
            item["answer"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the batch dimension added by return_tensors="pt"; otherwise the
        # DataLoader would produce batches with an extra axis of size 1.
        sample = {key: value.squeeze(0) for key, value in encoding.items()}
        # -100 is the default ignore index of PyTorch's cross-entropy loss.
        sample["labels"] = (
            answer["input_ids"].masked_fill(answer["attention_mask"] == 0, -100).squeeze(0)
        )
        return sample

# Instantiate the dataset and a DataLoader yielding shuffled batches of 8 samples.
# NOTE(review): this rebinds `dataset` and `dataloader`, which an earlier cell
# already assigned from QADataset — confirm which pair later cells should use.
dataset = SQuADDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [7]:
# Print the qualified name of every module in the model, one name per line.
# In the recorded output the names include lora_A / lora_B submodules, i.e.
# `model` already carried the LoRA adapters when this cell ran.
print(*(name for name, _ in model.named_modules()), sep="\n")



embed_tokens
layers
layers.0
layers.0.self_attn
layers.0.self_attn.q_proj
layers.0.self_attn.q_proj.base_layer
layers.0.self_attn.q_proj.lora_dropout
layers.0.self_attn.q_proj.lora_dropout.default
layers.0.self_attn.q_proj.lora_A
layers.0.self_attn.q_proj.lora_A.default
layers.0.self_attn.q_proj.lora_B
layers.0.self_attn.q_proj.lora_B.default
layers.0.self_attn.q_proj.lora_embedding_A
layers.0.self_attn.q_proj.lora_embedding_B
layers.0.self_attn.q_proj.lora_magnitude_vector
layers.0.self_attn.k_proj
layers.0.self_attn.v_proj
layers.0.self_attn.v_proj.base_layer
layers.0.self_attn.v_proj.lora_dropout
layers.0.self_attn.v_proj.lora_dropout.default
layers.0.self_attn.v_proj.lora_A
layers.0.self_attn.v_proj.lora_A.default
layers.0.self_attn.v_proj.lora_B
layers.0.self_attn.v_proj.lora_B.default
layers.0.self_attn.v_proj.lora_embedding_A
layers.0.self_attn.v_proj.lora_embedding_B
layers.0.self_attn.v_proj.lora_magnitude_vector
layers.0.self_attn.o_proj
layers.0.self_attn.rotary_emb
layers.

In [9]:
from transformers import AutoModelForCausalLM

# Load the same local checkpoint again, this time through AutoModelForCausalLM;
# the printed structure is a Qwen2ForCausalLM wrapping a Qwen2Model.
# NOTE(review): this rebinds `model`. `lora_model` from the earlier LoRA cell
# was built from the previous `model` object and is not affected by this
# reload — confirm which object later cells are meant to use.
model_path = "./model/qwen0.5b"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto", device_map="auto")
# Print the module tree of the loaded model.
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): 