## 加载数据集、微调并保存模型

In [3]:
# 导入必要的库
from datasets import load_dataset, DatasetDict, Audio
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor, AutoModelForSpeechSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftModel ,PeftConfig
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Global parameters
base_path = "/root/dataDisk/hf/hub"  # root of the local Hugging Face hub copy
model_name_or_path = f"{base_path}/models/whisper-large-v2"  # base checkpoint to fine-tune
model_dir = "models/whisper-large-v2-asr-int8"  # where the trained output is written
language = "Chinese (China)"  # language name passed to the tokenizer/processor
language_abbr = "zh-CN"  # dataset configuration name
task = "transcribe"
dataset_path = f"{base_path}/datasets/common_voice_11_0/"  # trailing slash is relied on below
batch_size = 64  # used for both the train and eval per-device batch size

# Load the "train" and "validation" splits of the local Common Voice copy.
# Both splits use the same loading script, configuration and cache directory.
common_voice = DatasetDict(
    {
        split: load_dataset(
            dataset_path + "common_voice_11_0.py",
            language_abbr,
            split=split,
            cache_dir=dataset_path,
            trust_remote_code=True,
        )
        for split in ("train", "validation")
    }
)

print("数据集加载完成")  # progress message: "dataset loaded"

# Preprocessing setup: load the feature extractor, tokenizer and processor
# from the base checkpoint. The tokenizer and processor are configured with
# the target language and task.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)
processor = AutoProcessor.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)

# Drop the metadata columns that training does not use, then cast the audio
# column to Audio(sampling_rate=16000).
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
).cast_column("audio", Audio(sampling_rate=16000))

print("数据预处理完成")  # progress message: "preprocessing setup done"

# Per-example preprocessing function
def prepare_dataset(batch):
    """Add model inputs and label ids to one dataset record.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``, then stores:

    * ``batch["input_features"]`` -- the first entry of the feature
      extractor's ``input_features`` for the audio array;
    * ``batch["labels"]`` -- the tokenizer's ``input_ids`` for the sentence.

    Relies on the module-level ``feature_extractor`` and ``tokenizer``.
    The record is modified in place and returned.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    encoded_sentence = tokenizer(batch["sentence"])
    batch["labels"] = encoded_sentence.input_ids
    return batch

# Apply the preprocessing function to the dataset, using 2 worker processes
# (num_proc=2). The result is kept under a new name; common_voice is not rebound.
tokenized_common_voice = common_voice.map(prepare_dataset,num_proc=2)

print("数据预处理应用完成")  # progress message: "preprocessing applied"

# Data collator definition
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed speech examples into one padded batch.

    ``processor`` must expose ``feature_extractor`` and ``tokenizer``
    attributes, each with a ``pad`` method; the tokenizer must also expose
    ``bos_token_id``.
    """

    # Object bundling the feature extractor and the tokenizer
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio features and labels separately and merge them.

        Args:
            features: examples carrying ``"input_features"`` and ``"labels"``.

        Returns:
            The padded feature batch with an added ``"labels"`` tensor in
            which padded positions are replaced by -100.
        """
        extractor = self.processor.feature_extractor
        text_tokenizer = self.processor.tokenizer

        # Audio inputs and label ids are padded by different components.
        audio_items = [{"input_features": example["input_features"]} for example in features]
        batch = extractor.pad(audio_items, return_tensors="pt")

        label_items = [{"input_ids": example["labels"]} for example in features]
        padded_labels = text_tokenizer.pad(label_items, return_tensors="pt")

        # Positions where attention_mask != 1 are padding; overwrite them with
        # -100 (presumably so the loss skips them -- confirm against the model).
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading token only when every row starts with the BOS id.
        # NOTE(review): confirm that bos_token_id is the token that actually
        # leads the labels for this tokenizer; otherwise this never fires.
        every_row_starts_with_bos = (labels[:, 0] == text_tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Build the collator around the processor loaded earlier in this cell
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

print("数据整理器实例化完成")  # progress message: "data collator instantiated"

# Load and configure the model: the base checkpoint is loaded with 8-bit
# weights (load_in_8bit=True) and device placement is left to the library
# (device_map="auto").
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")
# PEFT helper that prepares the 8-bit model for training.
# NOTE(review): newer PEFT releases expose this as prepare_model_for_kbit_training --
# confirm against the installed version before upgrading.
model = prepare_model_for_int8_training(model)

print("模型加载和配置完成")  # progress message: "model loaded and configured"

# LoRA configuration: adapters of rank 4 (r=4) with lora_alpha=64 and 5%
# dropout, attached to the modules named "q_proj" and "v_proj".
# bias="none" presumably leaves bias terms untrained -- confirm in the PEFT docs.
config = LoraConfig(
    r=4, lora_alpha=64, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none"
)
# Wrap the prepared base model with the LoRA adapters
peft_model = get_peft_model(model, config)

print("LoRA配置完成")  # progress message: "LoRA configured"

# Training setup
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir, 
    per_device_train_batch_size=batch_size, 
    learning_rate=1e-3,
    num_train_epochs=1, 
    evaluation_strategy="steps",  # evaluate on a step schedule
    eval_steps=50,  # run an evaluation every 50 steps
    per_device_eval_batch_size=batch_size,
    generation_max_length=128, 
    logging_steps=10,  # log every 10 steps; lower this value to log more often
    remove_unused_columns=False, 
    label_names=["labels"]
)

# NOTE(review): the feature extractor, not the text tokenizer, is passed as
# `tokenizer` -- presumably so it is saved with the model; confirm this is intended.
trainer = Seq2SeqTrainer(
    args=training_args, 
    model=peft_model, 
    train_dataset=tokenized_common_voice["train"],
    eval_dataset=tokenized_common_voice["validation"], 
    data_collator=data_collator, 
    tokenizer=processor.feature_extractor
)
# Turn off use_cache on the model config before training starts
peft_model.config.use_cache = False
trainer.train()
# Save the trained model to the same directory used as output_dir
trainer.save_model(model_dir)

print("模型训练和保存完成")  # progress message: "training and saving done"

# Switch to evaluation mode; as the cell's last expression this also displays
# the model structure in the notebook output.
peft_model.eval()

数据集加载完成


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


数据预处理完成
数据预处理应用完成
数据整理器实例化完成


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


模型加载和配置完成
LoRA配置完成




Step,Training Loss,Validation Loss
50,0.4158,0.44473
100,0.35,0.425002
150,0.3073,0.415866
200,0.3751,0.409702
250,0.3604,0.402729
300,0.3281,0.397587
350,0.3213,0.391273
400,0.2941,0.387852
450,0.3261,0.385176


模型训练和保存完成


PeftModel(
  (base_model): LoraModel(
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(80, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear8bitLt(in_features=1280, out_features=1280, bias=False)
                (v_proj): lora.Linear8bitLt(
                  (base_layer): Linear8bitLt(in_features=1280, out_features=1280, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1280, out_features=4, bias=False)
                  )
            

## 推理与识别

In [2]:
# Inference (the notebook kernel may need a restart first to free memory).
# After a restart nothing from the training cell exists any more, so this cell
# imports and declares everything it uses rather than relying on earlier state.
import torch
from peft import PeftConfig, PeftModel
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    AutomaticSpeechRecognitionPipeline,
)

# Keep these three values in sync with the training cell.
model_dir = "models/whisper-large-v2-asr-int8"
language = "Chinese (China)"
task = "transcribe"

# The saved adapter config records which base checkpoint it was trained on;
# load that base model in 8-bit and attach the adapter saved in model_dir.
peft_config = PeftConfig.from_pretrained(model_dir)
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(peft_config.base_model_name_or_path, load_in_8bit=True, device_map="auto")
peft_model = PeftModel.from_pretrained(base_model, model_dir)
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

# Speech recognition pipeline
test_audio = "data/audio/test_zh.flac"
pipeline = AutomaticSpeechRecognitionPipeline(model=peft_model, tokenizer=tokenizer, feature_extractor=feature_extractor)
# Run the transcription under CUDA autocast (mixed precision)
with torch.cuda.amp.autocast():
    text = pipeline(test_audio, max_new_tokens=255)["text"]
print(text)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


这是一段测试用于WhisperLarge V2模型的自动语音识别测试。
