Sat Jun 29 11:15:18 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|=========================================+======================+======================|
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:15:00.0 Off |                  Off |
| 30%   54C    P2             330W / 450W |  24062MiB / 24564MiB |     99%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:18:00.0 Off |                  Off |
| 30%   34C    P2              56W / 450W |  14394MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                                         
+---------------------------------------------------------------------------------------+
| Processes:                                                                            |
|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |
|        ID   ID                                                             Usage      |
|=======================================================================================|
+---------------------------------------------------------------------------------------+

In [None]:
# 导入必要的库
from datasets import load_dataset, DatasetDict, Audio
from transformers import AutoFeatureExtractor, AutoTokenizer, AutoProcessor, AutoModelForSpeechSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftModel ,PeftConfig
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# Global configuration: local checkpoint/dataset locations, target language/task, batch size.
base_path = "/root/dataDisk/hf/hub"
model_name_or_path = f"{base_path}/models/whisper-large-v2"  # base checkpoint to fine-tune
model_dir = "models/whisper-large-v2-asr-int8"  # where the trained adapter is written
language, language_abbr = "Chinese (China)", "zh-CN"
task = "transcribe"
dataset_path = f"{base_path}/datasets/common_voice_11_0/"  # trailing slash: a file name is appended later
batch_size = 64

# Load the train and validation splits through the local dataset loading script.
# Both splits use the same script, language configuration and cache directory.
common_voice = DatasetDict(
    {
        split: load_dataset(
            dataset_path + "common_voice_11_0.py",
            language_abbr,
            split=split,
            cache_dir=dataset_path,
            trust_remote_code=True,
        )
        for split in ("train", "validation")
    }
)

print("数据集加载完成")

# Data preprocessing: load the feature extractor, tokenizer and processor
# from the same local checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
print("特征提取器加载完成")
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)
print("分词器加载完成")
processor = AutoProcessor.from_pretrained(
    model_name_or_path,
    language=language,
    task=task,
)
print("处理器加载完成")

# Drop the columns that are not needed for training.
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
)
# Cast the audio column to a 16 kHz sampling rate.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

print("数据预处理完成")

# Per-example preprocessing function.
def prepare_dataset(batch):
    """Add model inputs and label ids to a single dataset example.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["sentence"]``; writes ``batch["input_features"]`` and
    ``batch["labels"]``. Relies on the module-level ``feature_extractor`` and
    ``tokenizer``.

    Returns:
        The same ``batch`` object with the two new keys set.
    """
    clip = batch["audio"]
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    # The extractor returns a batch of features; keep the first (only) entry.
    batch["input_features"] = extracted.input_features[0]

    encoded_text = tokenizer(batch["sentence"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Apply the preprocessing function to every split, using two worker processes.
tokenized_common_voice = common_voice.map(
    function=prepare_dataset,
    num_proc=2,
)

print("数据预处理应用完成")

# Data collator definition.
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into a single padded batch.

    Each example must provide ``"input_features"`` and ``"labels"``. Audio
    features are padded with ``processor.feature_extractor.pad`` and label ids
    with ``processor.tokenizer.pad``; both are requested as PyTorch tensors.
    """

    # Object exposing `.feature_extractor.pad(...)`, `.tokenizer.pad(...)`
    # and `.tokenizer.bos_token_id`.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Return the padded batch, with the label tensor stored under ``"labels"``."""
        audio_items = []
        text_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            text_items.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_text = self.processor.tokenizer.pad(text_items, return_tensors="pt")

        # Every position whose attention mask is not 1 (i.e. padding) is set
        # to -100 — presumably so the loss ignores it.
        is_padding = padded_text.attention_mask.ne(1)
        labels = padded_text["input_ids"].masked_fill(is_padding, -100)

        # When every sequence starts with the BOS token, drop that first column.
        starts_with_bos = labels[:, 0] == self.processor.tokenizer.bos_token_id
        if starts_with_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Build the collator around the processor loaded above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor)

print("数据整理器实例化完成")

# Load and configure the model.
# The base checkpoint is loaded with 8-bit weights; device placement is left to device_map="auto".
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name_or_path, load_in_8bit=True, device_map="auto")

# `prepare_model_for_int8_training` is deprecated (PEFT emits a FutureWarning
# naming `prepare_model_for_kbit_training` as the replacement). The replacement
# is imported locally so this step does not depend on the top-of-file import list.
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)


def _make_inputs_require_grad(module, _inputs, output):
    """Forward hook: mark the module's output as requiring gradients."""
    output.requires_grad_(True)


# The encoder starts with convolution layers whose output does not require
# grad, which makes gradient checkpointing warn "None of the inputs have
# requires_grad=True. Gradients will be None". Marking the first convolution's
# output as requiring grad lets gradients flow through the checkpointed encoder.
model.model.encoder.conv1.register_forward_hook(_make_inputs_require_grad)

print("模型加载和配置完成")

# LoRA configuration: rank-4 adapters on the attention query/value projections.
config = LoraConfig(
    r=4,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)
# Wrap the prepared base model with the adapters described by `config`.
peft_model = get_peft_model(model, config)

print("LoRA配置完成")

# Custom callback class.
class TrainEvalCallback(TrainerCallback):
    """Print the step number with the training and validation loss on every log event."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print ``loss`` and ``eval_loss`` from ``logs`` for the current global step.

        A key that is absent from ``logs`` is printed as ``None``.
        """
        # `logs` defaults to None; fall back to an empty dict so the lookups
        # below cannot raise AttributeError.
        logs = logs or {}
        # Print the training and validation loss.
        print(f"Step: {state.global_step}, Training Loss: {logs.get('loss', None)}, Validation Loss: {logs.get('eval_loss', None)}")

# Training setup: one epoch, evaluation at the end of the epoch, a log entry every 10 steps.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    per_device_train_batch_size=batch_size,
    learning_rate=1e-3,
    num_train_epochs=1,
    evaluation_strategy="epoch",
    per_device_eval_batch_size=batch_size,
    generation_max_length=128,
    logging_steps=10,
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=peft_model,
    train_dataset=tokenized_common_voice["train"],
    eval_dataset=tokenized_common_voice["validation"],
    data_collator=data_collator,
    tokenizer=processor.feature_extractor,
    callbacks=[TrainEvalCallback],  # register the custom logging callback
)
# Turn the generation cache off on the model config before training starts.
peft_model.config.use_cache = False
trainer.train()
trainer.save_model(model_dir)

print("模型训练和保存完成")

# Release cached GPU memory.
# NOTE(review): this only returns unused cached blocks; objects that are still
# referenced (e.g. the training model and trainer) keep their memory — confirm
# whether they should be deleted first.
torch.cuda.empty_cache()

# Device that the inference inputs are moved to.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Reload the base model in 8-bit and attach the saved LoRA adapter.
peft_config = LoraConfig.from_pretrained(model_dir)
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    peft_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)
peft_model = PeftModel.from_pretrained(base_model, model_dir)
# No `.to(device)` here: the 8-bit model has already been placed by
# device_map="auto". Moving it by hand is unsupported for 8-bit models and can
# leave the weights on a different device than the one the dispatch hooks send
# inputs to.
peft_model.eval()

tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
processor = AutoProcessor.from_pretrained(peft_config.base_model_name_or_path, language=language, task=task)
feature_extractor = processor.feature_extractor

# Speech-recognition pipeline.
test_audio = "data/audio/test_zh.flac"
from transformers import AutomaticSpeechRecognitionPipeline

# No explicit `device=`: the model was loaded with device_map="auto", and
# recent transformers releases raise ValueError when a pipeline is given both
# an accelerate-dispatched model and a device. The pipeline takes its device
# from the model's own placement instead.
pipeline = AutomaticSpeechRecognitionPipeline(
    model=peft_model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

# Read the audio file at a 16 kHz sampling rate.
import librosa
audio, rate = librosa.load(test_audio, sr=16000)

# Run inference: extract features, generate token ids, decode to text.
inputs = feature_extractor(audio, sampling_rate=rate, return_tensors="pt").to(device)
with torch.no_grad():  # gradients are not needed for generation
    generated_ids = peft_model.generate(inputs=inputs["input_features"])
transcription = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

print(transcription)

  from .autonotebook import tqdm as notebook_tqdm


数据集加载完成
特征提取器加载完成


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


分词器加载完成
处理器加载完成
数据预处理完成
数据预处理应用完成
数据整理器实例化完成




模型加载和配置完成
LoRA配置完成


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


Step: 10, Training Loss: 1.4941, Validation Loss: None
Step: 20, Training Loss: 0.5236, Validation Loss: None
Step: 30, Training Loss: 0.4354, Validation Loss: None
Step: 40, Training Loss: 0.4068, Validation Loss: None
Step: 50, Training Loss: 0.4098, Validation Loss: None
Step: 60, Training Loss: 0.3706, Validation Loss: None
Step: 70, Training Loss: 0.4023, Validation Loss: None
Step: 80, Training Loss: 0.3496, Validation Loss: None
Step: 90, Training Loss: 0.4164, Validation Loss: None
Step: 100, Training Loss: 0.35, Validation Loss: None
Step: 110, Training Loss: 0.319, Validation Loss: None
Step: 120, Training Loss: 0.3467, Validation Loss: None
Step: 130, Training Loss: 0.3424, Validation Loss: None
Step: 140, Training Loss: 0.3512, Validation Loss: None
Step: 150, Training Loss: 0.3087, Validation Loss: None
Step: 160, Training Loss: 0.3456, Validation Loss: None
Step: 170, Training Loss: 0.3796, Validation Loss: None
Step: 180, Training Loss: 0.3303, Validation Loss: None
Step

### 运行结果
```PLAINTEXT
数据集加载完成
特征提取器加载完成
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
分词器加载完成
处理器加载完成
数据预处理完成
数据预处理应用完成
数据整理器实例化完成
/root/.local/lib/python3.10/site-packages/peft/utils/other.py:141: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.
  warnings.warn(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
模型加载和配置完成
LoRA配置完成
/root/.local/lib/python3.10/site-packages/torch/utils/checkpoint.py:464: UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.4 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
/root/.local/lib/python3.10/site-packages/torch/utils/checkpoint.py:91: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
  warnings.warn(
 [454/454 2:42:35, Epoch 1/1]
Epoch	Training Loss	Validation Loss
1	0.326500	0.385433
Step: 10, Training Loss: 1.4963, Validation Loss: None
Step: 20, Training Loss: 0.474, Validation Loss: None
Step: 30, Training Loss: 0.4387, Validation Loss: None
Step: 40, Training Loss: 0.4131, Validation Loss: None
Step: 50, Training Loss: 0.4139, Validation Loss: None
Step: 60, Training Loss: 0.3713, Validation Loss: None
Step: 70, Training Loss: 0.4005, Validation Loss: None
Step: 80, Training Loss: 0.3485, Validation Loss: None
Step: 90, Training Loss: 0.4171, Validation Loss: None
Step: 100, Training Loss: 0.3476, Validation Loss: None
Step: 110, Training Loss: 0.3189, Validation Loss: None
Step: 120, Training Loss: 0.3429, Validation Loss: None
Step: 130, Training Loss: 0.3466, Validation Loss: None
Step: 140, Training Loss: 0.3538, Validation Loss: None
Step: 150, Training Loss: 0.3062, Validation Loss: None
Step: 160, Training Loss: 0.3447, Validation Loss: None
Step: 170, Training Loss: 0.3759, Validation Loss: None
Step: 180, Training Loss: 0.3284, Validation Loss: None
Step: 190, Training Loss: 0.3307, Validation Loss: None
Step: 200, Training Loss: 0.3748, Validation Loss: None
Step: 210, Training Loss: 0.3425, Validation Loss: None
Step: 220, Training Loss: 0.3201, Validation Loss: None
Step: 230, Training Loss: 0.3682, Validation Loss: None
Step: 240, Training Loss: 0.3846, Validation Loss: None
Step: 250, Training Loss: 0.3589, Validation Loss: None
Step: 260, Training Loss: 0.364, Validation Loss: None
Step: 270, Training Loss: 0.3685, Validation Loss: None
Step: 280, Training Loss: 0.3167, Validation Loss: None
Step: 290, Training Loss: 0.4082, Validation Loss: None
Step: 300, Training Loss: 0.3292, Validation Loss: None
Step: 310, Training Loss: 0.3433, Validation Loss: None
Step: 320, Training Loss: 0.3413, Validation Loss: None
Step: 330, Training Loss: 0.3601, Validation Loss: None
Step: 340, Training Loss: 0.3271, Validation Loss: None
Step: 350, Training Loss: 0.3237, Validation Loss: None
Step: 360, Training Loss: 0.3121, Validation Loss: None
Step: 370, Training Loss: 0.3099, Validation Loss: None
Step: 380, Training Loss: 0.3259, Validation Loss: None
Step: 390, Training Loss: 0.3085, Validation Loss: None
Step: 400, Training Loss: 0.293, Validation Loss: None
Step: 410, Training Loss: 0.3535, Validation Loss: None
Step: 420, Training Loss: 0.3815, Validation Loss: None
Step: 430, Training Loss: 0.346, Validation Loss: None
Step: 440, Training Loss: 0.3522, Validation Loss: None
Step: 450, Training Loss: 0.3265, Validation Loss: None
Step: 454, Training Loss: None, Validation Loss: 0.3854326605796814
Step: 454, Training Loss: None, Validation Loss: None
模型训练和保存完成
```