In [None]:
!pip install transformers datasets torchaudio accelerate peft bitsandbytes peft

In [None]:
import torch
import torchaudio
import numpy as np
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import LoraConfig, get_peft_model, TaskType



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)


In [4]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, lora_config)


Exception in thread Thread-6 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\follo\anaconda3\envs\GPU\Lib\threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "C:\Users\follo\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\follo\anaconda3\envs\GPU\Lib\threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\follo\anaconda3\envs\GPU\Lib\subprocess.py", line 1599, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 7: invalid start byte


In [5]:
data_files = {
    "train": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\train.csv",
    "validation": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\val.csv",
    "test": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\test.csv"
}

dataset = load_dataset("csv", data_files=data_files)

train_dataset = dataset["train"]
val_dataset   = dataset["validation"]
test_dataset  = dataset["test"]

Generating train split: 220901 examples [00:00, 251340.75 examples/s]
Generating validation split: 27613 examples [00:00, 251987.68 examples/s]
Generating test split: 27613 examples [00:00, 264880.89 examples/s]


In [None]:
def preprocess(batch):

    results = {"input_features": [], "labels": []}


    path = batch["path"]
    text = batch["text"]


    if not text or len(text.strip()) == 0:
        print(f"Пропуск пустого текста: {path}")
        return None


    try:
        speech_array, sr = torchaudio.load(path)
    except Exception as e:
        print(f"Ошибка загрузки {path}: {e}")
        return None

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        speech_array = resampler(speech_array)
    speech_array = speech_array.squeeze().numpy()


    try:
        input_features = processor.feature_extractor(
            speech_array,
            sampling_rate=16000,
            return_tensors="pt"
        )["input_features"][0].numpy()
    except Exception as e:
        print(f"Ошибка feature_extractor {path}: {e}")
        return None


    try:
        labels = processor.tokenizer(
            text_target=text,
            return_tensors="pt"
        )["input_ids"][0].numpy()
    except Exception as e:
        print(f"Ошибка tokenizer {path}: {e}")
        return None


    results["input_features"].append(input_features.tolist())
    results["labels"].append(labels.tolist())
    return results


In [7]:
# def preprocess(batch):
#     # Загрузка и ресэмплинг аудио
#     speech_array, sampling_rate = torchaudio.load(batch["path"])
#     if sr != 16000:
#         wav = torchaudio.transforms.Resample(sr, 16000)(wav)
#     wav = wav.mean(dim=0)  # Стерео -> моно 6269
#     input_features = processor(wav.numpy(), sampling_rate=16000).input_features[0]
#     labels = processor.tokenizer(batch['text']).input_ids
#     return {'input_features': input_features, 'labels': labels}

In [None]:
train_dataset = train_dataset.map(
    preprocess,
    batched=False,
    remove_columns=train_dataset.column_names
)


val_dataset = val_dataset.map(
    preprocess,
    batched=False,
    remove_columns=val_dataset.column_names
)

Map:   3%|▎         | 6271/220901 [04:44<1:02:37, 57.12 examples/s] 

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_39548378.wav


Map:   7%|▋         | 16426/220901 [13:45<1:09:25, 49.08 examples/s]  

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_41421350.wav


Map:   8%|▊         | 18104/220901 [15:23<1:15:09, 44.97 examples/s]  

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_39709604.wav


Map:   9%|▉         | 19876/220901 [16:38<1:07:18, 49.78 examples/s] 

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_41429562.wav


Map:   9%|▉         | 20118/220901 [17:14<1:15:18, 44.44 examples/s] 

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_41438142.wav


Map:  15%|█▍        | 32285/220901 [27:47<1:02:16, 50.48 examples/s]  

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_39228707.wav


Map:  18%|█▊        | 38816/220901 [33:12<58:12, 52.14 examples/s]    

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_40171768.wav


Map:  19%|█▉        | 42589/220901 [36:30<56:28, 52.62 examples/s]    

Пропуск пустого текста: Dataset/cv-corpus-23.0-2025-09-05/ru/clips_wav/common_voice_ru_38403827.wav


Map:  23%|██▎       | 50007/220901 [44:55<2:33:31, 18.55 examples/s]  


OSError: [Errno 28] No space left on device

In [None]:
print(train_dataset.column_names)
print(train_dataset[0])

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./Model/whisper-LoRA",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


trainer.train()

In [None]:
model.save_pretrained("./Model/whisper-LoRA")
processor.save_pretrained("./Model/whisper-LoRA")