In [1]:
import torch
import torchaudio
import numpy as np
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import LoraConfig, get_peft_model, TaskType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
model_name = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)


In [4]:
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]
)
model = get_peft_model(model, lora_config)


Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\follo\anaconda3\envs\GPU\Lib\threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "C:\Users\follo\AppData\Roaming\Python\Python312\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\follo\anaconda3\envs\GPU\Lib\threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\follo\anaconda3\envs\GPU\Lib\subprocess.py", line 1599, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 7: invalid start byte


In [5]:
data_files = {
    "train": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\train.csv",
    "validation": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\val.csv",
    "test": r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Dataset\test.csv"
}

dataset = load_dataset("csv", data_files=data_files)

train_dataset = dataset["train"]
val_dataset   = dataset["validation"]
test_dataset  = dataset["test"]

In [6]:
def preprocess(batch):

    results = {"input_features": [], "labels": []}


    path = batch["path"]
    text = batch["text"]


    if not text or len(text.strip()) == 0:
        print(f"Пропуск пустого текста: {path}")
        return None


    try:
        speech_array, sr = torchaudio.load(path)
    except Exception as e:
        print(f"Ошибка загрузки {path}: {e}")
        return None

    if sr != 16000:
        resampler = torchaudio.transforms.Resample(sr, 16000)
        speech_array = resampler(speech_array)
    speech_array = speech_array.squeeze().numpy()


    try:
        input_features = processor.feature_extractor(
            speech_array,
            sampling_rate=16000,
            return_tensors="pt"
        )["input_features"][0].numpy()
    except Exception as e:
        print(f"Ошибка feature_extractor {path}: {e}")
        return None


    try:
        labels = processor.tokenizer(
            text_target=text,
            return_tensors="pt"
        )["input_ids"][0].numpy()
    except Exception as e:
        print(f"Ошибка tokenizer {path}: {e}")
        return None


    results["input_features"].append(input_features.tolist())
    results["labels"].append(labels.tolist())
    return results


In [7]:
# def preprocess(batch):
#     # Загрузка и ресэмплинг аудио
#     speech_array, sampling_rate = torchaudio.load(batch["path"])
#     if sr != 16000:
#         wav = torchaudio.transforms.Resample(sr, 16000)(wav)
#     wav = wav.mean(dim=0)  # Стерео -> моно 6269
#     input_features = processor(wav.numpy(), sampling_rate=16000).input_features[0]
#     labels = processor.tokenizer(batch['text']).input_ids
#     return {'input_features': input_features, 'labels': labels}

In [9]:
train_dataset = train_dataset.map(
    preprocess,
    batched=False,
    remove_columns=train_dataset.column_names
)


val_dataset = val_dataset.map(
    preprocess,
    batched=False,
    remove_columns=val_dataset.column_names
)

Map:   2%|▏         | 1454/82616 [00:00<00:11, 7232.89 examples/s]

Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_22797202.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_22797202.mp3': System error.
Ошибка загрузки Dataset/asr_calls_2_val/1/7e/cd5319e9b36e.wav: Error opening 'Dataset/asr_calls_2_val/1/7e/cd5319e9b36e.wav': System error.
Ошибка загрузки Dataset/public_youtube700_val/a/b4/8e7c7d9ecc50.wav: Error opening 'Dataset/public_youtube700_val/a/b4/8e7c7d9ecc50.wav': System error.
Ошибка загрузки Dataset/public_youtube700_val/e/b2/91fd745d785a.wav: Error opening 'Dataset/public_youtube700_val/e/b2/91fd745d785a.wav': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_42110704.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_42110704.mp3': System error.
Ошибка загрузки Dataset/buriy_audiobooks_2_val/0/cc/a45dff1b208a.wav: Error opening 'Dataset/buriy_audiobooks_2_val/0/cc/a45dff1b208a.wav': System error.
Ошибка загрузки Da

Map:   3%|▎         | 2512/82616 [00:00<00:11, 7101.82 examples/s]

Ошибка загрузки Dataset/asr_calls_2_val/7/10/9726f7516363.wav: Error opening 'Dataset/asr_calls_2_val/7/10/9726f7516363.wav': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_18875414.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_18875414.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_26562765.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_26562765.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_22386355.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_22386355.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_42558673.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_42558673.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_36984865.mp3: Err

Map:   4%|▍         | 3374/82616 [00:00<00:12, 6234.65 examples/s]

Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_38328528.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_38328528.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_24063957.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_24063957.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_32345762.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_32345762.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_25693418.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_25693418.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_27645353.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_27645353.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2

Map:   6%|▌         | 4756/82616 [00:00<00:11, 6569.76 examples/s]

Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20494724.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20494724.mp3': System error.
Ошибка загрузки Dataset/buriy_audiobooks_2_val/8/fd/78cad20ca431.wav: Error opening 'Dataset/buriy_audiobooks_2_val/8/fd/78cad20ca431.wav': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_33433878.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_33433878.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_38505657.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_38505657.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20489577.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20489577.mp3': System error.
Ошибка загрузки Dataset/asr_calls_2_val/4/66/2845c87337ad.wav: Error opening '

Map:   7%|▋         | 6068/82616 [00:00<00:11, 6701.56 examples/s]


Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20250022.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20250022.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_27355442.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_27355442.mp3': System error.
Ошибка загрузки Dataset/asr_calls_2_val/b/13/b5c4a0f5e8ff.wav: Error opening 'Dataset/asr_calls_2_val/b/13/b5c4a0f5e8ff.wav': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20247047.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_20247047.mp3': System error.
Ошибка загрузки Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_22925580.mp3: Error opening 'Dataset/cv-corpus-23.0-2025-09-05/ru/clips/common_voice_ru_22925580.mp3': System error.
Ошибка загрузки Dataset/buriy_audiobooks_2_val/9/bd/d93bfcd5d598.wav: Error opening 'Dataset

KeyboardInterrupt: 

In [None]:
print(train_dataset.column_names)
print(train_dataset[0])

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\whisper-LoRA",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=True,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)


trainer.train()

In [None]:
model.save_pretrained(r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\whisper-LoRA")
processor.save_pretrained(r"E:\Education\4 course 1 semester\Course project\Shards_prodject\Code\Task\Model\whisper-LoRA")