### Введение

В данном ноутбуке попробую дообучить предобученную модель, посмотрим, что из этого выйдет)

### Imports

In [1]:
import pandas as pd
import torch
import torchaudio
from datasets import Dataset, Audio
import re
from collections import Counter
from transformers import (
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    TrainingArguments,
    Trainer,
)
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


### Prepare the Dataset

In [2]:
# Загрузка данных
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

# Переименование колонок
train_df = train_df.rename(columns={"id": "file", "message": "text"})
test_df = test_df.rename(columns={"id": "file"})

# Формируем пути к аудиофайлам
AUDIO_PATH = "data/morse_dataset/morse_dataset"
train_df["file"] = train_df["file"].apply(lambda x: f"{AUDIO_PATH}/{x}")
test_df["file"] = test_df["file"].apply(lambda x: f"{AUDIO_PATH}/{x}")

### Define Vocabulary

Create a vocab from my dataset (Russian + digits + symbols)

In [3]:
# Создание словаря
def extract_chars(text_series):
    all_text = "".join(text_series)
    return sorted(set(re.sub(r"\s+", "", all_text)))

vocab_chars = extract_chars(train_df["text"])
vocab_dict = {c: i for i, c in enumerate(vocab_chars)}

if " " in vocab_dict:
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]

vocab_dict["[PAD]"] = len(vocab_dict)
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[BLANK]"] = len(vocab_dict)

In [4]:
# Сохраняем словарь
with open("vocab.json", "w", encoding="utf-8") as f:
    json.dump(vocab_dict, f, ensure_ascii=False, indent=2)

In [5]:
# Проверка словаря
print("Vocabulary:", vocab_dict)
print("Vocab size:", len(vocab_dict))

# Проверка отсутствующих символов
all_chars = set("".join(train_df["text"]))
missing_chars = all_chars - set(vocab_dict.keys())
if missing_chars:
    print(f"Missing characters in vocab: {missing_chars}")

Vocabulary: {'#': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'А': 11, 'Б': 12, 'В': 13, 'Г': 14, 'Д': 15, 'Е': 16, 'Ж': 17, 'З': 18, 'И': 19, 'Й': 20, 'К': 21, 'Л': 22, 'М': 23, 'Н': 24, 'О': 25, 'П': 26, 'Р': 27, 'С': 28, 'Т': 29, 'У': 30, 'Ф': 31, 'Х': 32, 'Ц': 33, 'Ч': 34, 'Ш': 35, 'Щ': 36, 'Ъ': 37, 'Ы': 38, 'Ь': 39, 'Э': 40, 'Ю': 41, 'Я': 42, '[PAD]': 43, '[UNK]': 44, '[BLANK]': 45}
Vocab size: 46
Missing characters in vocab: {' '}


### Preprocess Audio + Transcripts

In [6]:
# Инициализация токенизатора
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file="vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
    bos_token=None,  # Отключаем начальный токен
    eos_token=None,  # Отключаем конечный токен
)

# Инициализация feature extractor
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=8000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Инициализация процессора
processor = Wav2Vec2Processor(
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

processor.save_pretrained("my_morse_tokenizer")

[]

In [7]:
# Подготовка датасета
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.cast_column("file", Audio())
test_dataset = test_dataset.cast_column("file", Audio())

In [8]:
def preprocess(example):
    audio = example["file"]
    inputs = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    example["input_values"] = inputs.input_values[0]
    example["attention_mask"] = inputs.attention_mask[0]
    # Токенизируем текст
    text = example["text"]
    tokenized = processor.tokenizer(text, is_split_into_words=False, return_tensors="pt").input_ids[0]
    example["labels"] = tokenized.tolist()
    # Проверка меток
    if max(tokenized) >= len(vocab_dict):
        print(f"Invalid labels in example: {text}, labels={tokenized.tolist()}")
    return example

train_dataset = train_dataset.map(preprocess, remove_columns=["file", "text"])

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map: 100%|██████████| 30000/30000 [12:16<00:00, 40.72 examples/s]  


### Define Model

"facebook/wav2vec2-large-xlsr-53"

In [9]:
# Инициализация модели
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=tokenizer.pad_token_id,
    vocab_size=len(vocab_dict),  # Устанавливаем vocab_size=46
)
model.lm_head = torch.nn.Linear(model.config.hidden_size, len(vocab_dict))  # Синхронизируем lm_head
model.freeze_feature_encoder()

  return self.fget.__get__(instance, owner)()
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Проверка размеров словаря
print(f"Model vocab size: {model.config.vocab_size}")
print(f"Tokenizer vocab size: {len(tokenizer)}")

Model vocab size: 46
Tokenizer vocab size: 46


In [11]:
print("Tokenizer vocab:", tokenizer.get_vocab())
print("Tokenizer vocab size:", len(tokenizer))

Tokenizer vocab: {'#': 0, '0': 1, '1': 2, '2': 3, '3': 4, '4': 5, '5': 6, '6': 7, '7': 8, '8': 9, '9': 10, 'А': 11, 'Б': 12, 'В': 13, 'Г': 14, 'Д': 15, 'Е': 16, 'Ж': 17, 'З': 18, 'И': 19, 'Й': 20, 'К': 21, 'Л': 22, 'М': 23, 'Н': 24, 'О': 25, 'П': 26, 'Р': 27, 'С': 28, 'Т': 29, 'У': 30, 'Ф': 31, 'Х': 32, 'Ц': 33, 'Ч': 34, 'Ш': 35, 'Щ': 36, 'Ъ': 37, 'Ы': 38, 'Ь': 39, 'Э': 40, 'Ю': 41, 'Я': 42, '[PAD]': 43, '[UNK]': 44, '[BLANK]': 45}
Tokenizer vocab size: 46


### Training Setup

In [12]:
# Настройка обучения
training_args = TrainingArguments(
    output_dir="./asr_model",
    per_device_train_batch_size=8, # 8 для меньшей модели
    eval_strategy="no",
    num_train_epochs=10,
    fp16=True,
    save_steps=500,
    logging_steps=100,
    learning_rate=3e-4,
    warmup_steps=500,
    save_total_limit=2,
    gradient_accumulation_steps=2,  # 2 для меньшей модели
)

# Метрики
def compute_metrics(pred):
    from jiwer import wer, cer
    pred_str = processor.batch_decode(pred.predictions, skip_special_tokens=True)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    return {"cer": cer(label_str, pred_str)}

# Кастомный data_collator для версии 4.36.2
def data_collator(features):
    # Извлекаем input_values, attention_mask и labels из списка features
    input_values = [f["input_values"] for f in features]
    attention_mask = [f["attention_mask"] for f in features]
    labels = [f["labels"] for f in features]

    # Выполняем padding для input_values и attention_mask
    inputs = processor.pad(
        {
            "input_values": input_values,
            "attention_mask": attention_mask,
        },
        padding="longest",  # Выравниваем по самому длинному элементу в батче
        return_tensors="pt",
    )

    # Выполняем padding для labels отдельно
    labels_padded = processor.tokenizer.pad(
        {"input_ids": labels},
        padding="longest",
        return_tensors="pt",
    )["input_ids"]

    # Заменяем pad_token_id в labels на -100, чтобы игнорировать его в CTC loss
    labels_padded = labels_padded.masked_fill(labels_padded == processor.tokenizer.pad_token_id, -100)

    # Возвращаем словарь с подготовленными данными
    inputs["labels"] = labels_padded
    return inputs


# Инициализация тренера
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

# Освобождение памяти
torch.cuda.empty_cache()

  trainer = Trainer(


### Train

In [13]:
# Обучение
trainer.train()
trainer.save_model("final_model")
processor.save_pretrained("final_model")

Step,Training Loss
100,26.8221
200,4.047
300,4.0355
400,4.0318
500,4.0355
600,4.0278
700,4.031
800,4.0328
900,4.0307
1000,4.0279


[]

### Inference & submit

In [15]:
test_dataset = test_dataset.map(lambda x: processor(x["file"]["array"], sampling_rate=8000, return_tensors="pt"), remove_columns=["file"])

def predict(batch):
    with torch.no_grad():
        input_values = batch["input_values"].squeeze(0).to("cuda")
        logits = model(input_values.unsqueeze(0)).logits
        pred_ids = torch.argmax(logits, dim=-1)
        return processor.batch_decode(pred_ids)[0]

submission = []

for i, sample in enumerate(test_dataset):
    transcript = predict(sample)
    submission.append({"id": f"{30001+i}.opus", "message": transcript})

pd.DataFrame(submission).to_csv("submissions.csv", index=False, encoding='utf-8')

Map: 100%|██████████| 5000/5000 [02:06<00:00, 39.48 examples/s]


AttributeError: 'list' object has no attribute 'squeeze'

В общем, разбираться даже не хочу
Очевидно, что эксперимент неудачный, поставил на ночь, а ничего не изменилось :D

После завершения соревнования буду разбирать лучшие решения, которые будут доступны, иного варианты попросту нет, в рамках моих ограниченных временных ресурсов.