In [1]:
# One-off download of LibriSpeech from the Hugging Face Hub, kept for reference only.
# SECURITY: never commit an access token; read it from the environment instead
# (any token that was previously pasted here should be revoked).
# import os
# import aiohttp
# from datasets import load_dataset
# from fsspec.asyn import FSTimeoutError

# try:
#     dataset = load_dataset(
#         "openslr/librispeech_asr",
#         trust_remote_code=True,
#         token=os.environ["HF_TOKEN"],
#         storage_options={'client_kwargs': {'timeout': aiohttp.ClientTimeout(total=60*60*4)}}
#     )
# except FSTimeoutError:
#     print("Dataset loading timed out. Please check your network connection or increase the timeout.")

from datasets import load_from_disk

# Load the already-prepared dataset from the local "librispeech_processed_1h" directory
# (presumably produced by the commented-out preprocessing cells below -- confirm).
dataset = load_from_disk("librispeech_processed_1h")

In [2]:
# dataset = dataset.remove_columns(["file", "speaker_id", "chapter_id", "id"])

# dataset.pop("train.other.500")
# dataset.pop("train.clean.360")
# dataset.pop("test.other")
# dataset.pop("validation.other")

In [None]:
# Show the names of the available splits, then the summary of the training split
print(dataset.keys())
print(dataset['train.clean.100'])

# Print the first example of the training split to check the stored fields
print(dataset['train.clean.100'][0])

In [4]:
# TARGET_DURATION = 60 * 60

# train_clean_100_split = dataset['train.clean.100']

# indices_to_keep = []
# current_duration = 0.0

# for i, example in enumerate(train_clean_100_split):
#     audio_array = example['audio']['array']
#     sampling_rate = example['audio']['sampling_rate']
#     duration = audio_array.shape[0] / sampling_rate

#     if current_duration + duration <= TARGET_DURATION:
#         indices_to_keep.append(i)
#         current_duration += duration
#     else:
#         break
    
# dataset['train.clean.100'] = train_clean_100_split.select(indices_to_keep)

# print(f"Filtered 'train.clean.100' split size: {len(dataset['train.clean.100'])}")
# print(f"Total duration of filtered split: {current_duration:.2f} seconds")
# print(f"Average duration of audio files: {current_duration / len(dataset['train.clean.100']):.2f} seconds")


In [5]:
# import re
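# NOTE: inside a character class an unescaped `!-;` is a range (it also matches
# apostrophes and digits), so the hyphen is escaped to match a literal "-".
# chars_to_ignore_regex = r'[,?.!\-;:"]'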

# def remove_special_characters(batch):
#     batch["text"] = [re.sub(chars_to_ignore_regex, '', text).lower() for text in batch["text"]]
#     return batch

# dataset = dataset.map(remove_special_characters, batched=True)

In [6]:
# vocabs = set()

# for split in ["train.clean.100", "validation.clean", "test.clean"]:
#     for entry in dataset[split]["text"]:
#         vocabs.update(entry)

# print(vocabs)

# vocab_list = list(vocabs)

# vocab_dict = {v: k for k, v in enumerate(vocab_list)}

# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]

# vocab_dict["[UNK]"] = len(vocab_dict)
# vocab_dict["[PAD]"] = len(vocab_dict)

# import json
# with open('vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)

In [7]:
from transformers import Wav2Vec2CTCTokenizer

# CTC tokenizer backed by the local vocabulary file; "|" stands in for the word
# boundary, and the unknown / padding tokens are "[UNK]" / "[PAD]".
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [8]:
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Feature extractor for 16 kHz audio: inputs are padded with 0.0, normalisation
# is enabled, and no attention mask is returned.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=False,
)

# Bundle the feature extractor and the tokenizer behind a single processor object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [9]:
# Have the dataset hand back PyTorch tensors when examples are accessed.
dataset = dataset.with_format("torch")

In [10]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads a batch of CTC training examples.

    Audio inputs and label sequences have different lengths, so they are padded
    separately: ``input_values`` through the processor and ``labels`` through the
    processor's tokenizer. Padded label positions are then replaced by ``-100``.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware.
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as ``pad_to_multiple_of`` but applied to the ``labels``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate a list of examples into one padded batch.

        Args:
            features: Examples, each providing ``input_values`` and ``labels``.

        Returns:
            The padded inputs returned by the processor, with an added ``labels``
            entry in which padded positions are set to ``-100``.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly. This is the call the deprecated
        # `as_target_processor()` context manager used to route `processor.pad` to,
        # without depending on that context manager being available.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch
    
# Collator instance passed to the Trainer; padding=True pads each batch to its longest sequence.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [11]:
from evaluate import load
import numpy as np

# Word error rate metric, used by compute_metrics during training and for the final test report.
wer_metric = load("wer")

def compute_metrics(pred):
    """Compute the word error rate (WER) for a set of model predictions.

    Args:
        pred: Prediction object exposing ``predictions`` (logits; the argmax over
            the last axis is taken as the predicted token id) and ``label_ids``
            (reference token ids in which ``-100`` marks positions to ignore).
            Presumably the Trainer's ``EvalPrediction`` -- confirm.

    Returns:
        dict: ``{"wer": value}`` where ``value`` is what ``wer_metric.compute``
        returns for the decoded predictions and references.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the ignore index for the pad token id on a copy, so the caller's
    # ``pred.label_ids`` array is not modified as a side effect.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [12]:
from transformers import Wav2Vec2ForCTC

import os

# Load the pretrained wav2vec2 base checkpoint with a CTC head. The head's pad
# token id is taken from the tokenizer and the CTC loss is averaged ("mean").
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    # SECURITY: never hardcode an access token in the notebook. It is read from
    # the HF_TOKEN environment variable; when unset, None (the default) is passed.
    token=os.environ.get("HF_TOKEN"),
)
# `freeze_feature_extractor()` is deprecated; `freeze_feature_encoder()` is its
# equivalent replacement.
model.freeze_feature_encoder()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints go and where metrics are reported.
    output_dir="./wav2vec2-librispeech",
    report_to="wandb",
    save_total_limit=2,
    # Batching: 8 examples per device, gradients accumulated over 2 steps.
    # group_by_length=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    # Mixed precision and gradient checkpointing.
    fp16=True,
    gradient_checkpointing=True,
    # Optimisation schedule.
    # Alternatives left disabled: warmup_steps=1000, max_steps=2000, num_train_epochs=1.
    num_train_epochs=30,
    learning_rate=1e-4,
    weight_decay=0.005,
    warmup_ratio=0.1,
    # Evaluate and checkpoint every 100 steps; log every 20 steps.
    eval_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=20,
)


In [14]:
from transformers import Trainer


# Wire the model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train.clean.100"],
    eval_dataset=dataset["validation.clean"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor.feature_extractor,
)

In [15]:
# Optional: log in to Weights & Biases first if this machine is not already authenticated.
# SECURITY: never paste the API key into the notebook; read it from the environment
# (any key that was previously pasted here should be revoked).
# import os
# import wandb
# wandb.login(key=os.environ["WANDB_API_KEY"])

# Run the fine-tuning loop configured by `training_args`.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdavislee4891[0m ([33mdavislee4891-ohio-state-buckeyes[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss,Validation Loss,Wer
100,2.9842,3.150076,0.999982
200,2.8614,2.975723,0.999982
300,2.8361,2.928574,0.999982
400,0.613,0.721119,0.506912
500,0.1535,0.576236,0.349252
600,0.0837,0.544056,0.317396
700,0.056,0.515999,0.300099
800,0.0446,0.563398,0.299621
900,0.0372,0.574008,0.291736
1000,0.031,0.559782,0.285909




KeyboardInterrupt: 

In [None]:
# Save the trainer's current model to the "wav2vec2-librispeech-1h" directory.
trainer.save_model("wav2vec2-librispeech-1h")

In [None]:
# Reload the saved fine-tuned model for inference. Use the GPU when one is
# available and fall back to the CPU otherwise, instead of assuming CUDA.
model = Wav2Vec2ForCTC.from_pretrained("wav2vec2-librispeech-1h").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

In [None]:
def map_to_result(batch):
    """Transcribe one example with the model and decode its reference labels.

    Args:
        batch: A single example providing ``input_values`` (the model input for
            one utterance, as a tensor or a sequence of numbers) and ``labels``
            (the reference token ids).

    Returns:
        The same ``batch`` mapping with two entries added: ``pred_str`` (the
        decoded argmax prediction) and ``text`` (the labels decoded without
        grouping tokens).
    """
    with torch.no_grad():
        # ``as_tensor`` accepts an existing tensor without the copy-construct
        # UserWarning that ``torch.tensor`` emits, and ``model.device`` keeps the
        # input on whatever device the model lives on instead of assuming CUDA.
        input_values = torch.as_tensor(batch["input_values"], device=model.device).unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

# Run inference over the test split, keeping only the columns map_to_result adds.
results = dataset["test.clean"].map(
    map_to_result,
    remove_columns=dataset["test.clean"].column_names,
)




Map:   0%|          | 0/2620 [00:00<?, ? examples/s]

  input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)


In [None]:
# Report the word error rate of the predictions against the reference text.
print(
    "Test WER: {:.3f}".format(
        wer_metric.compute(
            predictions=results["pred_str"],
            references=results["text"],
        )
    )
)

Test WER: 0.243
