In [None]:
%pip install transformers datasets torchaudio soundfile peft jiwer

Note: you may need to restart the kernel to use updated packages.


In [None]:
import torch
import torchaudio
import pandas as pd
from datasets import Dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments
from dataclasses import dataclass
from typing import Any

In [None]:
# Pretrained CTC checkpoint that the rest of the notebook fine-tunes.
model_name = "facebook/wav2vec2-base-960h"

# The processor bundles the feature extractor (audio -> input_values) and the
# tokenizer (transcript -> label ids); both are used by `preprocess` and the collator.
processor = Wav2Vec2Processor.from_pretrained(model_name)

model = Wav2Vec2ForCTC.from_pretrained(
    model_name,
    # Average the CTC loss instead of summing it. With a summed loss the value grows
    # with batch size and transcript length, which makes the logged loss hard to read
    # and increases the risk of overflow when training in fp16.
    # NOTE(review): the config default is believed to be "sum" - confirm via model.config.
    ctc_loss_reduction="mean",
)

# Keep the convolutional feature encoder fixed during fine-tuning; only the
# transformer layers and the CTC head receive gradient updates.
model.freeze_feature_encoder()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the manifest CSV. `preprocess` reads two columns from every row:
# "path" (audio file location) and "text" (transcript).
csv_path = "Dataset/dataset.csv"
df = pd.read_csv(csv_path)

# Fail fast with a readable message instead of a KeyError deep inside `Dataset.map`.
missing_columns = {"path", "text"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{csv_path} is missing required column(s): {sorted(missing_columns)}"
    )

# A row without a path or a transcript cannot be loaded or tokenized, so drop it here
# rather than crashing part-way through preprocessing.
df = df.dropna(subset=["path", "text"]).reset_index(drop=True)

# preserve_index=False keeps the pandas index from becoming an extra dataset column.
dataset = Dataset.from_pandas(df, preserve_index=False)

In [None]:
def preprocess(batch):
    """Turn one manifest row into model inputs for CTC fine-tuning.

    Loads the audio file at ``batch["path"]``, resamples it to 16 kHz when needed,
    down-mixes it to a single channel and stores the processor output under
    ``batch["input_values"]``. The transcript in ``batch["text"]`` is tokenized
    into ``batch["labels"]``.

    Args:
        batch: A single example (mapping) with "path" and "text" entries.

    Returns:
        The same mapping with "input_values" and "labels" added.
    """
    target_sr = 16000  # sampling rate handed to the processor

    speech_array, sr = torchaudio.load(batch["path"])
    if sr != target_sr:
        speech_array = torchaudio.functional.resample(speech_array, sr, target_sr)

    # Average over the channel axis so every file becomes a 1-D waveform.
    # `.squeeze()` only removes that axis for single-channel files; a stereo file
    # would stay 2-D and be passed to the processor in that shape.
    # assumes torchaudio.load returns (channels, frames) - its default layout
    speech = speech_array.mean(dim=0).numpy()

    batch["input_values"] = processor(speech, sampling_rate=target_sr).input_values[0]

    # NOTE(review): facebook/wav2vec2-base-960h is understood to ship an upper-case
    # character vocabulary, so lower-case characters would map to the unknown token.
    # Upper-casing is a no-op for transcripts that are already upper-case; confirm
    # against processor.tokenizer.get_vocab() if a different checkpoint is used.
    batch["labels"] = processor.tokenizer(batch["text"].upper()).input_ids
    return batch


# Run `preprocess` over every example so each row gains "input_values" and "labels".
dataset = dataset.map(function=preprocess)


Map: 100%|██████████| 28111/28111 [00:30<00:00, 908.71 examples/s] 


In [None]:
@dataclass
class MyDataCollatorCTC:
    """Collate preprocessed examples into a single padded batch with CTC labels."""

    # Object exposing `feature_extractor.pad`, `tokenizer.pad` and
    # `tokenizer.pad_token_id` (the notebook passes a Wav2Vec2Processor).
    processor: Any

    def __call__(self, features):
        """Pad audio and label sequences separately and merge them into one batch.

        Args:
            features: Sequence of examples, each providing "input_values" and
                "labels" entries that `torch.tensor` can convert.

        Returns:
            The object returned by ``feature_extractor.pad`` with a "labels" entry
            added, in which padded label positions are replaced by -100.
        """
        audio_tensors = []
        label_tensors = []
        for example in features:
            audio_tensors.append(torch.tensor(example["input_values"], dtype=torch.float32))
            label_tensors.append(torch.tensor(example["labels"], dtype=torch.long))

        tokenizer = self.processor.tokenizer

        # Audio and labels have different lengths, so each is padded by its own component.
        batch = self.processor.feature_extractor.pad(
            {"input_values": audio_tensors}, return_tensors="pt"
        )
        padded_ids = tokenizer.pad({"input_ids": label_tensors}, return_tensors="pt")["input_ids"]

        # Swap the tokenizer's pad id for -100, the value conventionally used to
        # mark label positions that should not contribute to the loss.
        is_padding = padded_ids == tokenizer.pad_token_id
        batch["labels"] = padded_ids.masked_fill(is_padding, -100)
        return batch


# Collator instance handed to the Trainer; it pads each batch using the loaded processor.
data_collator = MyDataCollatorCTC(processor=processor)

In [None]:
# Optimisation and checkpointing settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./wav2vec2-finetuned",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    learning_rate=3e-4,
    # Ramp the learning rate up gradually; starting at the full rate from step 0
    # is a common cause of an unstable, oscillating CTC loss early in training.
    warmup_steps=500,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    # Mixed precision needs a CUDA device; fall back to fp32 elsewhere instead of
    # failing when TrainingArguments is constructed.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # `processing_class` replaces the deprecated `tokenizer` argument. Passing the
    # whole processor means checkpoints contain both the feature extractor and the
    # tokenizer, so a saved checkpoint can be reloaded on its own.
    processing_class=processor,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
10,5178.3016
20,5029.1137
30,2835.9613
40,725.6569
50,794.1093
60,392.652
70,797.3741
80,656.261
90,1008.2175
100,630.8863


KeyboardInterrupt: 