In [1]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub. The training cell further down sets
# push_to_hub=True and a later cell calls trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset, Audio

# Load the "en-US" configuration of PolyAI/minds14, train split only.
minds = load_dataset("PolyAI/minds14", name="en-US", split="train")

In [3]:
# Hold out 20% of the examples as a test split. The seed is fixed so that
# Restart & Run All reproduces the same train/test partition every time.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [4]:
# Display the DatasetDict to check split names, columns and row counts.
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 80
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 20
    })
})

In [5]:
# Drop the three columns not used below; of the features listed above this
# leaves 'path', 'audio' and 'transcription'.
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])

In [6]:
# Look at the first training example (file path, decoded audio, transcription).
minds["train"][0]

{'path': 'C:\\Users\\Raashid\\.cache\\huggingface\\datasets\\downloads\\extracted\\7fc70429acaa74b94cb2249162b5595037f6132cc583e80d04ff50ba698b78f0\\en-US~JOINT_ACCOUNT\\602b9b6705f96973d67943ca.wav',
 'audio': {'path': 'C:\\Users\\Raashid\\.cache\\huggingface\\datasets\\downloads\\extracted\\7fc70429acaa74b94cb2249162b5595037f6132cc583e80d04ff50ba698b78f0\\en-US~JOINT_ACCOUNT\\602b9b6705f96973d67943ca.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -0.00024414, -0.00024414]),
  'sampling_rate': 8000},
 'transcription': 'I was hoping to set up a joint account'}

In [7]:
from transformers import AutoProcessor

# Load the processor that ships with the facebook/wav2vec2-base checkpoint.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
# The example printed earlier reports sampling_rate 8000. Casting the audio
# column to 16 kHz makes the same example report 16000 below.
# NOTE(review): 16 kHz is presumably the rate this checkpoint expects —
# confirm against the model card.
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
# Re-display the first example to verify the new sampling rate.
minds["train"][0]



{'path': 'C:\\Users\\Raashid\\.cache\\huggingface\\datasets\\downloads\\extracted\\7fc70429acaa74b94cb2249162b5595037f6132cc583e80d04ff50ba698b78f0\\en-US~JOINT_ACCOUNT\\602b9b6705f96973d67943ca.wav',
 'audio': {'path': 'C:\\Users\\Raashid\\.cache\\huggingface\\datasets\\downloads\\extracted\\7fc70429acaa74b94cb2249162b5595037f6132cc583e80d04ff50ba698b78f0\\en-US~JOINT_ACCOUNT\\602b9b6705f96973d67943ca.wav',
  'array': array([ 1.51562745e-06, -8.13333463e-06, -1.73749868e-06, ...,
         -3.29912931e-04, -2.50394165e-04, -9.04533663e-05]),
  'sampling_rate': 16000},
 'transcription': 'I was hoping to set up a joint account'}

In [8]:
def uppercase(example):
    """Upper-case the transcription of a single example.

    Args:
        example: mapping with a ``"transcription"`` entry.

    Returns:
        A one-key dict ``{"transcription": <upper-cased text>}``.
    """
    text = example["transcription"]
    return dict(transcription=text.upper())


# Apply uppercase() to every example; the two progress bars below (0/80 and
# 0/20) correspond to the two splits.
minds = minds.map(uppercase)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [9]:
def prepare_dataset(batch, _processor_cache={}):
    """Convert one raw example into the inputs produced by the processor.

    Args:
        batch: a single example with an ``"audio"`` entry (holding ``"array"``
            and ``"sampling_rate"``) and a ``"transcription"`` string.
        _processor_cache: private memo, do not pass. The mutable default is
            deliberate: it lives on the function object, so the processor is
            loaded once and reused by later calls instead of being reloaded
            for every example.

    Returns:
        The processor's output for the example, with an added
        ``"input_length"`` entry equal to ``len(batch["input_values"][0])``.
    """
    processor = _processor_cache.get("processor")
    if processor is None:
        # The import and the load stay inside the function, as in the original,
        # so the function does not rely on any notebook-level variable.
        from transformers import AutoProcessor

        processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
        _processor_cache["processor"] = processor

    audio = batch["audio"]
    batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
    batch["input_length"] = len(batch["input_values"][0])
    return batch

In [10]:
# Run prepare_dataset over both splits with 4 worker processes. The original
# columns are removed, so only what prepare_dataset returns is kept.
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/80 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]

In [11]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate encoded examples into one padded batch for CTC training.

    Audio inputs and label ids are padded in two separate ``processor.pad``
    calls, and padded label positions are replaced with -100.
    """

    # Processor whose ``pad`` method is used for both inputs and labels.
    processor: AutoProcessor
    # Padding strategy forwarded unchanged to ``processor.pad``.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry.

        Args:
            features: examples that each provide ``"input_values"`` (only
                element 0 is used) and ``"labels"``.

        Returns:
            The padded input batch from ``processor.pad`` with ``"labels"``
            added; label positions whose attention mask is not 1 hold -100.
        """
        # Inputs and labels have different lengths, so they are collected and
        # padded separately.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_values": example["input_values"][0]})
            label_items.append({"input_ids": example["labels"]})

        padded = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_items, padding=self.padding, return_tensors="pt")

        # Positions that are padding in the labels get -100 so the loss skips them.
        is_padding = padded_labels.attention_mask.ne(1)
        padded["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return padded

In [12]:
# Build the collator around the processor loaded earlier; "longest" is also
# the class default for padding.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [13]:
import numpy as np

def compute_metrics(pred):
    """Compute the "wer" metric for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits; the argmax over the
            last axis is taken as the predicted ids) and ``label_ids``
            (reference ids, with -100 at positions to ignore).

    Returns:
        ``{"wer": <value returned by the metric's compute()>}``.

    Unlike the previous version, ``pred.label_ids`` is not modified: the
    -100 placeholders are swapped for the pad token id in a copy.
    """
    import evaluate

    # Named wer_metric so the metric object is not overwritten by its score.
    wer_metric = evaluate.load("wer")

    pred_ids = np.argmax(pred.predictions, axis=-1)

    # np.where builds a new array, leaving the caller's label_ids untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer_score = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [14]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer

# Load the facebook/wav2vec2-base checkpoint into a CTC model. The pad token
# id is taken from the processor's tokenizer. The warning printed below lists
# lm_head.weight / lm_head.bias among the newly initialised weights, i.e. the
# output head still has to be trained.
model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# NOTE(review): the saved output below (evaluation every 5 steps,
# global_step=50, epoch 10.0) does not match eval_steps=1000 / max_steps=2000
# as written here. Re-run this cell to refresh the output.
training_args = TrainingArguments(
    output_dir="my_awesome_asr_mind_model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 examples per optimizer step
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    # Mixed precision only when a CUDA device is present, so the cell also
    # runs on CPU-only machines instead of failing on a hard-coded fp16=True.
    fp16=torch.cuda.is_available(),
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    # Save and evaluate on the same schedule, so the best checkpoint can be
    # selected by load_best_model_at_end.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower "wer" is treated as better
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Step,Training Loss,Validation Loss,Wer
5,58.3347,30.390574,1.206522
10,49.6382,30.390574,1.206522
15,47.2021,26.633362,1.01087
20,28.9906,22.226849,1.0
25,28.247,18.924215,1.0
30,27.9882,15.486674,1.0
35,22.125,12.57329,1.0
40,18.5329,10.6346,1.0
45,14.7859,9.179762,1.0
50,14.7127,8.17099,1.0




TrainOutput(global_step=50, training_loss=32.83099918365478, metrics={'train_runtime': 2609.6892, 'train_samples_per_second': 0.307, 'train_steps_per_second': 0.019, 'total_flos': 1.0432123064322048e+17, 'train_loss': 32.83099918365478, 'epoch': 10.0})

In [16]:
# Upload the trained model to the Hub; the URL of the repository is shown below.
trainer.push_to_hub()

'https://huggingface.co/Reyden/my_awesome_asr_mind_model/tree/main/'

In [22]:
from datasets import load_dataset, Audio

In [23]:
# Reload the en-US train split for inference, with its audio column cast to
# 16 kHz in the same expression.
dataset = load_dataset("PolyAI/minds14", "en-US", split="train").cast_column(
    "audio", Audio(sampling_rate=16000)
)

In [24]:
# Sampling rate declared on the audio feature, and the file path of the
# first example.
# NOTE(review): neither variable is used by the cells visible below, which
# read 'output.wav' instead — confirm whether this cell is still needed.
sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]

In [39]:
import soundfile
# Read the audio file from the working directory. Per the soundfile docs the
# result is a (data, samplerate) tuple; the next cell uses element 0.
# NOTE(review): no visible cell creates 'output.wav' — document where it
# comes from, otherwise a fresh Restart & Run All fails here.
inp = soundfile.read('output.wav')

In [42]:
from transformers import pipeline

# NOTE(review): the model pushed above lives at
# 'Reyden/my_awesome_asr_mind_model'; confirm 'Reyden/main' is the intended repo.
transcriber = pipeline("automatic-speech-recognition", model="Reyden/main")

# soundfile.read returned (samples, sample_rate). Passing the rate along with
# the samples lets the pipeline check it against the rate the model expects
# and resample if they differ; a bare array is assumed to match already.
waveform, file_sampling_rate = inp
transcriber({"raw": waveform, "sampling_rate": file_sampling_rate})

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'text': '<s>U<s>EU<s>CEU<s>EUCGEUEJG<unk>JEJ<s><unk>EUGWCEL<s>WBU<s>J<unk>EULJ<s>IJI<s>KISJ<s>U<s>J<s>UB<unk>UGJ<s><unk><s>E<s>EUL<s>EIEUW</s>JSJW<s>W<s>W<s>W<s>W<s>W<s>J<s>L<s>BW<s>UEU<s>BWEJWE<s>WEU<s>BEUW<s>ELWECEUGUWZEUBWZUEUBFLUG<s>E<s>E<s>GWFWFEUGWJUB<s>EWGUGJWFE<s>U<s>BWULBOFWUZL<s>U<s>ULUB<s>BW<s>W<s>W<s>UB<s>JE<s>WWCU<s>U<s>U<s>B<s>UBWEU<s>UELC<s>E<s>WUBSWGEJI'}