<a href="https://colab.research.google.com/github/PrakritiShetty/GSoC2024-Overhaul_of_AuToBI/blob/main/GSoC_Wav2Vec_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



*   Finetune Wav2Vec2 on the MInDS-14 dataset to classify speaker intent.
*   Use the finetuned model for inference.




In [None]:
pip install transformers datasets evaluate



In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub. The training configuration later in this notebook
# sets push_to_hub=True and the trained model is uploaded with trainer.push_to_hub().
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load the dataset

In [None]:
from datasets import load_dataset, Audio

# Load the US-English ("en-US") configuration of MInDS-14.
# The `datasets` library warns that `trust_remote_code=True` will be mandatory for
# this dataset, so it is passed explicitly instead of relying on the default.
minds = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
    trust_remote_code=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
# Hold out 20% of the examples for evaluation. A fixed seed keeps the split (and
# therefore every downstream result) the same after a kernel restart.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the DatasetDict summary: the train/test splits with their features and row counts.
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 450
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 113
    })
})

In [None]:
# Keep only the audio and the intent label; the other metadata columns are not used.
unused_columns = ["path", "transcription", "english_transcription", "lang_id"]
minds = minds.remove_columns(unused_columns)

In [None]:
# Confirm that only the 'audio' and 'intent_class' columns remain in the training split.
minds["train"]

Dataset({
    features: ['audio', 'intent_class'],
    num_rows: 450
})

In [None]:
# Inspect a single training example; valid indices run from 0 to 449 (the split has 450 rows).
minds['train'][449]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~BUSINESS_LOAN/602b987c963e11ccd901cb9f.wav',
  'array': array([ 0.        ,  0.        ,  0.00024414, ...,  0.00024414,
         -0.00024414,  0.00024414]),
  'sampling_rate': 8000},
 'intent_class': 5}

Build mappings between label names and label ids so that a predicted label id can be translated back into a readable label name:

In [None]:
# The 'intent_class' feature carries the ordered list of intent names;
# a label id is the position of its name in this list.
labels = minds["train"].features["intent_class"].names
labels

['abroad',
 'address',
 'app_error',
 'atm_limit',
 'balance',
 'business_loan',
 'card_issues',
 'cash_deposit',
 'direct_debit',
 'freeze',
 'high_value_payment',
 'joint_account',
 'latest_transactions',
 'pay_bill']

In [None]:
# Two lookup tables between label names and their (stringified) ids.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [None]:
# Sanity check: id "0" should map back to the first name in `labels`.
id2label[str(0)]

'abroad'

# Preprocess the dataset

Feature Extractor

In [None]:
from transformers import AutoFeatureExtractor

# Load the feature extractor belonging to the same "facebook/wav2vec2-base"
# checkpoint that is fine-tuned later in this notebook.
# Alternative checkpoint (kept for reference):
# feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/wavlm-base-plus-sd")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")




Sampling Rate

In [None]:
# Read the sampling rate recorded on the dataset's audio feature.
audio_feature = minds["train"].features["audio"]
sampling_rate = audio_feature.sampling_rate
sampling_rate

8000

In [None]:
# Re-declare the audio column at 16 kHz so examples are returned at that rate
# (the feature extractor's sampling rate), then peek at the first training example.
resampled_audio = Audio(sampling_rate=16_000)
minds = minds.cast_column("audio", resampled_audio)
dataset = minds["train"]
dataset[0]

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/28aa727f91fee90575c34956bab09d1716cfaf460c6afcba86a10f04a7d58b83/en-US~HIGH_VALUE_PAYMENT/602b9e9d963e11ccd901cc28.wav',
  'array': array([-8.59331340e-06, -1.41524244e-04, -2.34635081e-04, ...,
          8.24356824e-03,  2.36976370e-02,  1.74179655e-02]),
  'sampling_rate': 16000},
 'intent_class': 10}

Preprocessing Function

In [None]:
# The feature extractor's configured sampling rate; it matches the 16 kHz rate
# the audio column was cast to.
feature_extractor.sampling_rate

16000

In [None]:
def preprocess_function(examples):
  """Turn a batch of dataset examples into model inputs.

  Args:
    examples: A batch from the dataset; ``examples["audio"]`` is a sequence of
      decoded audio entries, each exposing its waveform under ``"array"``.

  Returns:
    The output of ``feature_extractor`` for the batch's waveforms.
  """
  waveforms = [clip["array"] for clip in examples["audio"]]

  # max_length is counted in samples: with the extractor's 16000 Hz rate shown in
  # this notebook, longer clips are truncated to their first second.
  return feature_extractor(
      waveforms,
      sampling_rate=feature_extractor.sampling_rate,
      max_length=16000,
      truncation=True,
  )

Preprocess!

In [None]:
# Run the preprocessing over every split in batches, dropping the raw "audio"
# column, then rename the target column from "intent_class" to "label".
encoded_minds = minds.map(
    preprocess_function,
    remove_columns="audio",
    batched=True,
).rename_column("intent_class", "label")

# Uncomment to inspect one encoded example:
# encoded_minds["train"][0]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Evaluation metrics

In [None]:
import evaluate

# Load the "accuracy" metric; compute_metrics uses it to score the model's predictions.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np

Evaluation function

In [None]:
def compute_metrics(eval_pred):
  """Score a set of evaluation predictions with the accuracy metric.

  Args:
    eval_pred: Object exposing ``predictions`` (per-class scores, reduced with
      argmax over axis 1) and ``label_ids`` (the reference labels).

  Returns:
    The result of ``accuracy.compute`` for the predicted vs. reference ids.
  """
  predicted_ids = np.argmax(eval_pred.predictions, axis=1)
  return accuracy.compute(predictions=predicted_ids, references=eval_pred.label_ids)

# Training

Training function

In [None]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer

# One output class per intent label.
num_labels = len(id2label)

# Load the pretrained "facebook/wav2vec2-base" checkpoint with a classification head
# sized for the intent labels. Per the load warning, the classifier/projector weights
# are newly initialised, so they are only meaningful after fine-tuning.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels = num_labels,
    label2id = label2id,
    id2label = id2label,
    )

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training hyperparameters

In [None]:
! pip install accelerate -U



In [None]:
! pip install transformers[torch]



In [None]:
! pip install accelerate --upgrade



In [None]:
# Training configuration consumed by the Trainer below.
training_args = TrainingArguments(
    output_dir = "GSoC_wav2vec_DetectionOfIntonationalUnits",  # where checkpoints and the final model are written
    # NOTE(review): newer transformers releases rename this argument to `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy = "epoch",  # evaluate once per epoch
    save_strategy = "epoch",  # checkpoint once per epoch
    learning_rate = 3e-5,
    per_device_train_batch_size = 32,
    gradient_accumulation_steps = 4,  # 32 * 4 = 128 examples per optimizer step (per device)
    per_device_eval_batch_size = 32,
    num_train_epochs = 10,
    warmup_ratio = 0.1,
    logging_steps = 10,
    load_best_model_at_end = True,  # best checkpoint chosen by the metric named below
    metric_for_best_model = "accuracy",  # presumably the key returned by compute_metrics — confirm
    push_to_hub = True  # requires the Hugging Face login performed at the top of the notebook
)



In [None]:
# Bundle the model, the training configuration, the encoded train/test splits and
# the accuracy metric into a Trainer.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = encoded_minds["train"],
    eval_dataset = encoded_minds["test"],
    # The feature extractor is passed in the `tokenizer` slot — presumably so its
    # configuration is saved alongside the model; confirm against the Trainer docs.
    tokenizer = feature_extractor,
    compute_metrics = compute_metrics,
)

In [None]:
# Run fine-tuning; the table below reports training/validation loss and accuracy per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,No log,2.639764,0.070796
1,No log,2.640547,0.097345
2,2.634100,2.657977,0.053097
4,2.634100,2.660881,0.070796
5,2.616200,2.665167,0.070796
6,2.616200,2.665798,0.044248
8,2.613800,2.666549,0.044248


TrainOutput(global_step=30, training_loss=2.6213550567626953, metrics={'train_runtime': 5371.8028, 'train_samples_per_second': 0.838, 'train_steps_per_second': 0.006, 'total_flos': 3.26841433344e+16, 'train_loss': 2.6213550567626953, 'epoch': 8.0})

In [None]:
# Upload the trained model to the Hugging Face Hub (uses the login from the top of the notebook).
trainer.push_to_hub()

events.out.tfevents.1717825209.463da12eb84f.14749.0:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/prakritishetty/GSoC_wavLM_DetectionOfIntonationalUnits/commit/1bd02a404e9678ccc8bcdfadf5351b3e054b94b8', commit_message='End of training', commit_description='', oid='1bd02a404e9678ccc8bcdfadf5351b3e054b94b8', pr_url=None, pr_revision=None, pr_num=None)

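Completed finetuning of the model! Note that in the run logged above the validation accuracy stays between roughly 4% and 10%, which is about chance level for 14 intent classes (1/14 ≈ 7%), so this run demonstrates the workflow rather than a usable classifier.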

# Inference

In [None]:
from datasets import load_dataset, Audio

# Reload the en-US data to pick an audio file for the inference demo.
# `trust_remote_code=True` is passed explicitly, as requested by the `datasets` warning.
# NOTE: this is the full "train" split that the fine-tuning train/test subsets were
# drawn from, so the chosen example may have been seen during training.
dataset = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train",
    trust_remote_code=True,
)
# Declare the audio column at 16 kHz, the rate used for fine-tuning.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

sampling_rate = dataset.features["audio"].sampling_rate
audio_file = dataset[0]["audio"]["path"]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [None]:
from transformers import pipeline

# Load the fine-tuned model from the directory the Trainer wrote to. Using
# `training_args.output_dir` keeps this cell in sync with the training configuration
# instead of hard-coding a directory name that can drift from it.
classifier = pipeline(
    "audio-classification",
    model = training_args.output_dir,
)

# Classify the selected audio file; returns the top labels with their scores.
classifier(audio_file)

[{'score': 0.08367979526519775, 'label': 'cash_deposit'},
 {'score': 0.07557357102632523, 'label': 'freeze'},
 {'score': 0.07528286427259445, 'label': 'app_error'},
 {'score': 0.07527675479650497, 'label': 'pay_bill'},
 {'score': 0.0749484971165657, 'label': 'direct_debit'}]

In [None]:
import torch

# Build the model inputs from the raw waveform of the selected example, as PyTorch
# tensors, and place them on the same device as the model.
inputs = feature_extractor(
    dataset[0]["audio"]["array"],
    sampling_rate = sampling_rate,
    return_tensors = "pt",
)
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Inference only: switch to eval mode and skip gradient tracking.
model.eval()
with torch.no_grad():
  logits = model(**inputs).logits

NameError: name 'torch' is not defined

In [None]:
import torch

# The index of the largest logit is the predicted class id; translate it to its
# label name through the model configuration.
predicted_class_ids = logits.argmax().item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label