In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 7.3 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 9.0 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 58.1 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 67.7 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 58.3 MB/s 
Collecting multiprocess
  Downloading multiproce

## Importing Model and libs 
The Wav2Vec2 model from Hugging Face Transformers is used.

In [2]:
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Loading Dataset 

*   The model is fine-tuned on the `librispeech_asr` dataset.
*   Only data for 4 speakers has been loaded to test the pipeline.
*   The dataset is loaded as a stream since the full training dataset is large.

In [4]:
# Load the split in streaming mode, since the full training set is large.
dataset_streamed = load_dataset(
    "librispeech_asr",
    split="train.clean.100",
    streaming=True,
)
# Materialise only the first 463 utterances; per the speaker counts computed
# in the next section these cover four speakers (127 + 115 + 113 + 108).
dataset = [example for example in dataset_streamed.take(463)]
dataset[0]

Downloading builder script:   0%|          | 0.00/2.62k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]



{'audio': {'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
         -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
  'path': '374-180298-0000.flac',
  'sampling_rate': 16000},
 'chapter_id': 180298,
 'file': '374-180298-0000.flac',
 'id': '374-180298-0000',
 'speaker_id': 374,
 'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED'}

## Preparing Train and Test Dataset

In [6]:
# Gather every utterance's speaker_id, then tally how many clips each speaker has
speaker_id = [example['speaker_id'] for example in dataset]
speaker_id_df = pd.DataFrame(speaker_id)
speaker_id_df.value_counts()

3240    127
7800    115
374     113
2514    108
dtype: int64

In [10]:
# distinct speaker_ids present in the loaded sample
speaker_id_df.loc[:, 0].unique()

array([ 374, 7800, 2514, 3240])

In [11]:
train_dataset, test_dataset = [], []
speaker_id2label = {}

# Labels are 1-based: the n-th distinct speaker gets label n.
for label, speaker_id in enumerate(speaker_id_df[0].unique(), start=1):
  speaker_id2label[speaker_id] = label
  # All waveforms of this speaker, in dataset order
  speaker_clips = [
      example['audio']['array']
      for example in dataset
      if example['speaker_id'] == speaker_id
  ]
  # First 100 clips per speaker go to train, any remainder goes to test
  train_dataset.extend({'label': label, 'audio': clip} for clip in speaker_clips[:100])
  test_dataset.extend({'label': label, 'audio': clip} for clip in speaker_clips[100:])

In [12]:
# Display the speaker_id -> class label mapping built during the split
speaker_id2label

{374: 1, 2514: 3, 3240: 4, 7800: 2}

In [13]:
# Sanity check: inspect the container type of the stored audio for one test clip
type(test_dataset[0]['audio'])

numpy.ndarray

In [14]:
# Report how many clips ended up in each split
split_summary = f"""
Number of training samples: {len(train_dataset)}
Number of test samples: {len(test_dataset)}
"""
print(split_summary)


Number of training samples: 400
Number of test samples: 63



In [15]:
# Tabular view of the training split: one row per clip with its label and waveform
train_df = pd.DataFrame(data=train_dataset)
train_df.head(n=5)

Unnamed: 0,label,audio
0,1,"[0.000701904296875, 0.000732421875, 0.00073242..."
1,1,"[-9.1552734375e-05, -0.000152587890625, -0.000..."
2,1,"[-0.000244140625, -0.000244140625, -0.00018310..."
3,1,"[-0.000244140625, -0.000396728515625, -0.00057..."
4,1,"[0.000274658203125, 0.00030517578125, 0.000213..."


In [16]:
# Tabular view of the held-out split: one row per clip with its label and waveform
test_df = pd.DataFrame(data=test_dataset)
test_df.head(n=5)

Unnamed: 0,label,audio
0,1,"[0.0, -3.0517578125e-05, 0.0, -6.103515625e-05..."
1,1,"[-0.000457763671875, -0.00048828125, -0.000457..."
2,1,"[0.000579833984375, 0.0003662109375, 0.0001525..."
3,1,"[-0.000701904296875, -0.00054931640625, -0.000..."
4,1,"[9.1552734375e-05, 3.0517578125e-05, 9.1552734..."


## Loading pretrained Model and FeatureExtractor

In [17]:
# Hub identifier shared by the model weights and the preprocessing config
# (the name suggests a SUPERB speaker-identification checkpoint — TODO confirm on the model card)
checkpoint = "superb/wav2vec2-base-superb-sid"
# Sequence-classification model initialised from the pretrained checkpoint
model = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint)
# Matching feature extractor; collate_batch uses it to pad raw audio into model inputs
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

Downloading config.json:   0%|          | 0.00/53.6k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading pytorch_model.bin:   0%|          | 0.00/362M [00:00<?, ?B/s]

Downloading preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

In [21]:
# Move the model onto the selected device; as the cell's last expression this also displays the architecture
model.to(device)

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), strid

## Preparing DataLoader
*   DataLoaders are prepared for both the train and test datasets.
*   They make it easy to iterate over the data and load it in the format the model expects.
*   A custom collate function performs dynamic padding: each batch is padded only to the length of its longest sequence.
*   This reduces the memory requirement compared to uniform padding.


 

In [18]:
def collate_batch(batch):
  """Collate ``(label, audio)`` pairs into one dynamically padded model batch.

  Args:
    batch: sequence of ``(label, audio)`` tuples, where ``label`` is an
      integer class id and ``audio`` is a raw waveform. The waveforms are
      passed to the feature extractor with ``sampling_rate=16000``.

  Returns:
    dict with three entries:
      * ``input_values``  - waveforms padded to the longest clip in the batch
      * ``attention_mask`` - mask returned by the feature extractor
      * ``labels``        - 1-D ``torch.long`` tensor of length ``len(batch)``
  """
  audio_list = [audio for _, audio in batch]
  label_list = [label for label, _ in batch]

  # padding='longest' pads only up to the longest clip of *this* batch
  # (dynamic padding) instead of a global maximum length.
  audio_features = feature_extractor(
      audio_list, sampling_rate=16000, padding='longest', return_tensors="pt"
  )

  return {
      'input_values': audio_features['input_values'],
      'attention_mask': audio_features['attention_mask'],
      # Build the label tensor directly with an explicit int64 dtype rather
      # than via np.array(...), whose default integer width is platform
      # dependent; the classification loss expects long targets.
      'labels': torch.tensor(label_list, dtype=torch.long),
  }

In [19]:
from torch.utils.data import DataLoader

# Re-pack each split as (label, audio) tuples, the layout collate_batch unpacks
train_dataset = list(zip(train_df['label'].values, train_df['audio'].values))
test_dataset = list(zip(test_df['label'].values, test_df['audio'].values))

# Only the training loader shuffles; both use batches of 8 and the custom
# collate function.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=8,
    collate_fn=collate_batch,
)

In [31]:
# for data in train_dataloader:
  # print(data)
  # print(data['input_values'].shape)
  # print(data['attention_mask'].shape)
  # print(data['labels'].shape)
  # break

## Training

### Optimizer

In [22]:
from torch.optim import AdamW

# transformers.AdamW is deprecated (and removed in newer releases) in favour
# of the PyTorch implementation. eps and weight_decay are passed explicitly
# because torch's defaults (1e-8 / 0.01) differ from the transformers
# defaults (1e-6 / 0.0) that the previous call relied on.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



### Learning rate Scheduler

In [23]:
from transformers import get_scheduler

num_epochs = 3
# The scheduler is stepped once per batch, so it needs the total batch count
num_training_steps = len(train_dataloader) * num_epochs
# "linear" schedule with zero warmup steps
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

150


### Training loop
1. Fetching a batch
2. Forward Pass
3. Computing loss
4. Calculating gradients of the loss w.r.t. the params
5. Updating weights using gradients
6. step of learning rate scheduler
7. Resetting gradients to zero to prevent accumulation 

In [24]:
from tqdm.auto import tqdm

# One tick per optimizer step across all epochs
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    print(f'Epoch {epoch}')
    # Fetching a batch.
    # The counter is called `step`, not `iter`: a top-level loop variable named
    # `iter` would rebind the builtin for every later cell in the notebook.
    for step, batch in enumerate(train_dataloader, start=1):
        batch = {k: v.to(device) for k, v in batch.items()}
        # Forward pass; the batch includes labels and the model output exposes the loss
        outputs = model(**batch)
        # Computing loss
        loss = outputs.loss
        print(f'loss: {loss.item()}, iter: {step}')
        # Calculating gradients of the loss wrt the params
        loss.backward()
        # Updating weights using gradients
        optimizer.step()
        # Step of learning rate scheduler
        lr_scheduler.step()
        # Resetting gradients to zero to prevent accumulation
        optimizer.zero_grad()
        progress_bar.update(1)

progress_bar.close()

  0%|          | 0/150 [00:00<?, ?it/s]

Epoch 0
loss: 31.8498477935791, iter: 1
loss: 9.170082092285156, iter: 2
loss: 2.6650099754333496, iter: 3
loss: 2.290909767150879, iter: 4
loss: 3.310251474380493, iter: 5
loss: 4.723704814910889, iter: 6
loss: 0.04010586068034172, iter: 7
loss: 4.5175715058576316e-05, iter: 8
loss: 1.251634876098251e-05, iter: 9
loss: 0.3419243395328522, iter: 10
loss: 0.0006680600927211344, iter: 11
loss: 4.885741233825684, iter: 12
loss: 0.0013152705505490303, iter: 13
loss: 2.968977451324463, iter: 14
loss: 0.6133324503898621, iter: 15
loss: 2.190458189943456e-06, iter: 16
loss: 0.0, iter: 17
loss: 0.0, iter: 18
loss: 0.0, iter: 19
loss: 0.0, iter: 20
loss: 0.6269866228103638, iter: 21
loss: 0.0, iter: 22
loss: 0.0, iter: 23
loss: 0.0, iter: 24
loss: 0.0, iter: 25
loss: 0.0, iter: 26
loss: 0.0, iter: 27
loss: 4.470347647611561e-08, iter: 28
loss: 1.9371501025489124e-07, iter: 29
loss: 3.1739070891489973e-06, iter: 30
loss: 0.0, iter: 31
loss: 1.3411042232291948e-07, iter: 32
loss: 0.00218168413266

## Evaluation


In [25]:
# Per-batch results are collected here and stitched together after the loop
label_chunks, prediction_chunks = [], []

model.eval()
# No gradients are needed for evaluation
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits

        label_chunks.append(batch['labels'].flatten().cpu().numpy())
        # Predicted class = index of the largest logit
        prediction_chunks.append(logits.argmax(dim=-1).flatten().cpu().numpy())

labels = np.concatenate(label_chunks, axis=0)
predictions = np.concatenate(prediction_chunks, axis=0)
# Fraction of test clips whose predicted label matches the true label
accuracy = (labels == predictions).sum() / len(predictions)
print(f"Accuracy: {accuracy*100}")

Accuracy: 100.0
