In [6]:
# User input:
test_speaker = "M01"
ngram_order = 3
target_lang="en"
text_count_threshold = 40
model_user = "macarious"
model_repo = f"torgo_xlsr_finetune_{test_speaker}"
model_repo_path = f"{model_user}/{model_repo}"

kenlm_model_user = "macarious"
kenlm_model_repo = f"europarl_bilingual_kenlm_{ngram_order}-gram"
kenlm_model_repo_path= f"{kenlm_model_user}/{kenlm_model_repo}"
if ngram_order == 1:
  kenlm_model = ""
else:
  kenlm_model = f"output_model.klm_trigram_raw.bin"

In [7]:
import re
import torch
import zipfile
import pandas as pd

from huggingface_hub import Repository
from datasets import load_dataset, DatasetDict, Dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from tqdm import tqdm
from evaluate import load
from datetime import datetime
from google.colab import files

In [8]:
# # Download the trained model
processor = Wav2Vec2Processor.from_pretrained(model_repo_path)
model = Wav2Vec2ForCTC.from_pretrained(model_repo_path)

In [9]:
lm_local_path = f"kenlm_model_{ngram_order}gram_words_europarl"
lm_repo = Repository(local_dir=lm_local_path, clone_from=kenlm_model_repo_path)
lm_repo.git_pull()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/macarious/europarl_bilingual_kenlm_3-gram into local empty directory.


Download file 3gram.bin:   0%|          | 3.45k/370M [00:00<?, ?B/s]

Clean file 3gram.bin:   0%|          | 1.00k/370M [00:00<?, ?B/s]

In [10]:
# Read the dataset
data_df = pd.read_csv('torgo.csv')
dataset_csv = load_dataset('csv', data_files='torgo.csv')

speakers = data_df['speaker_id'].unique()

print(f'Speakers: {", ".join(speakers)}')

Speakers: F01, F03, F04, FC01, FC02, FC03, M01, M02, M03, M04, M05, MC01, MC02, MC03, MC04


In [6]:
# Split data into train, valid, test sets
valid_speaker = 'F03' if test_speaker != 'F03' else 'F04'
train_speaker = [s for s in speakers if s not in [test_speaker, valid_speaker]]

torgo_dataset = DatasetDict()
torgo_dataset['train'] = dataset_csv['train'].filter(lambda x: x in train_speaker, input_columns=['speaker_id'])
torgo_dataset['validation'] = dataset_csv['train'].filter(lambda x: x == valid_speaker, input_columns=['speaker_id'])
torgo_dataset['test'] = dataset_csv['train'].filter(lambda x: x == test_speaker, input_columns=['speaker_id'])

torgo_dataset

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16394 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 14580
    })
    validation: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 1075
    })
    test: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 739
    })
})

In [7]:
# Count the number of times the text has been spoken in each of the 'train',
# 'validation', and 'test' sets. Remove text according to the
# text_count_threshold from a previous cell.
unique_texts = set(torgo_dataset['train'].unique(column='text')) | set(torgo_dataset['validation'].unique(column='text')) | set(torgo_dataset['test'].unique(column='text'))
unique_texts_count = {}

for text in unique_texts:
  unique_texts_count[text] = {'train_validation': 0, 'test': 0}

for text in torgo_dataset['train']['text']:
  unique_texts_count[text]['train_validation'] += 1

for text in torgo_dataset['validation']['text']:
  unique_texts_count[text]['train_validation'] += 1

for text in torgo_dataset['test']['text']:
  unique_texts_count[text]['test'] += 1

texts_to_keep_in_train_validation = []
texts_to_keep_in_test = []
for text in unique_texts_count:
  if unique_texts_count[text]['train_validation'] < text_count_threshold and unique_texts_count[text]['test'] > 0:
    texts_to_keep_in_test.append(text)
  else:
    texts_to_keep_in_train_validation.append(text)

original_data_count = {'train': len(torgo_dataset['train']), 'validation': len(torgo_dataset['validation']), 'test': len(torgo_dataset['test'])}

# Update the three dataset splits
torgo_dataset['train'] = torgo_dataset['train'].filter(lambda x: x['text'] in texts_to_keep_in_train_validation)
torgo_dataset['validation'] = torgo_dataset['validation'].filter(lambda x: x['text'] in texts_to_keep_in_train_validation)
torgo_dataset['test'] = torgo_dataset['test'].filter(lambda x: x['text'] in texts_to_keep_in_test)

print(f'Train:       {len(torgo_dataset["train"])}/{original_data_count["train"]} ({len(torgo_dataset["train"]) * 100 // original_data_count["train"]}%)')
print(f'Validation:  {len(torgo_dataset["validation"])}/{original_data_count["validation"]} ({len(torgo_dataset["validation"]) * 100 // original_data_count["validation"]}%)')
print(f'Test:        {len(torgo_dataset["test"])}/{original_data_count["test"]} ({len(torgo_dataset["test"]) * 100 // original_data_count["test"]}%)')

print()
torgo_dataset

Flattening the indices:   0%|          | 0/14580 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/1075 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/739 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14580 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1075 [00:00<?, ? examples/s]

Filter:   0%|          | 0/739 [00:00<?, ? examples/s]

Train:       8984/14580 (61%)
Validation:  582/1075 (54%)
Test:        519/739 (70%)



DatasetDict({
    train: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 8984
    })
    validation: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 582
    })
    test: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 519
    })
})

In [8]:
# Functions to process data:

# Remove special characters and convert all text into lowercase
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"0-9]'
sampling_rate=16000

def remove_special_characters(batch):
    batch['text'] = re.sub(chars_to_ignore_regex, ' ', batch['text']).lower()
    return batch

def prepare_torgo_dataset(batch):
    # Load audio data into batch
    audio = batch['audio']

    # Extract values
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Encode to label ids
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids

    return batch

In [9]:
def evaluateModel(processor, model, dataset, lm_model_path=None):

  predictions = []
  references = []

  if not lm_model_path:
    for i in tqdm(range(dataset.num_rows)):
      inputs = processor(dataset[i]["input_values"], sampling_rate=sampling_rate, return_tensors="pt")
      with torch.no_grad():
        logits = model(**inputs).logits
      predicted_ids = torch.argmax(logits, dim=-1)
      transcription = processor.batch_decode(predicted_ids)

      predictions.append(transcription[0].lower())
      references.append(dataset[i]["text"])

  else:
    vocab_dict = processor.tokenizer.get_vocab()
    sorted_vocab_dict = {k: v for k, v in sorted(
        vocab_dict.items(), key=lambda item: item[1])}

    unigrams = set()

    with open(f"{lm_local_path}/unigrams.txt", "r") as f:
      for line in f:
        line = line.strip()
        unigrams.add(line)

    # Implement language model in the decoder
    decoder = build_ctcdecoder(
        labels=list(sorted_vocab_dict.keys()),
        kenlm_model_path=lm_model_path if ngram_order > 1 else None,
        unigrams=unigrams
    )

    # Build new processor with new decoder
    processor = Wav2Vec2ProcessorWithLM(
        feature_extractor=processor.feature_extractor,
        tokenizer=processor.tokenizer,
        decoder=decoder
    )

    # Transcripe the audio
    for i in tqdm(range(dataset.num_rows)):
      inputs = processor(dataset[i]["input_values"], sampling_rate=sampling_rate, return_tensors="pt")
      with torch.no_grad():
        logits = model(**inputs).logits

      transcription = processor.batch_decode(logits.numpy()).text

      predictions.append(transcription[0].lower())
      references.append(dataset[i]["text"])

  # Calculate the wer score
  wer = load("wer")
  wer_score = wer.compute(predictions=predictions, references=references)

  return wer_score, predictions, references

In [10]:
torgo_test_set = torgo_dataset['test']
'''
  ******************** For debugging ********************
'''
# torgo_test_set = torgo_test_set.select(range(50))
'''
  ******************** For debugging ********************
'''

# Remove special characters
torgo_test_set = torgo_test_set.map(remove_special_characters)

# Filter audio within a certain length
torgo_test_set = torgo_test_set.cast_column("audio", Audio(sampling_rate=sampling_rate))
torgo_test_set = torgo_test_set.map(
  prepare_torgo_dataset,
  remove_columns=['session', 'audio', 'speaker_id'],
  num_proc=4)

min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0
torgo_test_set = torgo_test_set.filter(lambda x: x < max_input_length_in_sec * sampling_rate, input_columns=["input_length"])
torgo_test_set = torgo_test_set.filter(lambda x: x > min_input_length_in_sec * sampling_rate, input_columns=["input_length"])

print()
torgo_test_set

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/519 [00:00<?, ? examples/s]

Filter:   0%|          | 0/519 [00:00<?, ? examples/s]

Filter:   0%|          | 0/489 [00:00<?, ? examples/s]




Dataset({
    features: ['text', 'input_values', 'input_length', 'labels'],
    num_rows: 489
})

In [11]:
wer_score_no_lm, predictions_no_lm, references_no_lm = evaluateModel(processor, model, torgo_test_set)

print(f"WER (no LM): {wer_score_no_lm}")

100%|██████████| 489/489 [05:19<00:00,  1.53it/s]


WER (no LM): 0.8779201205727204


In [12]:
wer_score_lm, predictions_lm, references_lm = evaluateModel(processor, model, torgo_test_set, f"{lm_local_path}/{kenlm_model}")

print(f"WER ({ngram_order}-gram): {wer_score_lm}")

Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.
Only 45 unigrams passed as vocabulary. Is this small or artificial data?
100%|██████████| 489/489 [26:01<00:00,  3.19s/it]


WER (3-gram): 0.9140919366993218


In [14]:
unigrams = set()

with open(f"{lm_local_path}/unigrams.txt", "r") as f:
  for line in f:
    line = line.strip()
    unigrams.add(line)

print(len(set("".join(unigrams))))
print(set("".join(unigrams)))
print(unigrams)

30
{'/', 'F', 'U', 'V', '_', 'P', 'D', 'S', 'G', 'C', 'B', 'H', 'Z', 'W', 'L', 'R', 'K', 'A', '>', 'M', '<', 's', 'J', 'I', 'N', 'E', 'O', 'Y', ',', 'T'}
{'JH', '_T', 'F', 'UH', 'AW', 'AA', 'V', '_', 'CH', 'IH', '_S', 'P', 'D', 'IY', '</s>', 'AH', 'S', 'TH', 'UW', '<s>', 'G', 'EY', 'ZH', 'HH', 'DH', 'B', 'NG', 'AY', 'L', 'W', 'Z', 'R', 'SH', 'K', 'OW', 'EH', 'M', 'N', 'AO', 'AE', 'ER', 'Y', ',', 'T', 'OY'}


In [15]:
import csv

# Save results to a csv file
with open(f"results_{ngram_order}gram_{test_speaker}.txt", "w") as csv_file:
  csv_writer = csv.writer(csv_file)
  csv_writer.writerow(["Prediction (no LM)", f"Prediction ({ngram_order}-gram)", "Reference"])
  for i in range(len(predictions_no_lm)):
    csv_writer.writerow([predictions_no_lm[i], predictions_lm[i], references_lm[i]])

# Display as dataframe
results_df = pd.read_csv(f"results_{ngram_order}gram_{test_speaker}.txt")
results_df.head(20)

Unnamed: 0,Prediction (no LM),Prediction (3-gram),Reference
0,traite,traite,trait
1,trabel,trabl,trouble
2,feer,feer,fee
3,dresbuy rig balu niveringwat,dresbuly rigebalnivrin gweate,grandfather likes to be modern in his language
4,rade,rade,raid
5,a glib inding the wandor all oure inaatde,aglib indingthe wandor oure aade,except in the winter when the ooze or snow or ...
6,a ung floing be clam destin,aungflowing stin,a long flowing beard clings to his chin
7,feer,feer,fair
8,tick,tick,tip
9,bable,bable,bubble


In [None]:
# Save wer to a csv file
with open(f"wer_{ngram_order}gram_{test_speaker}.txt", "w") as csv_file:
  csv_writer = csv.writer(csv_file)
  csv_writer.writerow(["Language Model", "WER"])
  csv_writer.writerow(["None", wer_score_no_lm])
  csv_writer.writerow([f"{ngram_order}-gram", wer_score_lm])

# Display as dataframe
results_wer_df = pd.read_csv(f"wer_{ngram_order}gram_{test_speaker}.txt")
results_wer_df.head(20)

In [None]:
# Create a string of current date
current_date = datetime.now().strftime("%Y-%m-%d")

# Zip the results into a single file for download
output_zip_path = f"results_with_LM_{test_speaker}_{current_date}.zip"
with zipfile.ZipFile(output_zip_path, "w") as zip_file:
  zip_file.write(f"results_{ngram_order}gram_{test_speaker}.txt")
  zip_file.write(f"wer_{ngram_order}gram_{test_speaker}.txt")

files.download(output_zip_path)