In [1]:
# User input: everything a reader may want to tune for one evaluation run.
test_speaker = "F01"          # speaker used as the test split; also selects the model repo
ngram_order = 3               # order of the KenLM model (1 = unigram list only)
target_lang = "en"
text_count_threshold = 40     # prompts spoken fewer times than this in train+validation stay in test

# Hugging Face repository of the fine-tuned acoustic model.
model_user = "macarious"
model_repo = f"torgo_xlsr_finetune_{test_speaker}"
model_repo_path = f"{model_user}/{model_repo}"

# Hugging Face repository of the KenLM language model.
kenlm_model_user = "macarious"
kenlm_model_repo = f"europarl_bilingual_kenlm_{ngram_order}-gram"
kenlm_model_repo_path = f"{kenlm_model_user}/{kenlm_model_repo}"

# A unigram "model" has no binary file, hence the empty file name for order 1.
kenlm_model = f"{ngram_order}gram.bin" if ngram_order != 1 else ""

In [2]:
# Execute the shared evaluation notebook before anything else in this one.
# NOTE(review): presumably it defines helpers or state that later cells rely
# on -- confirm what it exports, since that dependency is invisible here.
%run 30_eval_common.ipynb

In [3]:
import csv
import re
import zipfile
from collections import Counter
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional, Union

import pandas as pd
import torch
from datasets import load_dataset, DatasetDict, Dataset, Audio
from evaluate import load
from google.colab import files
from huggingface_hub import Repository
from pyctcdecode import build_ctcdecoder
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

  from IPython.utils import traitlets as _traitlets


In [4]:
# Load the fine-tuned processor and CTC model from the Hugging Face repo
# named by model_repo_path (built in the user-input cell).
processor = Wav2Vec2Processor.from_pretrained(model_repo_path)
model = Wav2Vec2ForCTC.from_pretrained(model_repo_path)

In [5]:
# Local directory expected to hold the language-model artifacts for this
# n-gram order; later cells read `unigrams.txt` and the `kenlm_model` file
# from it.
lm_local_path = f"kenlm_model_{ngram_order}gram_words_europarl"
# The clone/pull of kenlm_model_repo_path is disabled, so the directory must
# already exist before the evaluation cells run. Re-enable to fetch it:
# lm_repo = Repository(local_dir=lm_local_path, clone_from=kenlm_model_repo_path)
# lm_repo.git_pull()

In [6]:
# Read the TORGO index twice: as a DataFrame (to enumerate the speakers) and
# as a Hugging Face dataset (the source of the splits built afterwards).
torgo_csv_path = 'torgo.csv'
data_df = pd.read_csv(torgo_csv_path)
dataset_csv = load_dataset('csv', data_files=torgo_csv_path)

speakers = data_df['speaker_id'].unique()

speaker_listing = ", ".join(speakers)
print(f'Speakers: {speaker_listing}')

Speakers: F01, F03, F04, FC01, FC02, FC03, M01, M02, M03, M04, M05, MC01, MC02, MC03, MC04


In [7]:
# Speaker-wise split: one speaker for test, one for validation, the remaining
# speakers for training. F03 validates unless it is the test speaker, in
# which case F04 takes over.
if test_speaker == 'F03':
  valid_speaker = 'F04'
else:
  valid_speaker = 'F03'

held_out_speakers = [test_speaker, valid_speaker]
train_speaker = [spk for spk in speakers if spk not in held_out_speakers]

all_rows = dataset_csv['train']
torgo_dataset = DatasetDict({
  'train': all_rows.filter(lambda spk: spk in train_speaker, input_columns=['speaker_id']),
  'validation': all_rows.filter(lambda spk: spk == valid_speaker, input_columns=['speaker_id']),
  'test': all_rows.filter(lambda spk: spk == test_speaker, input_columns=['speaker_id']),
})

torgo_dataset

DatasetDict({
    train: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 15091
    })
    validation: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 1075
    })
    test: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 228
    })
})

In [8]:
# Count how often each prompt text is spoken in train+validation versus test,
# then make the splits disjoint by text: a text stays in the test split only
# if it occurs there and is spoken fewer than text_count_threshold times in
# train+validation; every other text stays in train/validation only.
train_validation_counts = Counter(torgo_dataset['train']['text'])
train_validation_counts.update(torgo_dataset['validation']['text'])
test_counts = Counter(torgo_dataset['test']['text'])

unique_texts = set(train_validation_counts) | set(test_counts)
unique_texts_count = {
  text: {'train_validation': train_validation_counts[text], 'test': test_counts[text]}
  for text in unique_texts
}

texts_to_keep_in_train_validation = []
texts_to_keep_in_test = []
for text, counts in unique_texts_count.items():
  if counts['train_validation'] < text_count_threshold and counts['test'] > 0:
    texts_to_keep_in_test.append(text)
  else:
    texts_to_keep_in_train_validation.append(text)

original_data_count = {split: len(torgo_dataset[split]) for split in ('train', 'validation', 'test')}

# Sets give constant-time membership tests; the filters below run the test
# once per row, so list lookups would scale with rows x distinct texts.
keep_in_train_validation = set(texts_to_keep_in_train_validation)
keep_in_test = set(texts_to_keep_in_test)

# Update the three dataset splits (only the 'text' column is handed to the
# predicate, matching the speaker filters used to build the splits).
torgo_dataset['train'] = torgo_dataset['train'].filter(lambda text: text in keep_in_train_validation, input_columns=['text'])
torgo_dataset['validation'] = torgo_dataset['validation'].filter(lambda text: text in keep_in_train_validation, input_columns=['text'])
torgo_dataset['test'] = torgo_dataset['test'].filter(lambda text: text in keep_in_test, input_columns=['text'])

for split_label in ('Train', 'Validation', 'Test'):
  split_name = split_label.lower()
  kept = len(torgo_dataset[split_name])
  total = original_data_count[split_name]
  # An empty split would otherwise raise ZeroDivisionError here.
  percent = kept * 100 // total if total else 0
  print(f'{split_label + ":":<13}{kept}/{total} ({percent}%)')

print()
torgo_dataset

Filter:   0%|          | 0/15091 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1075 [00:00<?, ? examples/s]

Filter:   0%|          | 0/228 [00:00<?, ? examples/s]

Train:       13529/15091 (89%)
Validation:  957/1075 (89%)
Test:        134/228 (58%)



DatasetDict({
    train: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 13529
    })
    validation: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 957
    })
    test: Dataset({
        features: ['session', 'audio', 'text', 'speaker_id'],
        num_rows: 134
    })
})

In [9]:
# Functions to process data:

# Punctuation and digits that are replaced by a space before scoring.
# Written as a raw string: the previous non-raw literal relied on invalid
# escape sequences such as "\," and "\?", which newer Python versions warn
# about. The set of matched characters is unchanged.
chars_to_ignore_regex = r'[,?.!\-;:"0-9]'
sampling_rate=16000

def remove_special_characters(batch):
    """Normalise the transcript of one example.

    Every character matched by `chars_to_ignore_regex` is replaced by a single
    space and the result is lowercased. Apostrophes and other characters are
    kept as they are.

    Args:
        batch: mapping with a 'text' string; it is updated in place.

    Returns:
        The same `batch` object with the cleaned 'text'.
    """
    batch['text'] = re.sub(chars_to_ignore_regex, ' ', batch['text']).lower()
    return batch

def prepare_torgo_dataset(batch):
    """Turn one example into model inputs and label ids.

    Reads the decoded audio under batch['audio'] and the transcript under
    batch['text'], and uses the notebook-level `processor` for both the
    feature extraction and the tokenisation.

    Adds to `batch`:
        input_values: first entry of the processor's `input_values`.
        input_length: length of `input_values`.
        labels: token ids of the transcript.

    Returns:
        The same `batch`, updated in place.
    """
    waveform = batch['audio']

    # Feature extraction at the sampling rate stored with the audio itself.
    features = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    input_values = features.input_values[0]
    batch["input_values"] = input_values
    batch["input_length"] = len(input_values)

    # Transcript -> label ids via the processor's tokenizer.
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids

    return batch

In [10]:
def evaluateModel(processor, model, dataset, lm_model_path=None):
  """Transcribe every example of `dataset` and score the output with WER.

  Besides its arguments, the function reads the notebook-level names
  `sampling_rate`, `lm_local_path` and `ngram_order`.

  Args:
    processor: processor supplying the feature extractor and tokenizer.
    model: CTC model called as `model(**inputs)`; its `.logits` are decoded.
    dataset: row-indexable dataset exposing `num_rows`, whose rows carry
      "input_values" and "text".
    lm_model_path: optional path to a KenLM model file. When falsy, decoding
      is greedy (argmax over the logits). Otherwise a pyctcdecode decoder is
      built from `{lm_local_path}/unigrams.txt`; the KenLM file itself is only
      handed to the decoder when `ngram_order` is greater than 1.

  Returns:
    Tuple `(wer_score, predictions, references)`; predictions are lowercased,
    references are the dataset's "text" values.
  """
  use_lm = bool(lm_model_path)

  if use_lm:
    # Decoder labels must follow the tokenizer's id order.
    vocab_dict = processor.tokenizer.get_vocab()
    labels = [token for token, _ in sorted(vocab_dict.items(), key=lambda item: item[1])]

    # The unigram list contains non-ASCII words, so the encoding is pinned
    # rather than left to the platform default.
    with open(f"{lm_local_path}/unigrams.txt", "r", encoding="utf-8") as f:
      unigrams = {line.strip() for line in f}

    # Implement language model in the decoder
    decoder = build_ctcdecoder(
        labels=labels,
        kenlm_model_path=lm_model_path if ngram_order > 1 else None,
        unigrams=unigrams
    )

    # Build new processor with new decoder
    processor = Wav2Vec2ProcessorWithLM(
        feature_extractor=processor.feature_extractor,
        tokenizer=processor.tokenizer,
        decoder=decoder
    )

  predictions = []
  references = []

  # Transcribe the audio
  for i in tqdm(range(dataset.num_rows)):
    # Fetch the row once; each dataset[i] access materialises the whole row.
    example = dataset[i]
    inputs = processor(example["input_values"], sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():
      logits = model(**inputs).logits

    if use_lm:
      # LM decoding works on the raw logits and returns an object with `.text`.
      transcription = processor.batch_decode(logits.numpy()).text
    else:
      # Greedy decoding: most likely token per frame.
      predicted_ids = torch.argmax(logits, dim=-1)
      transcription = processor.batch_decode(predicted_ids)

    predictions.append(transcription[0].lower())
    references.append(example["text"])

  # Calculate the wer score
  wer = load("wer")
  wer_score = wer.compute(predictions=predictions, references=references)

  return wer_score, predictions, references

In [11]:
torgo_test_set = torgo_dataset['test']

# ******************** For debugging ********************
# torgo_test_set = torgo_test_set.select(range(50))
# ******************** For debugging ********************

# Remove special characters
torgo_test_set = torgo_test_set.map(remove_special_characters)

# Keep multi-word prompts at 'sentence' level, single-word prompts otherwise.
level='sentence'

def _has_wanted_word_count(example):
    """Return True when the example's word count matches the chosen `level`."""
    word_count = len(example["text"].split())
    if level == 'sentence':
        return word_count > 1
    return word_count == 1

torgo_test_set = torgo_test_set.filter(_has_wanted_word_count)

# Decode the audio at the target sampling rate and compute the model inputs.
torgo_test_set = (
    torgo_test_set
    .cast_column("audio", Audio(sampling_rate=sampling_rate))
    .map(
        prepare_torgo_dataset,
        remove_columns=['session', 'audio', 'speaker_id'],
        num_proc=4)
)

# Filter audio within a certain length (bounds are exclusive, in seconds).
min_input_length_in_sec = 1.0
max_input_length_in_sec = 10.0
torgo_test_set = (
    torgo_test_set
    .filter(lambda n_samples: n_samples < max_input_length_in_sec * sampling_rate, input_columns=["input_length"])
    .filter(lambda n_samples: n_samples > min_input_length_in_sec * sampling_rate, input_columns=["input_length"])
)

print()
torgo_test_set

Map:   0%|          | 0/134 [00:00<?, ? examples/s]

Filter:   0%|          | 0/134 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/40 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36 [00:00<?, ? examples/s]




Dataset({
    features: ['text', 'input_values', 'input_length', 'labels'],
    num_rows: 36
})

In [12]:
# Baseline: no lm_model_path is passed, so evaluateModel decodes greedily
# (argmax over the logits) without a language model.
wer_score_no_lm, predictions_no_lm, references_no_lm = evaluateModel(processor, model, torgo_test_set)

print(f"WER (no LM): {wer_score_no_lm}")

100%|██████████| 36/36 [38:04<00:00, 63.45s/it]


WER (no LM): 0.7515723270440252


In [13]:
# Same test set, decoded with the language model found under lm_local_path.
# For ngram_order == 1, kenlm_model is "" and the path is just the directory;
# it is still truthy, so evaluateModel takes the LM branch and uses only the
# unigram list.
# NOTE(review): the recorded run shows the decoder warning "Unigrams and
# labels don't seem to agree" -- confirm the unigram list matches the
# tokenizer's alphabet.
wer_score_lm, predictions_lm, references_lm = evaluateModel(processor, model, torgo_test_set, f"{lm_local_path}/{kenlm_model}")

print(f"WER ({ngram_order}-gram): {wer_score_lm}")

Found entries of length > 1 in alphabet. This is unusual unless style is BPE, but the alphabet was not recognized as BPE type. Is this correct?
Unigrams and labels don't seem to agree.
100%|██████████| 36/36 [48:53<00:00, 81.50s/it]


WER (3-gram): 0.7044025157232704


In [14]:
# Inspect the unigram list used by the LM decoder: which characters occur in
# it, and what a few entries look like.
# The file holds non-ASCII words, so the encoding is pinned instead of being
# left to the platform default.
with open(f"{lm_local_path}/unigrams.txt", "r", encoding="utf-8") as f:
  unigrams = {line.strip() for line in f}

unigram_chars = set("".join(unigrams))
print(len(unigram_chars))
print(unigram_chars)

# Show a small sorted sample only; printing the whole set floods the output.
print(f"{len(unigrams)} unigrams, first 20 (sorted): {sorted(unigrams)[:20]}")

213
{'ś', 'ı', 'ν', 'ї', 'è', 'µ', 'у', 'ñ', 's', '=', 'ą', '¹', 'g', 'à', '*', '&', 'с', 'λ', 'ь', 'ā', '8', 'η', '―', ')', '´', 'ü', '>', 'å', 'î', 'u', 'ī', '(', 'n', 'ū', 'χ', 'ē', 'ŵ', 'ź', 'þ', 'φ', 'e', 'í', '\xa0', 'υ', 'ç', 'ě', '«', '1', 'ķ', 'о', 'μ', 'k', '/', 'й', '÷', '5', 'j', 'ô', '0', 'ά', 'ņ', 'ń', 'к', '@', 'd', 'я', 'ù', 'ч', 'i', 'ώ', '|', '³', 'ι', '9', 'т', '°', 'o', 'ð', '½', 'α', 'q', 'р', '̇', '·', 'é', 'r', 'c', 'ž', 'æ', 'і', 'π', '²', '\u200b', 'ή', 'ø', 'б', 'b', 'y', 'ό', 'ò', '+', 'ǐ', '\\', 'ș', 'ň', 'á', '€', 't', 'ĉ', 'ï', 'ю', 'з', 'ß', '¾', 'f', 'º', 'ý', 'л', 'ģ', 'н', 'ʼ', 'ć', 'ų', 'm', 'и', 'đ', 'κ', 'ǎ', 'ы', 'p', 'δ', "'", 'д', 'ш', 'ö', 'а', '•', 'ê', 'ó', 'м', 'ť', 'ж', '2', 'l', 'ő', 'ǔ', 'h', 'ä', 'ė', '£', '7', 'ú', 'ů', 'г', '4', 'τ', 'ë', 'ż', '»', 'β', '¼', 'ă', 'a', 'ř', 'γ', 'ţ', 'č', 'û', 'п', 'v', 'ο', 'â', '©', '¡', 'ę', '3', 'ъ', 'σ', 'œ', '−', '6', 'ί', 'є', 'ō', 'ł', 'е', 'ε', '<', 'š', 'ã', '_', 'έ', 'õ', '§', 'ş', '\xad', 'ω'

In [15]:
# Save results to a csv file: one row per test example with both predictions
# and the reference transcript.
results_path = f"results_{ngram_order}gram_{test_speaker}.txt"

# newline="" is what the csv module expects for files handed to csv.writer
# (it avoids doubled line endings on some platforms); the encoding is pinned
# so the file matches what pd.read_csv reads back by default.
with open(results_path, "w", newline="", encoding="utf-8") as csv_file:
  csv_writer = csv.writer(csv_file)
  csv_writer.writerow(["Prediction (no LM)", f"Prediction ({ngram_order}-gram)", "Reference"])
  csv_writer.writerows(zip(predictions_no_lm, predictions_lm, references_lm))

# Display as dataframe. keep_default_na=False keeps the text columns as
# written: an empty prediction, or one that happens to read "nan"/"null",
# is shown as that string instead of being turned into NaN.
results_df = pd.read_csv(results_path, keep_default_na=False)
results_df.head(20)

Unnamed: 0,Prediction (no LM),Prediction (3-gram),Reference
0,we lowly cakes alt walth l open ir each day,we slowly cakes all wealth open air each day,he slowly takes a short walk in the open air e...
1,my selald bupins,my sealed bains,usually minus several buttons
2,i'm whisking all all about my yong boly,i'm whisking all all about my young boy,you wished to know all about my grandfather
3,he alway ounces hoil,he always ounces oil,but he always answers banana oil
4,bt yo'll oww ampty,but you'll owe empty all,but he always answers banana oil
5,the quick house fa junsle wer e laly da,the quick house unclear lay a,the quick brown fox jumps over the lazy dog
6,the quick blown folt gun wvlasing ol,the quick blown fat gun lading all,the quick brown fox jumps over the lazy dog
7,you have u that e so wall folder all years,you have that we wall holder all year,she had your dark suit in greasy wash water al...
8,when will algools wre slowly and swokus,when will all goals were slowly and sous,we have often urged him to walk more and smoke...
9,all for wy mare cleaim en kain,all for we made claim in in,a long flowing beard clings to his chin


In [16]:
# Save wer to a csv file: one row per decoding setup.
wer_path = f"wer_{ngram_order}gram_{test_speaker}.txt"

# newline="" is what the csv module expects for files handed to csv.writer.
with open(wer_path, "w", newline="", encoding="utf-8") as csv_file:
  csv_writer = csv.writer(csv_file)
  csv_writer.writerow(["Language Model", "WER"])
  csv_writer.writerow(["None", wer_score_no_lm])
  csv_writer.writerow([f"{ngram_order}-gram", wer_score_lm])

# Display as dataframe. With pandas' default NA handling the literal label
# "None" is read back as a missing value, which left the baseline row without
# a name; keep_default_na=False keeps it as the string "None". The WER column
# is still parsed as a number.
results_wer_df = pd.read_csv(wer_path, keep_default_na=False)
results_wer_df.head(20)

Unnamed: 0,Language Model,WER
0,,0.751572
1,3-gram,0.704403


In [None]:
# Bundle the two result files into one date-stamped archive and pass it to
# google.colab's files.download.
current_date = datetime.now().strftime("%Y-%m-%d")
output_zip_path = f"results_with_LM_{test_speaker}_{current_date}.zip"

result_files = (
  f"results_{ngram_order}gram_{test_speaker}.txt",
  f"wer_{ngram_order}gram_{test_speaker}.txt",
)

with zipfile.ZipFile(output_zip_path, "w") as zip_file:
  for result_file in result_files:
    zip_file.write(result_file)

files.download(output_zip_path)