### Login to Hugging Face

In [1]:
# !pip install huggingface-hub

!git config --global user.email "jindaz.work@outlook.com"
!git config --global user.name "jindaxzillusion"

## Process and Extract Data

Data is mostly already pre-processed. Here I will load in the dataset from hugging face to get the english queries. Then I will create a file and write to it line by line the sentences from the dataset in their phonetic representations so each line will have the phonemes from the query. For example "Today is June 18th and it is Muiriel's birthday!" will be
"Phonemes: ['T', 'AH', 'D', 'EY', 'IH', 'Z', 'JH', 'UW', 'N', 'AH', 'N', 'D', 'IH', 'T', 'IH', 'Z', "'EH", 'S', 'B', 'ER', 'TH', 'D', 'EY', '!']


In [2]:
# !pip install num2words
from datasets import load_dataset
#from g2p_en import G2p
from g2p import make_g2p
from num2words import num2words  # Import num2words from nltk
import re

# Load the Tatoeba dataset
tatoeba_dataset = load_dataset("tatoeba", 'en-mr')

# Initialize g2p converter
transducer = make_g2p('eng', 'eng-arpabet')



Downloading data:   0%|          | 0.00/2.57M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/53462 [00:00<?, ? examples/s]

In [None]:
# Function to convert numbers to words
def convert_numbers_to_words(text):
    words = []
    for word in text.split():
        if word.isdigit():
            # Convert numbers to words
            word = num2words(word)
        elif re.match(r'\d+(st|nd|rd|th)', word):
            # Handle ordinal numbers
            number_part = re.match(r'\d+', word).group()
            ordinal_suffix = re.search(r'(st|nd|rd|th)', word).group()
            word = num2words(number_part, ordinal=True) + ordinal_suffix
        words.append(word)
    return ' '.join(words)

def remove_punctuation_and_special_characters(text):
    # Remove punctuation and special characters
    return re.sub(r'[^\w\s,-]', '', text)

translations = tatoeba_dataset["train"]["translation"]
for translation in translations[:3]:
  sentence = translation["en"]
  sentence = remove_punctuation_and_special_characters(sentence)
  sentence = convert_numbers_to_words(sentence)
  phonemes_list = [transducer(word).output_string for word in re.findall(r'\S+', sentence)]
  all_phonemes = [item for sublist in phonemes_list for item in sublist.split()]
  # Replace hyphens with letters that come after the hyphen
  all_phonemes_no_hyphen = [phoneme.split("-")[-1] if "-" in phoneme else phoneme for phoneme in all_phonemes]
  print(all_phonemes_no_hyphen)

unique_phonemes = set()

with open("dataset_phonemes.txt", "w") as file:
    for translation in translations:
      sentence = translation["en"]
      sentence = remove_punctuation_and_special_characters(sentence)
      sentence = convert_numbers_to_words(sentence)
      phonemes_list = [transducer(word).output_string for word in re.findall(r'\S+', sentence)]
      all_phonemes = [item for sublist in phonemes_list for item in sublist.split()]

      # Update the set of unique phonemes excluding those containing hyphens
      unique_phonemes.update(phoneme for phoneme in all_phonemes if "-" not in phoneme)

      # Replace hyphens with letters that come after the hyphen
      all_phonemes_no_hyphen = [phoneme.split("-")[-1] if "-" in phoneme else phoneme for phoneme in all_phonemes]

      # Write all individual phonemes (without hyphens) to the file
      file.write(" ".join(all_phonemes_no_hyphen))
      file.write("\n")

# Print the vocabulary of phonemes
print("Phoneme Vocabulary:")
print(unique_phonemes)

['T', 'AH', 'D', 'EY', 'IH', 'Z', 'JH', 'UW', 'N', 'AH', 'N', 'D', 'IH', 'T', 'IH', 'Z', 'B', 'ER', 'TH', 'D', 'EY']
['IH', 'Z', 'T', 'W', 'EH', 'N', 'T', 'IY', 'N', 'AW']
['DH', 'AH', 'P', 'AE', 'S', 'W', 'ER', 'D', 'IH', 'Z']
Phoneme Vocabulary:
{'W', 'OW', 'AW', 'UH', 'R', 'Y', 'G', 'S', 'IY', 'NG', 'AA', 'V', ',', ',,', 'DH', 'TH', 'JH', 'D', 'K', 'M', 'F', 'HH', ',,,', 'L', 'Z', 'AO', 'N', 'SH', 'CH', 'T', 'AE', 'ER', 'IH', 'AH', 'AY', 'B', 'OY', 'EY', 'UW', 'P', 'ZH', 'EH'}


In [None]:
# Read the content from the file
with open("/content/dataset_phonemes.txt", 'r') as file:
    content = file.read()

# Remove instances of consecutive triple commas (,,,)
content = content.replace(',,,', '')

# Remove instances of consecutive double commas (,,)
content = content.replace(',,', '')

# Remove individual commas
content = content.replace(',', '')
# Write the modified content back to the file
with open('dataset_phonemes.txt', 'w') as file:
    file.write(content)

## Build ngrams

With the dataset.txt file created I will use that to now train unigram and trigram models utilizing kenML.


In [None]:
import os
from huggingface_hub import Repository


input_dataset = "/content/dataset_phonemes.txt"
output_directory = "/content/output_directory/"
output_model_base = "output_model.klm"

# Create the output directory if it doesn't exist
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Ngram size options
order_trigram = 3%
order_unigram = 1

trigram_repo = Repository(local_dir="output_model.klm_trigram", clone_from="laurynflu/output_model.klm_trigram")

# Train trigram model
!/content/kenlm/build/bin/lmplz -o {order_trigram} --text /content/dataset_phonemes.txt --arpa /content/output_directory/output_model.klm_trigram.arpa --discount_fallback --skip_symbols| \
    /content/kenlm/build/bin/build_binary -T /dev/stdin /content/output_directory/output_model.klm_trigram.arpa
print("Training complete - trigram")

# # Train unigram model - will need to look into the issue with kenML not performing for unigram models
# !/content/kenlm/build/bin/lmplz -o {order_unigram} --text /content/dataset_phonemes.txt --arpa /content/output_directory/output_model.klm_unigram.arpa --interpolate_unigrams 1 --discount_fallback | \
#     /content/kenlm/build/bin/build_binary -T /dev/stdin /content/output_directory/output_model.klm_unigram.arpa
# print("Training complete - unigram")


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/content/output_directory/output_model.klm_trigram is already a clone of https://huggingface.co/laurynflu/output_model.klm_trigram. Make sure you pull the latest changes with `repo.git_pull()`.


=== 1/5 Counting and sorting n-grams ===
Reading /content/dataset_phonemes.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
End of file
ERROR
****************************************************************************************************
Unigram tokens 908424 types 42
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:504 2:3788039936 3:7102575104
Substituting fallback discounts for order 0: D1=0.5 D2=1 D3+=1.5
Statistics:
1 42 D1=0.5 D2=1 D3+=1.5
2 1280 D1=0.41791 D2=1.00344 D3+=1.13962
3 16435 D1=0.435338 D2=1.14613 D3+=1.62571
Memory estimate for binary LM:
type     kB
probing 319 assuming -p 1.5
probing 327 assuming -r models -p 1.5
trie     88 without quantization
trie     37 assuming -q 8 -b 8 quantization 
trie     87 assuming -a 22 array pointer compression
trie     37 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain siz

In [None]:
!/content/kenlm/build/bin/build_binary /content/output_directory/output_model.klm_trigram.arpa /content/output_directory/output_model.klm_trigram.bin

Reading /content/output_directory/output_model.klm_trigram.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
/content/kenlm/lm/read_arpa.hh:51 in void lm::Read1Gram(util::FilePiece&, Voc&, Weights*, lm::PositiveProbWarn&) [with Voc = lm::ngram::ProbingVocabulary; Weights = lm::ProbBackoff] threw FormatLoadException because `f.get() != '\t''.
Expected tab after probability in the 1-gram at byte 64 Byte: 64
ERROR


In [None]:
def get_vocabulary_from_arpa(arpa_path):
    vocabulary = set()

    with open(arpa_path, 'r', encoding='utf-8') as arpa_file:
        in_data_section = False

        for line in arpa_file:
            line = line.strip()

            if line == '\\data\\':
                in_data_section = True
            elif line.startswith('\\') and in_data_section:
                break  # End of data section
            elif not line.startswith('\\') and in_data_section:
                # Inside the n-gram section, extract vocabulary
                parts = line.split()
                if len(parts) > 1:
                    word = parts[1]
                    vocabulary.add(word)

    return vocabulary

# Replace with the actual path to your KenLM ARPA file
arpa_path = "/content/output_directory/output_model.klm_trigram.arpa"
vocabulary = get_vocabulary_from_arpa(arpa_path)

# Print or use the vocabulary as needed
print("Vocabulary:", vocabulary)


Vocabulary: {'2=1280', '3=16435', '1=42'}


In [None]:
def add_tokens_to_arpa(arpa_path, tokens_to_add, tokens_to_exclude):
    with open(arpa_path, 'r', encoding='utf-8') as arpa_file:
        arpa_content = arpa_file.readlines()

    # Find the index where the unigram section starts
    start_index = arpa_content.index('\\1-grams:\n') + 1

    # Insert entries for the new tokens excluding the ones to exclude
    for token in tokens_to_add:
        if token not in tokens_to_exclude:
            arpa_content.insert(start_index, f'-99.999 {token} -99.999\n')

    # Write the modified content back to the ARPA file
    with open(arpa_path, 'w', encoding='utf-8') as arpa_file:
        arpa_file.writelines(arpa_content)

# Replace with the actual path to your KenLM ARPA file
arpa_path = "/content/output_directory/output_model.klm_trigram.arpa"

# Tokens to add
new_tokens = ["<pad>", "<sil>", "<spn>"]

# Tokens to exclude
tokens_to_exclude = ["<s>", "</s>"]

# Add tokens to the ARPA file excluding the specified tokens
add_tokens_to_arpa(arpa_path, new_tokens, tokens_to_exclude)



In [None]:
# Replace with the actual path to your KenLM ARPA file
arpa_path = "/content/output_directory/output_model.klm_trigram.arpa"
vocabulary = get_vocabulary_from_arpa(arpa_path)

# Print or use the vocabulary as needed
print("Vocabulary:", vocabulary)

Vocabulary: {'2=1280', '3=16435', '1=42'}


## Save and push ngram to hugging face

In [None]:
# Debugging: Print the current working directory
print("Current working directory:", os.getcwd())
output_directory = "/content/output_directory/"

# Change the working directory
os.chdir(output_directory)

# Verify the change
new_dir = os.getcwd()
print("New working directory:", new_dir)


# Debugging: Print the contents of the local directory
print("Contents of the local directory:", os.listdir("."))

# Push to Hugging Face
trigram_repo.push_to_hub(commit_message="Create trigram model with kenLM")

Current working directory: /content
New working directory: /content/output_directory
Contents of the local directory: ['output_model.klm_trigram.bin', 'output_model.klm_trigram.arpa', '.ipynb_checkpoints']


In [None]:
content_directory = "/content/"
# Change the working directory
os.chdir(content_directory)
# Verify the change
new_dir = os.getcwd()
print("New working directory:", new_dir)

New working directory: /content


# Language Model Implementation with ASR model

In [None]:
model_user = "Aanchan"
model_repo = "psst_model_cer_2"
model_repo_path = f"{model_user}/{model_repo}"

# kenlm language model
kenlm_model_trigram_user = "laurynflu"
kenlm_model_trigram_repo = "output_model.klm_trigram"
kenlm_model_trigram_repo_path= f"{kenlm_model_trigram_user}/{kenlm_model_trigram_repo}"
kenlm_model_trigram_file = "output_model.klm_trigram_debug"+".arpa"

## Install Packages

In [None]:
%%capture
!pip install jiwer
!pip install evaluate # Sometimes, importing from `evaluate` fails if this package is installed in a different order relative to the other packages.
!apt install git-lfs
!pip install pyctcdecode
!pip install torch

In [None]:
import torch
import zipfile

from datasets import DatasetDict, Dataset, Audio
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
from pyctcdecode import build_ctcdecoder
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from tqdm import tqdm
from evaluate import load

## Download the Models

In [None]:
# Download the trained model

repo = Repository(local_dir="model_staging", clone_from=model_repo_path)
repo.git_pull()

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Aanchan/psst_model_cer_2 into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.40k/360M [00:00<?, ?B/s]

Download file training_args.bin: 100%|##########| 3.43k/3.43k [00:00<?, ?B/s]

Clean file training_args.bin:  29%|##9       | 1.00k/3.43k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/360M [00:00<?, ?B/s]

In [None]:
# Download the language models from huggingface

lm_local_trigram = "kenlm_model_trigram"
lm_repo_trigram = Repository(local_dir=lm_local_trigram, clone_from=kenlm_model_trigram_repo_path)
lm_repo_trigram.git_pull()
trigram_model_path = f"/content/{lm_local_trigram}/{kenlm_model_trigram_file}"

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/laurynflu/output_model.klm_trigram into local empty directory.


In [None]:
# Functions to process data:


chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'
sampling_rate=16000

def remove_special_characters(batch):
  batch["transcript"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]).lower() + " "
  batch["transcript"] = re.sub(chars_to_ignore_regex, '', batch["transcript"]).lower()
  return batch

def prepare_dataset(batch):
  audio = batch["filename"]
  batch["input_values"] = processor(audio["array"], sampling_rate=sampling_rate).input_values[0]
  batch["input_length"] = len(batch["input_values"])

  with processor.as_target_processor():
    batch["labels"] = processor(batch["transcript"]).input_ids

  return batch

In [None]:
def evaluateModel(processor, model, dataset, lm_model_path=None):

  predictions = []
  references = []

  if not lm_model_path:
    for i in tqdm(range(dataset.num_rows)):
      inputs = processor(dataset[i]["input_values"], sampling_rate=sampling_rate, return_tensors="pt")
      with torch.no_grad():
        logits = model(**inputs).logits
      predicted_ids = torch.argmax(logits, dim=-1)
      transcription = processor.batch_decode(predicted_ids)

      predictions.append(transcription[0].lower())
      references.append(dataset[i]["transcript"])

  else:
    vocab_dict = processor.tokenizer.get_vocab()
    sorted_vocab_dict = {k: v for k, v in sorted(
        vocab_dict.items(), key=lambda item: item[1])}

    # Implement language model in the decoder
    decoder = build_ctcdecoder(
        labels=list(sorted_vocab_dict.keys()),
        kenlm_model_path=lm_model_path,
    )

    # Build new processor with new decoder
    processor = Wav2Vec2ProcessorWithLM(
        feature_extractor=processor.feature_extractor,
        tokenizer=processor.tokenizer,
        decoder=decoder
    )

    # Transcripe the audio
    for i in tqdm(range(dataset.num_rows)):
      inputs = processor(dataset[i]["input_values"], sampling_rate=sampling_rate, return_tensors="pt")
      with torch.no_grad():
        logits = model(**inputs).logits

      transcription = processor.batch_decode(logits.numpy()).text

      predictions.append(transcription[0].lower())
      references.append(dataset[i]["transcript"])

  # Calculate the per score
  per = load("cer")
  per_score = per.compute(predictions=predictions, references=references)

  return per_score, predictions, references

In [None]:
from google.colab import drive
import pandas as pd
from datasets import load_dataset

# Mount Google Drive
drive.mount('/content/drive')
folder_path = '/content/drive/MyDrive/train'
csv_file_path = folder_path + '/utterances.csv'
data = load_dataset('csv', data_files=csv_file_path)

# Cast the 'filename' column to Audio type
sampling_rate = 16000  # Replace with your desired sampling rate
data = data.cast_column('filename', Audio(sampling_rate=sampling_rate))
asr_data = data['train']

'''
  ******************** For debugging ********************
'''
asr_data = asr_data.select(range(10))
'''
  ******************** For debugging ********************
'''

processor = Wav2Vec2Processor.from_pretrained("model_staging")
model = Wav2Vec2ForCTC.from_pretrained(model_repo_path)

asr_data = asr_data.map(remove_special_characters)
asr_data = asr_data.map(prepare_dataset, num_proc=4)
asr_data = asr_data.filter(lambda x: x < 25 * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

Mounted at /content/drive


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of the model checkpoint at Aanchan/psst_model_cer_2 were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Aanchan/psst_model_cer_2 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a d

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10 [00:00<?, ? examples/s]



Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# No language model
per_score_no_lm, predictions_no_lm, references_no_lm = evaluateModel(processor, model, asr_data)

print(f"PER (no LM): {per_score_no_lm}")

100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


PER (no LM): 0.5


In [None]:
#Trigram language mdoel
per_score_trigram, predictions_trigram, references_trigram = evaluateModel(processor, model, asr_data, trigram_model_path)

print(f"PER (trigram): {per_score_trigram}")

INFO - Using arpa instead of binary LM file, decoder instantiation might be slow.
INFO - Alphabet determined to be of regular style.
INFO - Found <pad> in vocabulary, interpreted as a CTC blank token, substituting with .
INFO - Found <unk> in vocabulary, interpreting as unknown token, substituting with ⁇.
  0%|          | 0/10 [00:00<?, ?it/s]


ValueError: ignored