<a href="https://colab.research.google.com/github/Rumeysakeskin/Automatic-Speech-Recognition-in-Turkish/blob/main/tokenizer_for_sub_word_encoding_CTC_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# If you're using Google Colab and not running locally, run this cell.
## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
!apt-get update && apt-get install -y libsndfile1 ffmpeg
!pip install Cython tensorflow==2.11.0 Pygments==2.6.1 pynini==2.1.5 nemo_toolkit[all]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9674 sha256=33965f445c9746cb063afc029388719e9fb4b048228ea91d987c115271c15a37
  Stored in directory: /root/.cache/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.28-4ubuntu0.18.04.2).
ffmpeg is already the newest version (7:3.4.11-0ubuntu0.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to rem

In [2]:
import os
if not os.path.exists("scripts/process_asr_text_tokenizer.py"):
  !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py

--2023-01-10 10:49:47--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13860 (14K) [text/plain]
Saving to: ‘scripts/process_asr_text_tokenizer.py’


2023-01-10 10:49:48 (95.9 MB/s) - ‘scripts/process_asr_text_tokenizer.py’ saved [13860/13860]



In [3]:
# Language code of the corpus; used as the sub-directory name for tokenizer output.
LANGUAGE = "tr"

# Root directory handed to the tokenizer script as --data_root.
tokenizer_dir = os.path.join(
    "tokenizers",
    LANGUAGE,
)

In [4]:
# Manifest Utils
from tqdm.auto import tqdm
import json

def read_manifest(path):
    """Load a JSON-lines manifest file into a list.

    Args:
        path: Path to a text file holding one JSON document per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    manifest = []
    # Decode explicitly as UTF-8 so non-ASCII transcript characters do not depend on
    # the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading manifest data"):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record; json.loads
                # would raise on them.
                continue
            manifest.append(json.loads(line))
    return manifest

from collections import defaultdict

def get_charset(manifest_data):
    """Count how often each character occurs in the manifest transcripts.

    Args:
        manifest_data: Iterable of manifest rows; each row's 'text' entry is scanned
            character by character.

    Returns:
        A defaultdict(int) mapping each character to its number of occurrences.
    """
    char_counts = defaultdict(int)
    for entry in tqdm(manifest_data, desc="Computing character set"):
        for ch in entry['text']:
            char_counts[ch] += 1
    return char_counts

In [5]:
# Training manifest (JSON lines) that the tokenizer is built from.
train_manifest = "scripts/manifest_100522.jsonl"

# Parse the manifest, then tally the characters in its transcripts.
train_manifest_data = read_manifest(train_manifest)
train_charset = get_charset(train_manifest_data)

# Distinct characters observed in the training text.
train_set = set(train_charset)

Reading manifest data: 0it [00:00, ?it/s]

Computing character set:   0%|          | 0/32531 [00:00<?, ?it/s]

In [6]:
# VOCAB_SIZE can be changed to any value of at least (len(train_set) + 2).
# The default below is that minimum: the number of distinct characters seen in the
# training manifest plus two.
# NOTE(review): the "+ 2" presumably leaves room for tokenizer special tokens - confirm.
VOCAB_SIZE = len(train_set) + 2
VOCAB_SIZE

36

In [7]:

#@title Tokenizer Config { display-mode: "form" }
# Model type forwarded to the tokenizer script as --spe_type; also part of the
# output directory name (tokenizer_spe_<type>_v<vocab size>).
TOKENIZER_TYPE = "unigram" #@param ["bpe", "unigram"]

In [8]:
# Build the SentencePiece ("spe") tokenizer from the training manifest using the
# downloaded NeMo script. Values come from earlier cells:
#   --manifest    path of the training manifest (train_manifest)
#   --vocab_size  target vocabulary size (VOCAB_SIZE)
#   --data_root   output root; results land in <data_root>/tokenizer_spe_<type>_v<size>
#   --spe_type    SentencePiece model type (TOKENIZER_TYPE)
#   --spe_character_coverage=1.0 is passed through as SentencePiece character_coverage
# NOTE(review): --no_lower_case presumably keeps the transcript casing unchanged and
# --log presumably enables the script's logging - confirm against the script's --help.
!python scripts/process_asr_text_tokenizer.py \
  --manifest=$train_manifest \
  --vocab_size=$VOCAB_SIZE \
  --data_root=$tokenizer_dir \
  --tokenizer="spe" \
  --spe_type=$TOKENIZER_TYPE \
  --spe_character_coverage=1.0 \
  --no_lower_case \
  --log

[NeMo W 2023-01-10 10:54:34 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
INFO:root:Finished extracting manifest : scripts/manifest_100522.jsonl
INFO:root:Finished extracting all manifests ! Number of sentences : 32531
[NeMo I 2023-01-10 10:54:35 sentencepiece_tokenizer:315] Processing tokenizers/tr/text_corpus/document.txt and store at tokenizers/tr/tokenizer_spe_unigram_v36
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=tokenizers/tr/text_corpus/document.txt --model_prefix=tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer --vocab_size=36 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=unigram --character_coverage=1.0 --bos_id=-1 --eos_id=-1
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: tokenizers/tr/text_corpus/document.txt
  input_format: 
  model_prefix: tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer
  model_type: UNIGRAM
  vocab_size: 36
  self_test_sample_

In [9]:
# Directory the tokenizer script wrote to: <data_root>/tokenizer_spe_<type>_v<vocab size>/
tokenizer_subdir = f"tokenizer_spe_{TOKENIZER_TYPE}_v{VOCAB_SIZE}"
TOKENIZER_DIR = f"{tokenizer_dir}/{tokenizer_subdir}/"
print("Tokenizer directory :", TOKENIZER_DIR)

Tokenizer directory : tokenizers/tr/tokenizer_spe_unigram_v36/


In [10]:
# Number of tokens in the trained tokenizer: every line of tokenizer.vocab is counted
# as one token.
# Read explicitly as UTF-8 so non-ASCII vocabulary entries do not depend on the
# platform's default locale encoding.
with open(os.path.join(TOKENIZER_DIR, 'tokenizer.vocab'), encoding='utf-8') as f:
  tokens = f.readlines()

num_tokens = len(tokens)
print("Number of tokens : ", num_tokens)

Number of tokens :  36


In [11]:
# Warn when the trained tokenizer ended up with fewer entries than were requested.
vocab_shortfall = num_tokens < VOCAB_SIZE
if vocab_shortfall:
    shortfall_message = (
        f"The text in this dataset is too small to construct a tokenizer "
        f"with vocab size = {VOCAB_SIZE}. Current number of tokens = {num_tokens}. "
        f"Please reconstruct the tokenizer with fewer tokens"
    )
    print(shortfall_message)

In [12]:
!zip -r tokenizers.zip /content/tokenizers

  adding: content/tokenizers/ (stored 0%)
  adding: content/tokenizers/tr/ (stored 0%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/ (stored 0%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer.model (deflated 42%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer.vocab (deflated 40%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/vocab.txt (deflated 40%)
  adding: content/tokenizers/tr/text_corpus/ (stored 0%)
  adding: content/tokenizers/tr/text_corpus/document.txt (deflated 66%)


In [14]:
# Hand the tokenizers.zip archive to the user through Colab's file-download helper.
# NOTE(review): google.colab is expected to be importable only inside a Colab runtime;
# when running locally, skip this cell and pick up tokenizers.zip from the working directory.
from google.colab import files
files.download('tokenizers.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>