<a href="https://colab.research.google.com/github/Rumeysakeskin/Automatic-Speech-Recognition-in-Turkish/blob/main/training%26tokenizer_for_sub_word_encoding_CTC_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# If you're using Google Colab and not running locally, run this cell.
## Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]
!apt-get update && apt-get install -y libsndfile1 ffmpeg
!pip install Cython tensorflow==2.11.0 Pygments==2.6.1 pynini==2.1.5 nemo_toolkit[all]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9674 sha256=946e5bcf69414b56563eb3dc6670266ea7a1cf070b546515a2e5587de730ba5c
  Stored in directory: /root/.cache/pip/wheels/bd/a8/c3/3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.28-4ubuntu0.18.04.2).
ffmpeg is already the newest version (7:3.4.11-0ubuntu0.1).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to rem

In [2]:
import os
# Fetch NeMo's tokenizer-building script into scripts/ unless it is already on disk,
# so re-running the cell does not download it again.
if not os.path.exists("scripts/process_asr_text_tokenizer.py"):
  # $BRANCH is substituted by IPython with the notebook variable of that name.
  !wget -P scripts/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/scripts/tokenizers/process_asr_text_tokenizer.py

--2023-01-11 05:51:56--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/scripts/tokenizers/process_asr_text_tokenizer.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13860 (14K) [text/plain]
Saving to: ‘scripts/process_asr_text_tokenizer.py’


2023-01-11 05:51:56 (41.5 MB/s) - ‘scripts/process_asr_text_tokenizer.py’ saved [13860/13860]



In [3]:
# Language code of the corpus; tokenizer artifacts are grouped under tokenizers/<LANGUAGE>.
LANGUAGE = "tr"
tokenizer_dir = os.path.join("tokenizers", LANGUAGE)

In [4]:
# Manifest Utils
from tqdm.auto import tqdm
import json

def read_manifest(path):
    """Load a JSON-lines manifest into a list of decoded objects.

    Args:
        path: Path to a manifest file holding one JSON object per line.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    manifest = []
    # Decode explicitly as UTF-8 so non-ASCII transcripts are read the same way
    # regardless of the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading manifest data"):
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at the end of the file)
                # instead of failing inside json.loads.
                continue
            manifest.append(json.loads(line))
    return manifest

from collections import defaultdict

def get_charset(manifest_data):
    """Count how often each character occurs in the 'text' field of every row.

    Args:
        manifest_data: Iterable of mappings, each with a 'text' entry.

    Returns:
        defaultdict: Maps each character to its number of occurrences.
    """
    char_counts = defaultdict(int)
    for entry in tqdm(manifest_data, desc="Computing character set"):
        for ch in entry['text']:
            char_counts[ch] += 1
    return char_counts

In [6]:
# Load the training manifest and collect the distinct characters used in its transcripts.
train_manifest = "scripts/manifest_100522.jsonl"
train_manifest_data = read_manifest(train_manifest)
train_charset = get_charset(train_manifest_data)
train_set = set(train_charset)

Reading manifest data: 0it [00:00, ?it/s]

Computing character set:   0%|          | 0/32531 [00:00<?, ?it/s]

In [7]:
# << VOCAB_SIZE can be changed to any value larger than (len(train_set) + 2)! >>
# NOTE(review): the +2 presumably leaves room for the pieces SentencePiece adds on top
# of the raw characters (the printed vocabulary contains '<unk>' and '▁') — TODO confirm.
VOCAB_SIZE = len(train_set) + 2
VOCAB_SIZE

36

In [8]:

#@title Tokenizer Config { display-mode: "form" }
# SentencePiece model type; forwarded to the tokenizer script through --spe_type.
TOKENIZER_TYPE = "unigram" #@param ["bpe", "unigram"]

In [9]:
# Build a SentencePiece tokenizer from the transcripts in the training manifest.
# The $-prefixed names are substituted by IPython from the notebook's variables;
# the output is stored under $tokenizer_dir.
!python scripts/process_asr_text_tokenizer.py \
  --manifest=$train_manifest \
  --vocab_size=$VOCAB_SIZE \
  --data_root=$tokenizer_dir \
  --tokenizer="spe" \
  --spe_type=$TOKENIZER_TYPE \
  --spe_character_coverage=1.0 \
  --no_lower_case \
  --log

[NeMo W 2023-01-11 05:56:33 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
INFO:root:Finished extracting manifest : scripts/manifest_100522.jsonl
INFO:root:Finished extracting all manifests ! Number of sentences : 32531
[NeMo I 2023-01-11 05:56:34 sentencepiece_tokenizer:315] Processing tokenizers/tr/text_corpus/document.txt and store at tokenizers/tr/tokenizer_spe_unigram_v36
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=tokenizers/tr/text_corpus/document.txt --model_prefix=tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer --vocab_size=36 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=unigram --character_coverage=1.0 --bos_id=-1 --eos_id=-1
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: tokenizers/tr/text_corpus/document.txt
  input_format: 
  model_prefix: tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer
  model_type: UNIGRAM
  vocab_size: 36
  self_test_sample_

In [10]:
# Directory holding the trained tokenizer for the chosen type and vocabulary size;
# it matches the "store at" path reported in the tokenizer script's log.
TOKENIZER_DIR = f"{tokenizer_dir}/tokenizer_spe_{TOKENIZER_TYPE}_v{VOCAB_SIZE}/"
print("Tokenizer directory :", TOKENIZER_DIR)

Tokenizer directory : tokenizers/tr/tokenizer_spe_unigram_v36/


In [11]:
# Number of tokens in the tokenizer: count the lines of its vocabulary file.
# The file is opened as UTF-8 explicitly because the vocabulary contains non-ASCII
# pieces (e.g. 'ı', 'ş', '▁'); relying on the platform default encoding can raise a
# decode error on systems whose default is not UTF-8.
with open(os.path.join(TOKENIZER_DIR, 'tokenizer.vocab'), encoding='utf-8') as f:
  tokens = f.readlines()

num_tokens = len(tokens)
print("Number of tokens : ", num_tokens)

Number of tokens :  36


In [12]:
# Warn when the trained tokenizer ended up with fewer tokens than were requested.
if num_tokens < VOCAB_SIZE:
    too_small_msg = (
        "The text in this dataset is too small to construct a tokenizer "
        f"with vocab size = {VOCAB_SIZE}. Current number of tokens = {num_tokens}. "
        "Please reconstruct the tokenizer with fewer tokens"
    )
    print(too_small_msg)

In [None]:
# Pack the tokenizer output into tokenizers.zip in the current directory.
# NOTE(review): the absolute /content path assumes the notebook runs with /content as
# its working directory (as on Colab), whereas the tokenizer directory itself is
# created relative to the working directory — confirm before running elsewhere.
!zip -r tokenizers.zip /content/tokenizers

  adding: content/tokenizers/ (stored 0%)
  adding: content/tokenizers/tr/ (stored 0%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/ (stored 0%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer.model (deflated 42%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/tokenizer.vocab (deflated 40%)
  adding: content/tokenizers/tr/tokenizer_spe_unigram_v36/vocab.txt (deflated 40%)
  adding: content/tokenizers/tr/text_corpus/ (stored 0%)
  adding: content/tokenizers/tr/text_corpus/document.txt (deflated 66%)


In [None]:
# Colab-only: hand tokenizers.zip to the browser as a download.
from google.colab import files
files.download('tokenizers.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training

In [13]:
import os
# Fetch NeMo's example Citrinet BPE config into configs/ unless it is already on disk.
if not os.path.exists("configs/config_bpe.yaml"):
  # $BRANCH is substituted by IPython with the notebook variable of that name.
  !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/citrinet/config_bpe.yaml

--2023-01-11 05:56:47--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/citrinet/config_bpe.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4375 (4.3K) [text/plain]
Saving to: ‘configs/config_bpe.yaml’


2023-01-11 05:56:47 (47.3 MB/s) - ‘configs/config_bpe.yaml’ saved [4375/4375]



In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Get the subwords of a transcript, or tokenize a dataset, using the same tokenizer as the ASR model.

In [46]:
import nemo.collections.asr as nemo_asr
from ruamel.yaml import YAML
import pytorch_lightning as pl
from omegaconf import DictConfig, OmegaConf, open_dict
import copy
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
import os

config_path = "configs/config_bpe.yaml"

# Parse the Citrinet config downloaded into configs/. The file is opened as UTF-8
# explicitly so parsing does not depend on the platform's default encoding.
yaml = YAML(typ='safe')
with open(config_path, encoding='utf-8') as f:
    params = yaml.load(f)

params['model']['train_ds']['manifest_filepath'] = "scripts/manifest_100522.jsonl"
# NOTE(review): validation points at the same manifest as training, so validation
# metrics are computed on training data — confirm this is intended.
params['model']['validation_ds']['manifest_filepath'] = "scripts/manifest_100522.jsonl"

# Point the model at the tokenizer trained earlier in this notebook.
params['model']['tokenizer']['dir'] = TOKENIZER_DIR
params['model']['tokenizer']['type'] = 'bpe'

first_asr_model = nemo_asr.models.EncDecCTCModelBPE(cfg=DictConfig(params['model']))
first_asr_model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type="bpe")

# Round-trip one sample sentence through the model's tokenizer:
# text -> tokens, text -> ids, ids -> tokens, ids -> text.
sample_text = "merhaba nasılsın"

tokenizer = first_asr_model.tokenizer
print("tokenizer:",tokenizer)
tokens = tokenizer.text_to_tokens(sample_text)
print("tokens:",tokens)
token_ids = tokenizer.text_to_ids(sample_text)
print("token_ids:",token_ids)
subwords = tokenizer.ids_to_tokens(token_ids)
print("subwords:",subwords)
text = tokenizer.ids_to_text(token_ids)
print("text:",text)

[NeMo I 2023-01-11 08:19:51 mixins:170] Tokenizer SentencePieceTokenizer initialized with 36 tokens
[NeMo I 2023-01-11 08:19:51 ctc_bpe_models:64] 
    Replacing placeholder number of classes (-1) with actual number of classes - 36
[NeMo I 2023-01-11 08:19:53 collections:193] Dataset loaded with 32531 files totalling 27.57 hours
[NeMo I 2023-01-11 08:19:53 collections:194] 0 files were filtered totalling 0.00 hours
[NeMo I 2023-01-11 08:19:55 collections:193] Dataset loaded with 32531 files totalling 27.57 hours
[NeMo I 2023-01-11 08:19:55 collections:194] 0 files were filtered totalling 0.00 hours
[NeMo I 2023-01-11 08:19:55 features:267] PADDING: 16


[NeMo W 2023-01-11 08:19:55 modelPT:236] You tried to register an artifact under config key=tokenizer.model_path but an artifact for it has already been registered.
[NeMo W 2023-01-11 08:19:55 modelPT:236] You tried to register an artifact under config key=tokenizer.vocab_path but an artifact for it has already been registered.
[NeMo W 2023-01-11 08:19:55 modelPT:236] You tried to register an artifact under config key=tokenizer.spe_tokenizer_vocab but an artifact for it has already been registered.


[NeMo I 2023-01-11 08:19:55 mixins:170] Tokenizer SentencePieceTokenizer initialized with 36 tokens
[NeMo I 2023-01-11 08:19:55 ctc_bpe_models:296] 
    Replacing old number of classes (36) with new number of classes - 36
[NeMo I 2023-01-11 08:19:55 ctc_bpe_models:338] Changed tokenizer to ['<unk>', '▁', 'a', 'e', 'i', 'n', 'l', 'ı', 'k', 'r', 'm', 't', 'u', 'd', 'y', 's', 'b', 'o', 'z', 'ü', 'ş', 'ar', 'g', 'ç', 'h', 'v', 'p', 'c', 'f', 'ö', 'j', 'w', 'q', '̇', 'x', 'ğ'] vocabulary.
tokenizer: <nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer object at 0x7f5d2ee7bf10>
tokens: ['▁', 'm', 'e', 'r', 'h', 'a', 'b', 'a', '▁', 'n', 'a', 's', 'ı', 'l', 's', 'ı', 'n']
token_ids: [1, 10, 3, 9, 24, 2, 16, 2, 1, 5, 2, 15, 7, 6, 15, 7, 5]
subwords: ['▁', 'm', 'e', 'r', 'h', 'a', 'b', 'a', '▁', 'n', 'a', 's', 'ı', 'l', 's', 'ı', 'n']
text: merhaba nasılsın


In [None]:
def training(n_gpus=1, n_nodes=1, epochs=10, accelerator='cpu'):
    """Train a Citrinet CTC model using the sub-word tokenizer in TOKENIZER_DIR.

    Reads the model config from configs/config_bpe.yaml, points it at the
    manifest and tokenizer, logs to Weights & Biases, and writes checkpoints to
    ``<cwd>/TURKISH_FINETUNING_Citrinet_models``.

    Args:
        n_gpus: Value passed to the Trainer's ``devices`` argument.
        n_nodes: Value passed to the Trainer's ``num_nodes`` argument.
        epochs: Maximum number of training epochs.
        accelerator: Lightning accelerator name. Defaults to 'cpu', which is
            what this function always used; pass e.g. 'gpu' to train on GPUs.

    Returns:
        None. The function is called for its side effects (training run,
        logged metrics, checkpoint files).
    """
    config_path = "configs/config_bpe.yaml"
    manifest_path = "scripts/manifest_100522.jsonl"

    yaml = YAML(typ='safe')
    # Explicit encoding keeps config parsing independent of the platform default.
    with open(config_path, encoding='utf-8') as f:
        params = yaml.load(f)

    params['model']['train_ds']['manifest_filepath'] = manifest_path
    # NOTE(review): validation reuses the training manifest, so 'val_loss' is
    # measured on training data — confirm this is intended.
    params['model']['validation_ds']['manifest_filepath'] = manifest_path

    params['model']['train_ds']['batch_size'] = 32 * n_gpus * n_nodes

    params['model']['tokenizer']['dir'] = TOKENIZER_DIR
    params['model']['tokenizer']['type'] = 'bpe'

    first_asr_model = nemo_asr.models.EncDecCTCModelBPE(cfg=DictConfig(params['model']))

    # To continue from a saved checkpoint instead of starting fresh, load it here,
    # e.g. first_asr_model = first_asr_model.load_from_checkpoint("epoch-99.ckpt")

    # Work on a copy so the learning-rate override does not modify `params`.
    new_opt = copy.deepcopy(params['model']['optim'])
    new_opt['lr'] = 0.001

    # Point to the data we'll use as the training set
    first_asr_model.setup_training_data(train_data_config=params['model']['train_ds'])
    # Point to the validation data
    first_asr_model.setup_validation_data(val_data_config=params['model']['validation_ds'])
    # Assign optimizer config
    first_asr_model.setup_optimization(optim_config=DictConfig(new_opt))

    first_asr_model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type="bpe")

    wandb_logger = WandbLogger(name="Citrinet", project="TURKISH_FINETUNING")

    # Used for saving models; save_top_k=-1 keeps every checkpoint.
    save_path = os.path.join(os.getcwd(), "TURKISH_FINETUNING" + "_" + "Citrinet_models")
    checkpoint_callback = ModelCheckpoint(
        dirpath=save_path,
        save_top_k=-1,
        verbose=True,
        monitor='val_loss',
        mode='min',
    )

    # The checkpoint callback must be registered through `callbacks`.
    # `enable_checkpointing` is only a boolean switch: passing the callback object
    # there is merely truthy, so Lightning would fall back to its default
    # checkpointing and ignore the dirpath / save_top_k / monitor configured above.
    trainer = pl.Trainer(
        devices=n_gpus,
        accelerator=accelerator,
        num_nodes=n_nodes,
        max_epochs=epochs,
        logger=wandb_logger,
        log_every_n_steps=1,
        val_check_interval=1.0,
        callbacks=[checkpoint_callback],
        enable_checkpointing=True,
    )

    first_asr_model.set_trainer(trainer)

    trainer.fit(first_asr_model)

# Start the training run when this code is executed as the main module.
if __name__ == '__main__':
    training()