In [None]:
import pandas as pd

# Read the train/validation CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="DATA/train_valid_without_errs.csv")
df.head()

In [None]:
# Pull the "transcripts" column out as a plain Python list and show how many
# entries it holds.
transcripts = df.loc[:, "transcripts"].to_list()
len(transcripts)

In [None]:
# Write the text corpus: one transcript per line in DATA/train.txt.
# The encoding is given explicitly so that writing non-ASCII (Bengali) text
# does not depend on the platform's default locale encoding.
with open("DATA/train.txt", "w", encoding="utf-8") as f:
    for transcript in transcripts:
        # Keep only the text before the first tab; when there is no tab,
        # split() returns the whole string, so no separate branch is needed.
        f.write(transcript.split("\t")[0].strip() + "\n")

## Normalizing the text corpus (transcriptions of the training dataset)

In [None]:
from banglanlptoolkit import BnNLPNormalizerPlus

# Build the normalizer with its default settings and call it on the corpus
# file written by the previous cell.
normalizer = BnNLPNormalizerPlus()
# NOTE(review): the tokenizer cell below reads "DATA/trainnormalized.txt";
# presumably this call writes that file as a side effect -- confirm against
# the banglanlptoolkit documentation. `res` is not used again in this notebook.
res = normalizer("DATA/train.txt")

## Training the text tokenizer on custom data with a custom vocabulary size

In [None]:
python process_asr_text_tokenizer.py --data_file="DATA/trainnormalized.txt" \
    --data_root="tokenizer" \
    --vocab_size=256 \
    --tokenizer="spe" \
    --spe_type="bpe" \
    --log

## Edit the manifest file to drop text after the first tab and strip surrounding whitespace and newlines

In [1]:
import pandas as pd

# Load the JSON-lines manifest (one record per line) and preview the first rows.
df_manifest = pd.read_json(
    "DATA/nemo_manifest.json",
    orient="records",
    lines=True,
)
df_manifest.head()

Unnamed: 0,audio_filepath,duration,text
0,/home/sami/workspace/nemo-asr-training/DATA/bn...,6.3,আমি এই চেষ্টাটি একদমই করিনি
1,/home/sami/workspace/nemo-asr-training/DATA/bn...,8.42,এজন্য আগামীকাল ও মঙ্গলবার মুখ্যমন্ত্রীর দপ্তর ...
2,/home/sami/workspace/nemo-asr-training/DATA/bn...,4.68,ফ্রোজেন ওয়াটার হতিছে যেটি পানি ঠান্ডায় জমে বরফ...
3,/home/sami/workspace/nemo-asr-training/DATA/cv...,2.268,এই কাজের জন্য তিনি নোবেল পুরস্কার লাভ করেন।
4,/home/sami/workspace/nemo-asr-training/DATA/cv...,3.636,তিনি অটল দাঁড়িয়ে রইলেন যখন পরাজিতরা পালিয়ে গেল।


In [2]:
# Show the shortest and longest values in the "duration" column as a
# (min, max) pair.
durations = df_manifest["duration"]
durations.min(), durations.max()

(0.07200000000000001, 39.996)

In [None]:
# df_manifest["audio_filepath"] = "/home/sami/workspace/nemo-asr-training/DATA/" + df_manifest["audio_filepath"]
# df_manifest.head()

In [None]:
# from banglanlptoolkit import BnNLPNormalizer
# from tqdm import tqdm

# tqdm.pandas()

# normalizer = BnNLPNormalizer(allow_en=True)
# df_manifest["text"] = df_manifest["text"].progress_apply(normalizer.normalize_bn)
# df_manifest.head()

In [None]:
# Keep only the part of each transcript before the first tab and trim
# surrounding whitespace (including leading/trailing newlines).
# The vectorized .str accessor replaces the per-row lambda: splitting a string
# that has no tab yields the whole string, so the explicit membership check is
# unnecessary, and missing values pass through as NaN instead of raising a
# TypeError inside the lambda.
df_manifest["text"] = df_manifest["text"].str.split("\t", n=1).str[0].str.strip()
df_manifest.head()

In [None]:
# Print a summary of the manifest frame (columns, dtypes, non-null counts)
# to sanity-check it after the text cleanup.
df_manifest.info()

In [None]:
# Write the cleaned manifest back as JSON lines, overwriting the file it was
# loaded from. force_ascii=False keeps non-ASCII text as-is instead of
# escaping it.
df_manifest.to_json(
    "DATA/nemo_manifest.json",
    orient="records",
    lines=True,
    force_ascii=False,
)

## Split Manifest into train and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 0.1% of the manifest rows for validation; the fixed random_state
# makes the split repeatable.
df_train, df_valid = train_test_split(
    df_manifest,
    test_size=0.001,
    random_state=42,
)

In [None]:
# Write each split to its own JSON-lines manifest, leaving non-ASCII text
# unescaped.
for split_frame, manifest_path in (
    (df_train, "DATA/train_manifest.json"),
    (df_valid, "DATA/valid_manifest.json"),
):
    split_frame.to_json(manifest_path, lines=True, orient="records", force_ascii=False)

## Convert to Bucketing Dataset

In [None]:
# Run convert_to_tarred_audio_dataset.py on the training manifest, writing to
# DATA/train_bucket with 128 shards, 4 buckets, 24 workers and a fixed
# shuffle seed.
# NOTE(review): the duration check earlier in this notebook reports a range of
# roughly 0.072-39.996 s, so --min_duration=0.1 presumably excludes the very
# shortest clips -- confirm that this is intended.
!python convert_to_tarred_audio_dataset.py \
        --manifest_path=DATA/train_manifest.json \
        --target_dir=DATA/train_bucket \
        --num_shards=128 \
        --max_duration=40 \
        --min_duration=0.1 \
        --shuffle \
        --shuffle_seed=1 \
        --sort_in_shards \
        --workers=24 \
        --buckets_num=4

## Training

In [None]:
# Launch the training script as a shell command.
# NOTE(review): no arguments or config overrides are passed here, so the
# manifest, tokenizer and model settings presumably come from the script's
# default configuration -- confirm before running.
!python speech_to_text_hybrid_rnnt_ctc_bpe.py 