In [181]:
#hparams.py

"""Hyper parameters."""
__author__ = 'Erdene-Ochir Tuguldur'


class HParams:
    """Hyper parameters"""

    disable_progress_bar = False  # set True if you don't want the progress bar in the console

    logdir = "logdir"  # log dir where the checkpoints and tensorboard files are saved

    # audio.py options, these values are from https://github.com/Kyubyong/dc_tts/blob/master/hyperparams.py
    reduction_rate = 4  # melspectrogram reduction rate, don't change because SSRN is using this rate
    n_fft = 2048 # fft points (samples)
    n_mels = 80  # Number of Mel banks to generate
    power = 1.5  # Exponent for amplifying the predicted magnitude
    n_iter = 50  # Number of inversion iterations
    preemphasis = .97
    max_db = 100
    ref_db = 20
    sr = 22050  # Sampling rate
    frame_shift = 0.0125  # seconds
    frame_length = 0.05  # seconds
    hop_length = int(sr * frame_shift)  # samples. =276.
    win_length = int(sr * frame_length)  # samples. =1102.
    max_N = 180  # Maximum number of characters.
    max_T = 210  # Maximum number of mel frames.

    e = 128  # embedding dimension
    d = 256  # Text2Mel hidden unit dimension
    c = 512+128  # SSRN hidden unit dimension

    dropout_rate = 0.05  # dropout

    # Text2Mel network options
    text2mel_lr = 0.005  # learning rate
    text2mel_max_iteration = 3000  # max train step
    text2mel_weight_init = 'none'  # 'kaiming', 'xavier' or 'none'
    text2mel_normalization = 'layer'  # 'layer', 'weight' or 'none'
    text2mel_basic_block = 'gated_conv'  # 'highway', 'gated_conv' or 'residual'

    # SSRN network options
    ssrn_lr = 0.0005  # learning rate
    ssrn_max_iteration = 1500  # max train step
    ssrn_weight_init = 'kaiming'  # 'kaiming', 'xavier' or 'none'
    ssrn_normalization = 'weight'  # 'layer', 'weight' or 'none'
    ssrn_basic_block = 'residual'  # 'highway', 'gated_conv' or 'residual'


In [None]:
!pip install tensorflow

Collecting tensorflow
  Obtaining dependency information for tensorflow from https://files.pythonhosted.org/packages/ed/30/310fee0477ce46f722c561dd7e21eebca0d1d29bdb3cf4a2335b845fbba4/tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading tensorflow-2.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Obtaining dependency information for absl-py>=1.0.0 from https://files.pythonhosted.org/packages/01/e4/dc0a1dcc4e74e08d7abedab278c795eef54a224363bb18f5692f416d834f/absl_py-2.0.0-py3-none-any.whl.metadata
  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting flatbuffers>=23.1.21 (from tensorflow)
  Obtaining dependency information for flatbuffers>=23.1.21 from https://files.pythonhosted.org/packages/6f/12/d5c79ee252793ffe845d58a913197bfa02ae9a0b

In [182]:
#audio.py

"""These methods are copied from https://github.com/Kyubyong/dc_tts/"""

import os
import copy
import librosa
import scipy.io.wavfile
import numpy as np
from tqdm import tqdm
from scipy import signal
hp = HParams()

def spectrogram2wav(mag):
    '''# Generate wave file from linear magnitude spectrogram
    Args:
      mag: A numpy array of (T, 1+n_fft//2)
    Returns:
      wav: A 1-D numpy array.
    '''
    # transpose
    mag = mag.T

    # de-noramlize
    mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db

    # to amplitude
    mag = np.power(10.0, mag * 0.05)

    # wav reconstruction
    wav = griffin_lim(mag ** hp.power)

    # de-preemphasis
    wav = signal.lfilter([1], [1, -hp.preemphasis], wav)

    # trim
    wav, _ = librosa.effects.trim(wav)

    return wav.astype(np.float32)


def griffin_lim(spectrogram):
    '''Applies Griffin-Lim's raw.'''
    X_best = copy.deepcopy(spectrogram)
    for i in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        est = librosa.stft(X_t, hp.n_fft, hp.hop_length, win_length=hp.win_length)
        phase = est / np.maximum(1e-8, np.abs(est))
        X_best = spectrogram * phase
    X_t = invert_spectrogram(X_best)
    y = np.real(X_t)

    return y


def invert_spectrogram(spectrogram):
    '''Applies inverse fft.
    Args:
      spectrogram: [1+n_fft//2, t]
    '''
    return librosa.istft(spectrogram, hp.hop_length, win_length=hp.win_length, window="hann")


def get_spectrograms(fpath):
    '''Parse the wave file in `fpath` and
    Returns normalized melspectrogram and linear spectrogram.
    Args:
      fpath: A string. The full path of a sound file.
    Returns:
      mel: A 2d array of shape (T, n_mels) and dtype of float32.
      mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
    '''
    # Loading sound file
    y, sr = librosa.load(fpath, sr=hp.sr)

    # Trimming
    y, _ = librosa.effects.trim(y)

    # Preemphasis
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length)

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram
    #mel_basis = librosa.filters.mel(hp.sr, hp.n_fft, hp.n_mels)  # (n_mels, 1+n_fft//2)
    mel_basis = librosa.filters.mel(sr=22050, n_fft=2048, n_mels=128)
    mel = np.dot(mel_basis, mag)  # (n_mels, t)

    # to decibel
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, mag


def save_to_wav(mag, filename):
    """Generate and save an audio file from the given linear spectrogram using Griffin-Lim."""
    wav = spectrogram2wav(mag)
    scipy.io.wavfile.write(filename, hp.sr, wav)


def preprocess(dataset_path, speech_dataset):
    """Preprocess the given dataset."""
    wavs_path = os.path.join(dataset_path, 'wavs')
    mels_path = os.path.join(dataset_path, 'mels')
    if not os.path.isdir(mels_path):
        os.mkdir(mels_path)
    mags_path = os.path.join(dataset_path, 'mags')
    if not os.path.isdir(mags_path):
        os.mkdir(mags_path)

    for fname in tqdm(speech_dataset.fnames):
        mel, mag = get_spectrograms(os.path.join(wavs_path, '%s.wav' % fname))

        t = mel.shape[0]
        # Marginal padding for reduction shape sync.
        num_paddings = hp.reduction_rate - (t % hp.reduction_rate) if t % hp.reduction_rate != 0 else 0
        mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant")
        mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant")
        # Reduction
        mel = mel[::hp.reduction_rate, :]

        np.save(os.path.join(mels_path, '%s.npy' % fname), mel)
        np.save(os.path.join(mags_path, '%s.npy' % fname), mag)


In [183]:
#ljspeech.py

"""Data loader for the LJSpeech dataset. See: https://keithito.com/LJ-Speech-Dataset/"""
import os
import re
import codecs
import unicodedata
import numpy as np

from torch.utils.data import Dataset

#vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"  # P: Padding, E: EOS.
vocab = "PE অআইঈউঊঋএঐওঔা ি ী ু ূ ৃ ে ৈ ো ৌক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণত থ দ ধ ন প ফ ব ভমযরলশষসহড়ঢ়য়ৎংঃঁ্ঽ‍্য‍  ‍্র'"
#vocab = "PE অআইঈউঊঋএঐওঔা ি ী ু ূ ৃ ে ৈ ো ৌক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণত থ দ ধ ন প ফ ব ভমযরলশষসহড়ঢ়য়ৎংঃঁ্ঽ‍্য‍  ‍্র'.?"
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}


def text_normalize(text):
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    #print(text)
    #text = text.lower()
    text = re.sub("[^{}]()".format(vocab), " ", text)
    text = re.sub("[ ]+", " ", text)
    return text

def read_metadata(metadata_file):
    fnames, text_lengths, texts = [], [], []
    transcript = os.path.join(metadata_file)
    #transcript = "/content/drive/My Drive/TTS_B/datasets/LJSpeech-1.1/line_index.tsv"
    lines = codecs.open(transcript, 'r', 'utf-8').readlines()
    for line in lines:
        fname, text = line.strip().split("\t")

        fnames.append(fname)

        text = text_normalize(text) + "E"  # E: EOS
        text = [char2idx[char] for char in text]
        text_lengths.append(len(text))
        texts.append(np.array(text, np.float32))

    return fnames, text_lengths, texts


def get_test_data(sentences, max_n):
    normalized_sentences = [text_normalize(line).strip() + "E" for line in sentences]  # text normalization, E: EOS
    texts = np.zeros((len(normalized_sentences), max_n + 1), np.float32)
    for i, sent in enumerate(normalized_sentences):
        texts[i, :len(sent)] = [char2idx[char] for char in sent]
    return texts


class LJSpeech(Dataset):
    def __init__(self, keys, dir_name='bn_bd'):
        self.keys = keys
        self.path = os.path.join("/home/nipu/ml", dir_name)
        self.fnames, self.text_lengths, self.texts = read_metadata(os.path.join(self.path, "line_index.tsv"))

    def slice(self, start, end):
        self.fnames = self.fnames[start:end]
        self.text_lengths = self.text_lengths[start:end]
        self.texts = self.texts[start:end]

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, index):
        data = {}
        if 'texts' in self.keys:
            data['texts'] = self.texts[index]
        if 'mels' in self.keys:
            # (39, 80)
            data['mels'] = np.load(os.path.join(self.path, 'mels', "%s.npy" % self.fnames[index]))
        if 'mags' in self.keys:
            # (39, 80)
            data['mags'] = np.load(os.path.join(self.path, 'mags', "%s.npy" % self.fnames[index]))
        if 'mel_gates' in self.keys:
            data['mel_gates'] = np.ones(data['mels'].shape[0], dtype=np.int)  # TODO: because pre processing!
        if 'mag_gates' in self.keys:
            data['mag_gates'] = np.ones(data['mags'].shape[0], dtype=np.int)  # TODO: because pre processing!
        return data


he librosa.filters.mel() function creates a Mel filter-bank. This produces a linear transformation matrix to project FFT bins onto Mel-frequency bins.

The Mel scale is a quasi-logarithmic function of acoustic frequency designed such that perceptually similar pitch intervals (e.g. octaves) appear equal in width over the full hearing range. This makes it a useful scale for representing and analyzing audio signals, as it more closely matches the way humans perceive sound.

Mel filter-banks are commonly used in a variety of audio processing tasks, such as speech recognition, music information retrieval, and automatic speaker recognition.

The librosa.filters.mel() function takes the following arguments:

sr: The sampling rate of the incoming signal (in Hz).
n_fft: The number of FFT components.
n_mels: The number of Mel bands to generate.
fmin: The lowest frequency (in Hz).
fmax: The highest frequency (in Hz). If None, use fmax = sr / 2.0.
htk: Whether to use the HTK formula instead of Slaney.
norm: The type of normalization to apply to the filters. Can be None, slaney, or a number.
dtype: The data type of the output basis.
The function returns a NumPy array of shape (n_mels, 1 + n_fft / 2), which represents the Mel filter-bank.

Here is an example of how to use the librosa.filters.mel() function:

Python
import librosa

# Create a Mel filter-bank with 128 Mel bands
melfb = librosa.filters.mel(sr=22050, n_fft=2048, n_mels=128)

# Compute the Mel spectrogram of an audio signal
audio_data, sr = librosa.load('audio.wav')
mel_spectrogram = librosa.feature.melspectrogram(audio_data, sr=sr, S=melfb)
Use code with caution. Learn more
The mel_spectrogram variable will now contain a NumPy array of shape (n_mels, n_frames), which represents the Mel spectrogram of the audio signal.

Mel filter-banks are a powerful tool for audio processing, and the librosa.filters.mel() function makes it easy to create them in Python.

In [184]:
import os
import re
import codecs
import unicodedata
import numpy as np
import pandas as pd

from torch.utils.data import Dataset

In [185]:
vocab = "PE অআইঈউঊঋএঐওঔা ি ী ু ূ ৃ ে ৈ ো ৌক খ গ ঘ ঙ চ ছ জ ঝ ঞ ট ঠ ড ঢ ণত থ দ ধ ন প ফ ব ভমযরলশষসহড়ঢ়য়ৎংঃঁ্ঽ‍্য‍  ‍্র'"
char2idx = {char: idx for idx, char in enumerate(vocab)}
idx2char = {idx: char for idx, char in enumerate(vocab)}

In [186]:
def text_normalize(text):
    text = ''.join(char for char in unicodedata.normalize('NFKC', text) # NFKC = 
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    print(text)
    text = re.sub("[^{}]()".format(vocab), " ", text)
    text = re.sub("[ ()]+", " ", text)
    return text

In [187]:
ss = text_normalize("আমার,++ নাম :(শামীম) ক্কা? কক্ষ মুখ্য পত্র, বন্ধু")
print(unicodedata.normalize('NFKC', "আমার,++ নাম :(শামীম) ক্কা? কক্ষ মুখ্য পত্র, বন্ধু"))

আমার,++ নাম :(শামীম) ককা? ককষ মখয পতর, বনধ
আমার,++ নাম :(শামীম) ক্কা? কক্ষ মুখ্য পত্র, বন্ধু


In [188]:
lines = codecs.open('bn_bd/line_index.tsv', 'r', 'utf-8').readlines()
len(lines[1])
line = lines[5]
print(line)

ban_00737_00107291991	কেয়া ডেভেলপারস দেশের বিভিন্ন স্থানে স্থাপনা তৈরি করে থাকে



In [189]:
label, text = line.strip().split("\t")
print(label.strip(), "\n", text.strip())

ban_00737_00107291991 
 কেয়া ডেভেলপারস দেশের বিভিন্ন স্থানে স্থাপনা তৈরি করে থাকে


In [190]:
# TTS_Bn
import os
import sys
import torch
from os.path import exists, join, expanduser


In [191]:
datasets_path = '/home/nipu/ml'
dataset_path = os.path.join(datasets_path, 'bn_bd')
print(dataset_path)

if os.path.isdir(dataset_path) and False:
  print("BN dataset folder already exists")


/home/nipu/ml/bn_bd


In [None]:
!pip install tensorflow==2.0.0-alpha0

In [192]:
print("pre processing...")
lj_speech = LJSpeech([])
# print(lj_speech)
# print(dataset_path, lj_speech)
preprocess(dataset_path, lj_speech)

pre processing...
এইচআর টেকসটাইল বাংলাদেশের ভেতরে একাধিক আউটলেটের মাধযমে শাডি বাচচাদের পোশাক মহিলাদের পোশাক এবং অনযানয টেকসটাইল পণয উৎপাদন ও বিপণন করে
সটযানডারড বযাংক এ ইসলামী বযাংকিং এর সবিধা রযেছে
লাফারজ সরমা সিমেনট সরবাধিক বযবহত সিমেনট উৎপাদন করে
পিপলস ইনসযরেনস অব চাযনা ছেষটটি বছর আগে বযবসা চাল করে
বযগেস একটি ইনডাসটরিযাল গরপ
কেযা ডেভেলপারস দেশের বিভিনন সথানে সথাপনা তৈরি করে থাকে
ভেরাইজন কমিউনিকেশনস একটি আমেরিকান বরডবযানড ও টেলিযোগাযোগ কোমপানি
মেটলাইফ বিশবের দরত বিসতত ইনসযরেনস কোমপানি
সাইফ পাওযারটেক পরাইভেট কোমপানি হিসেবে নিবনধিত এবং পরিচালিত
অলিমপিক ইনডাসটরিজ জীবনকে সহজ করে তলেছে
বেইজিং অটোমোটিভ গরপ একটি চীনা রাষটরীয উদযোগ ও নিযনতরণকারী কোমপানি
পরিবহন খাতে আফতাব অটোমোবাইলস একটি অতি পরিচিত নাম
চাযনা কমিউনিকেশনস কনসটরাকশন চীনের বহততম বনদর নিরমাণ ও নকশা এবং ডরেজিং কোমপানি
মেঘনা কনডেনসড মিলক ইনডাসটরিজ এর অতি জনপরিয বরযানড হলো ফরেশ
মারবেনি তার কোমপানির বিশবাস হিসেবে সততা উদভাবন ও ঐকয ধারণ করে
ডাউ কেমিকযাল আমেরিকান কেমিকযাল কাউনসিলের সদসয
লিবারটি মিউচযাল ইনসযরেনস গরপ একটি আমেরিকান বিমা কো

100%|██████████| 1891/1891 [01:39<00:00, 19.07it/s]
