In [2]:
import os
import numpy as np
import phonemizer
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
import keras as keras
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm

nltk.download('punkt')  # Download the necessary data for tokenization


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soumy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
gpus = tf.config.experimental.list_physical_devices('GPU')


if gpus:
    for gpu in gpus:
        print(gpu.name)
        tf.config.experimental.set_memory_growth(gpu, True)


tf.config.functions_run_eagerly()


/physical_device:GPU:0


False

In [4]:
import csv

# Function to load text and audio paths
def load_data_paths(dataset_path):
    text_paths = []
    audio_paths = []
    
    for root, _, files in os.walk(dataset_path):
        for file in files:
            if file.endswith(".csv"):
                text_paths.append(os.path.join(root, file))
            elif file.endswith(".wav"):
                audio_paths.append(os.path.join(root, file))
    
    return text_paths, audio_paths

# Load text from text paths
def load_texts(text_paths):
    texts = []
    for text_path in text_paths:
        with open(text_path, 'r', encoding='utf-8') as csvfile:
            for line in csvfile:
                fields = line.strip().split('|')
                if len(fields) >= 3:  # Ensure there are at least 3 fields
                    sentence = fields[2]
                    texts.append(sentence)

    return texts

# Preprocess text
path_to_ljspeech = 'LJSpeech-1.1'
text_paths, audio_paths = load_data_paths(path_to_ljspeech)
texts = load_texts(text_paths)

In [5]:
def preprocess_text(texts):
    token_texts = []
    print("Preprocessing text")
    for text in texts:
        tokenized_texts = word_tokenize(text)
        #print(tokenized_texts)
        token_texts.append(tokenized_texts)
    return token_texts


preprocessed_texts=preprocess_text(texts)


Preprocessing text


In [6]:
  
# Preprocess audio
def preprocess_audio(audio_paths, sample_rate=5000, n_mels=120):
    print("Preprocessing audio")
    mel_specs = []
    for path in audio_paths:
        audio, _ = librosa.load(path, sr=sample_rate)
        
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate, n_mels=n_mels)
        mel_specs.append(mel_spec)
    return mel_specs

# Load and preprocess LJSpeech data

preprocessed_mel_specs = preprocess_audio(audio_paths)

Preprocessing audio


In [7]:

print(len(preprocessed_mel_specs))
print(len(preprocessed_texts))

13100
13100


In [8]:
import gc
gc.collect()

3322

In [9]:
def convert_to_db(mel_specs):
    return 10 * np.log10(mel_specs + 1e-9)

preprocessed_mel_specs_db = [convert_to_db(mel_spec) for mel_spec in preprocessed_mel_specs]

# Split data into train and validation sets
train_size = int(0.8 * len(preprocessed_texts))
train_mel_specs, val_mel_specs = preprocessed_mel_specs_db[:train_size], preprocessed_mel_specs_db[:train_size]

print("Length of preprocessed_mel_specs_db:", len(preprocessed_mel_specs_db))
print("Train size:", train_size)
print("Length of train_mel_specs:", len(train_mel_specs))
print("Length of val_mel_specs:", len(val_mel_specs))

# Find the maximum number of frames among all mel spectrograms
max_frames = max(spec.shape[1] for spec in train_mel_specs)

# Pad or truncate mel spectrograms to have the same number of frames
padded_train_mel_specs = [np.pad(spec, ((0, 0), (0, max_frames - spec.shape[1])), mode='constant') for spec in train_mel_specs]
padded_val_mel_specs = [np.pad(spec, ((0, 0), (0, max_frames - spec.shape[1])), mode='constant') for spec in val_mel_specs]

# Define the number of frames (time steps) and number of mel bins
n_frames = padded_train_mel_specs[0].shape[1]
n_mels = padded_train_mel_specs[0].shape[0]



Length of preprocessed_mel_specs_db: 13100
Train size: 10480
Length of train_mel_specs: 10480
Length of val_mel_specs: 10480


In [10]:

# LSTM Model
model = Sequential([
    LSTM(units=256, return_sequences=True, input_shape=(n_frames, n_mels)),
    LSTM(units=256, return_sequences=True),
    Dense(units=n_mels)
])

optimizer = Adam(learning_rate=0.001)
loss_fn = MeanSquaredError()

# Compile the model
model.compile(optimizer=optimizer, loss=loss_fn)

# Reshape the input mel spectrograms
reshaped_train_mel_specs = np.array(padded_train_mel_specs).reshape(-1, n_frames, n_mels)
reshaped_val_mel_specs = np.array(padded_val_mel_specs).reshape(-1, n_frames, n_mels)

# Define the number of epochs and batch size
epochs = 1000
batch_size = 32

# Fit the model on training data
model.fit(reshaped_train_mel_specs, reshaped_train_mel_specs,
          validation_data=(reshaped_val_mel_specs, reshaped_val_mel_specs),
          batch_size=batch_size,
          epochs=epochs)

# Save the trained model
model.save('tts_model3.h5')

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78