## Cloning github repositry that contains Tacotron 2


In [20]:
import os
os.chdir('/kaggle/working')

In [2]:
!git clone https://github.com/Fadi-S/tts-arabic-pytorch

fatal: destination path 'tts-arabic-pytorch' already exists and is not an empty directory.


In [3]:
!cd tts-arabic-pytorch && git pull origin master

From https://github.com/Fadi-S/tts-arabic-pytorch
 * branch            master     -> FETCH_HEAD
Already up to date.


In [None]:
!pip install gdown

## Downloading Pretrained model

In [None]:
import gdown

url = 'https://drive.google.com/uc?id=1FD2J-xUk48JPF9TeS8ZKHzDC_ZNBfLd8'

output = '/kaggle/working/'

gdown.download(url, output, quiet=False)

In [None]:
!unzip -o ar-tts-models.zip -d tts-arabic-pytorch/pretrained/

In [7]:
import os
os.chdir('/kaggle/working/tts-arabic-pytorch')

In [5]:
import pandas as pd

dataset_path = '/kaggle/input/egyptian-arabic-wavs'

# Load the index.csv file into a DataFrame
data = pd.read_csv('/kaggle/input/egyptian-arabic-wavs/index.csv')

files_list = os.listdir("/kaggle/input/egyptian-arabic-wavs/data")
files_dict = {file_name: True for file_name in files_list}

data['file_exists'] = data['audio_file'].apply(lambda x: x in files_dict)

data = data[data['file_exists'] == True]

audio_files = data['audio_file']
texts = data['text']

# Display the contents of index.csv
print(data.head())

     audio_file                                      text  gender  file_exists
0  IIqfET_1.wav    ازيكم يا جماعه عاملين ايه يا رب تكونوا  female         True
1  jGqkHh_2.wav  بخير وصحه وسعاده وهنا وكل حاجه حلوه طبعا  female         True
2  NE6jEu_3.wav     يا جماعه شايفيني بالعبايه وبالقران دي  female         True
3  Dmkv3J_4.wav   اعرفوا على طول ان انا كنت في السوق ولسه  female         True
4  H4s1qn_5.wav      طالعه فال المهم يا جماعه ان انا جايه  female         True


In [6]:
from sklearn.model_selection import train_test_split


# Split the data into training and validation sets
train_audio_files, val_audio_files, train_texts, val_texts = train_test_split(audio_files, texts, test_size=0.2, random_state=42)

train_data = pd.DataFrame({'filename': train_audio_files, 'text': train_texts})
val_data = pd.DataFrame({'filename': val_audio_files, 'text': val_texts})

In [7]:
train_data = pd.DataFrame(train_data, columns=['filename', 'text'])
val_data = pd.DataFrame(val_data, columns=['filename', 'text'])

# Display the data frame
train_data.head(10)

Unnamed: 0,filename,text
18538,CIenec_2055.wav,ابتديتها بدري قوي او ابتديت تاخدها من
41760,4RG7rS_3318.wav,لغايه النهارده لا حقيقي والله اشهد لهم
11015,BvhP3h_159.wav,بتدخل للمطبخ تشوف الاخطاء وكده بس قبل كل
42713,Ua7qh3_321.wav,عزيز النهارده احنا اه يعني يمكن تعرضنا
30385,c7xpAQ_255.wav,انا عايزك والله انا عارف ان كتير كمان من
20706,GDNg53_1205.wav,لازم اتكلم مع الدكتور وافهم منه كويس قوي
26901,51HH3k_3170.wav,كده كان حد عمله انت بتنزله على الجهاز بك
12745,zkufhO_132.wav,ونبقى ونبقى صحاب اللي هو نبقى بيبتدي بقى
2383,QpJyWS_617.wav,ا هلا عندي التزامات مطار كون اوكي بس فرق
9039,r6Cm99_224.wav,شخص مش باين على جسمه اي اعراض واضحه


### Preprocessing Text

In [8]:
import text

def preprocess(txt):
    txt = txt.replace(".", "")
    txt = txt.replace("!", "")
    txt = txt.replace(",", "")
    t_phon = text.arabic_to_buckwalter(txt)
    t_phon = text.buckwalter_to_phonemes(t_phon)
    return t_phon

In [9]:
train_phonemes,val_phonemes = [], []

for _, row in train_data.iterrows():
    train_phonemes.append(preprocess(row['text']))

for _,row in val_data.iterrows():
    val_phonemes.append(preprocess(row['text']))

In [10]:
train_data = train_data.assign(Phonemes=train_phonemes)
val_data = val_data.assign(Phonemes=val_phonemes)

In [11]:
train_data = train_data.drop('text', axis=1)
val_data = val_data.drop('text', axis=1)

In [12]:
train_data.head()

Unnamed: 0,filename,Phonemes
18538,CIenec_2055.wav,aa b t d ii0 t h aa + b d r ii0 + q w ii0 + uu...
41760,4RG7rS_3318.wav,l g AA ii0 h + l n h aa r d h + l aa + H q II0...
11015,BvhP3h_159.wav,b t d x l + l l m T b x + t $ uu0 f + l aa x T...
42713,Ua7qh3_321.wav,E z ii0 z + l n h aa r d h + H n aa + h + ii0 ...
30385,c7xpAQ_255.wav,aa n aa + E aa ii0 z k + w a l l h + n aa + E ...


In [13]:
import csv

train_data.to_csv('/kaggle/working/ready_train.txt', sep=' ', header=None, index=None, quoting=csv.QUOTE_NONNUMERIC)
val_data.to_csv('/kaggle/working/ready_val.txt', sep=' ', header=None, index=None, quoting=csv.QUOTE_NONNUMERIC)

### Preprocessing Audio files

In [14]:
#reading path for wav dataset
train_path = "/kaggle/input/egyptian-arabic-wavs/data/"
val_path = "/kaggle/input/egyptian-arabic-wavs/data/"

In [15]:
from utils.data import ArabDataset

train_dataset = ArabDataset('/kaggle/working/ready_train.txt', train_path)
test_dataset = ArabDataset('/kaggle/working/ready_val.txt', val_path)

100%|██████████| 34652/34652 [01:42<00:00, 338.68it/s]


Number of mel phonemes: 34648


100%|██████████| 8663/8663 [00:27<00:00, 320.39it/s]

Number of mel phonemes: 8663





## Load preprocessed data

In [8]:
import torch
from utils.data import ArabDatasetFromSerialized


train_data = torch.load('/kaggle/working/train_list.pt')
train_dataset = ArabDatasetFromSerialized(train_data[:10000])

test_data = torch.load('/kaggle/working/test_list.pt')
test_dataset = ArabDatasetFromSerialized(test_data[:2000])

# FineTuning model

In [9]:
def compute_accuracy(gate_out, gate_padded):
    # Apply sigmoid function to gate_out to get probabilities
    gate_prob = torch.sigmoid(gate_out)
    # Round the probabilities to get binary predictions (0 or 1)
    gate_pred = torch.round(gate_prob)
    # Compute accuracy by comparing predictions with targets
    correct = (gate_pred == gate_padded).sum().item()
    total = gate_padded.numel()
    accuracy = correct / total
    return accuracy

## Create empty state

In [None]:
checkpoints_path = "/kaggle/working/Checkpoint"
os.makedirs(checkpoints_path, exist_ok=True)
current_state = os.path.join(checkpoints_path, "states.pth")

if not os.path.exists(current_state):
    f = open(current_state, "w")
    f.write("{}")
    f.close()

In [10]:
import torch
import torch.nn.functional as F
import random
from utils.training import batch_to_device, save_states
from tqdm import tqdm

@torch.inference_mode()
def validate(model, test_loader, writer, device, n_iter):
    loss_sum = 0
    n_test_sum = 0

    model.eval()

    for batch in test_loader:
        text_padded, input_lengths, mel_padded, gate_padded, \
            output_lengths = batch_to_device(batch, device)

        y_pred = model(text_padded, input_lengths,
                       mel_padded, output_lengths,
                       torch.zeros_like(output_lengths))
        mel_out, mel_out_postnet, gate_pred, alignments = y_pred

        mel_loss = F.mse_loss(mel_out, mel_padded) + \
            F.mse_loss(mel_out_postnet, mel_padded)
        gate_loss = F.binary_cross_entropy_with_logits(gate_pred, gate_padded)
        loss = mel_loss + gate_loss

        loss_sum += mel_padded.size(0)*loss.item()
        n_test_sum += mel_padded.size(0)

    val_loss = loss_sum / n_test_sum

    idx = random.randint(0, mel_padded.size(0) - 1)
    mel_infer, *_ = model.infer(
        text_padded[idx:idx+1], input_lengths[idx:idx+1]*0, input_lengths[idx:idx+1])

    writer.add_sample(
        alignments[idx, :, :input_lengths[idx].item()],
        mel_out[idx], mel_padded[idx], mel_infer[0],
        output_lengths[idx], n_iter)

    writer.add_scalar('loss/val_loss', val_loss, n_iter)

    model.train()

    return val_loss

def training_loop(model,
                  optimizer,
                  train_loader,
                  test_loader,
                  writer,
                  device,
                  config,
                  n_epoch,
                  n_iter):

    model.train()
    net_config = {'n_mel_channels': 80,
              'n_symbols': 148,
              'padding_idx': 0,
              'symbols_embedding_dim': 384,
              'in_fft_n_layers': 6,
              'in_fft_n_heads': 1,
              'in_fft_d_head': 64,
              'in_fft_conv1d_kernel_size': 3,
              'in_fft_conv1d_filter_size': 1536,
              'in_fft_output_size': 384,
              'p_in_fft_dropout': 0.1,
              'p_in_fft_dropatt': 0.1,
              'p_in_fft_dropemb': 0.0,
              'out_fft_n_layers': 6,
              'out_fft_n_heads': 1,
              'out_fft_d_head': 64,
              'out_fft_conv1d_kernel_size': 3,
              'out_fft_conv1d_filter_size': 1536,
              'out_fft_output_size': 384,
              'p_out_fft_dropout': 0.1,
              'p_out_fft_dropatt': 0.1,
              'p_out_fft_dropemb': 0.0,
              'dur_predictor_kernel_size': 3,
              'dur_predictor_filter_size': 256,
              'p_dur_predictor_dropout': 0.1,
              'dur_predictor_n_layers': 2,
              'pitch_predictor_kernel_size': 3,
              'pitch_predictor_filter_size': 256,
              'p_pitch_predictor_dropout': 0.1,
              'pitch_predictor_n_layers': 2,
              'pitch_embedding_kernel_size': 3,
              'n_speakers': 1,
              'speaker_emb_weight': 1.0,
              'energy_predictor_kernel_size': 3,
              'energy_predictor_filter_size': 256,
              'p_energy_predictor_dropout': 0.1,
              'energy_predictor_n_layers': 2,
              'energy_conditioning': True,
              'energy_embedding_kernel_size': 3}
    for epoch in range(n_epoch, config.epochs):

        with tqdm(train_loader, desc=f"Epoch {epoch+1}/{config.epochs}", unit="batch") as t:
            for batch in t:

                text_padded, input_lengths, mel_padded, gate_padded, \
                    output_lengths = batch_to_device(batch, device)

                # Ensure mel_padded is not empty
                if mel_padded.numel() == 0:
                    continue 

                # Ensure mel_padded has at least two dimensions
                if mel_padded.dim() < 2:
                    continue

                y_pred = model(text_padded, input_lengths,
                               mel_padded, output_lengths,
                               torch.zeros_like(output_lengths))
                mel_out, mel_out_postnet, gate_out, _ = y_pred

                optimizer.zero_grad()

                # LOSS
                mel_loss = F.mse_loss(mel_out, mel_padded) + \
                    F.mse_loss(mel_out_postnet, mel_padded)
                gate_loss = F.binary_cross_entropy_with_logits(
                    gate_out, gate_padded)
                loss = mel_loss + gate_loss

                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config.grad_clip_thresh)
                optimizer.step()
                
                accuracy = compute_accuracy(gate_out, gate_padded)

                # LOGGING
                t.set_postfix(loss=loss.item(), grad_norm=grad_norm.item(), accuracy=accuracy)
                if n_iter % config.n_save_states_iter == 0:
                    save_states(f'states.pth', model, optimizer, n_iter, epoch,
                    net_config, config)

                if n_iter % config.n_save_backup_iter == 0 and n_iter > 0:
                    save_states(f'states_{n_iter}.pth', model, optimizer, n_iter, epoch,
                    net_config, config)

                n_iter += 1

        # VALIDATE
        val_loss = validate(model, test_loader, writer, device, n_iter)
        print(f"Validation loss: {val_loss}")




In [11]:
class config(object):

    def __init__ (self, batch=8, epochs = 10, grad_clip_thresh = 1.0, learning_rate = 1.0e-5, weight_decay = 1.0e-8, max_step = 3000):
        self.epochs = epochs
        self.grad_clip_thresh = grad_clip_thresh
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.max_step = max_step
        self.batch = batch
        self.n_save_states_iter = 100
        self.n_save_backup_iter = 1000
        self.checkpoint_dir = "/kaggle/working/Checkpoint"
        self.pretrained_dir = "/kaggle/working/Checkpoint/states.pth"


In [12]:
config = config(batch=128)

In [13]:
#merges a list of samples to form a mini-batch of Tensor Used when using batched loading from a map-style dataset.
def text_mel_collate_fn(batch, pad_value=0):

    input_lens_sorted, input_sort_ids = torch.sort(
        torch.LongTensor([len(x[0]) for x in batch]),
        dim=0, descending=True)
    max_input_len = input_lens_sorted[0]

    num_mels = batch[0][1].size(0)
    max_target_len = max([x[1].size(1) for x in batch])

    text_ids_pad = torch.LongTensor(len(batch), max_input_len)
    mel_pad = torch.FloatTensor(len(batch), num_mels, max_target_len)
    gate_pad = torch.FloatTensor(len(batch), max_target_len)
    output_lengths = torch.LongTensor(len(batch))
    

    text_ids_pad.zero_(), mel_pad.fill_(pad_value), gate_pad.zero_()

    for i in range(len(input_sort_ids)):
        text_ids, mel = batch[input_sort_ids[i]]
        text_ids_pad[i, :text_ids.size(0)] = text_ids
        mel_pad[i, :, :mel.size(1)] = mel
        gate_pad[i, mel.size(1)-1:] = 1
        output_lengths[i] = mel.size(1)

    return text_ids_pad, input_lens_sorted, \
        mel_pad, gate_pad, output_lengths


In [14]:
from torch.utils.data import DataLoader

# dataloaders
train_loader = DataLoader(train_dataset,
                              batch_size=config.batch,
                              collate_fn=text_mel_collate_fn,
                              shuffle=True, drop_last=True,
                              sampler=None)

test_loader = DataLoader(test_dataset,
                             batch_size=config.batch, drop_last=False,
                             shuffle=False, collate_fn=text_mel_collate_fn)

In [26]:
from models.tacotron2.tacotron2_ms import Tacotron2MS
from utils.logging import TBLogger

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# construct model
model = Tacotron2MS(n_symbol=40)
model = model.to(device)
model.decoder.decoder_max_step = config.max_step

# optimizer
optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=1.0e-3,
                                  weight_decay=config.weight_decay)

# resume from existing checkpoint
n_epoch, n_iter = 0, 0


state_dicts = torch.load(config.pretrained_dir, map_location=device)
if 'model' in state_dicts:
    model.load_state_dict(state_dicts['model'])
if 'optim' in state_dicts:
      optimizer.load_state_dict(state_dicts['optim'])
if 'epoch' in state_dicts:
      n_epoch = state_dicts['epoch']
if 'iter' in state_dicts:
      n_iter = state_dicts['iter']


writer = TBLogger("checkpoints/exp_tc2_adv")
    # start training
training_loop(model,
                  optimizer,
                  train_loader,
                  test_loader,
                  writer,
                  device,
                  config,
                  n_epoch,
                  n_iter)

2024-05-14 00:25:38.135358: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-14 00:25:38.135469: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-14 00:25:38.264065: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Epoch 1/500: 100%|██████████| 312/312 [12:29<00:00,  2.40s/batch, accuracy=0.998, grad_norm=0.845, loss=0.943]


Validation loss: 1.0774190139770508


Epoch 2/500: 100%|██████████| 312/312 [12:37<00:00,  2.43s/batch, accuracy=0.998, grad_norm=0.632, loss=0.977]


Validation loss: 0.9532157344818115


Epoch 3/500: 100%|██████████| 312/312 [12:39<00:00,  2.43s/batch, accuracy=0.997, grad_norm=2.26, loss=1.11]  


Validation loss: 0.9032972569465637


Epoch 4/500: 100%|██████████| 312/312 [12:36<00:00,  2.43s/batch, accuracy=0.997, grad_norm=3.37, loss=1.05]  


Validation loss: 0.8719017767906189


Epoch 5/500: 100%|██████████| 312/312 [12:26<00:00,  2.39s/batch, accuracy=0.998, grad_norm=1.96, loss=0.73]  


Validation loss: 0.8272540588378906


Epoch 6/500:  21%|██▏       | 67/312 [02:47<10:14,  2.51s/batch, accuracy=0.997, grad_norm=0.816, loss=0.993]


KeyboardInterrupt: 

In [29]:
import matplotlib.pyplot as plt
import IPython
from models.tacotron2 import Tacotron2Wave

model = Tacotron2Wave("/kaggle/working/Checkpoint/states.pth")
model = model.cuda()

wave, mel_spec = model.tts("بيسبسيب", return_mel=True, denoise=0.005)

print("Audio output (Tacotron2)")
IPython.display.Audio(data=wave, rate=22050, normalize=True)

Audio output (Tacotron2)
