In [8]:
# Enable IPython's autoreload extension so that edits to the project modules
# imported below are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
# Mode 2: reload modules automatically before executing each cell.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import torch
import torch.nn as nn 

from model.SpeechLP import SLP

from utils.Config import Config
from utils.MLS import MLSDataset
from utils.Trainer import Trainer

from torch.utils.data import DataLoader, random_split

In [10]:
# Print the active configuration (audio, text, model, training and data-path
# settings) so the run's parameters are recorded alongside its outputs.
Config.display()

Audio Settings:
  Sample Rate: 24000 Hz
  Min Audio Duration: 10 seconds
  Max Audio Duration: 20 seconds

Text Settings:
  Max Token : 64 tokens

Model Settings:
  Model Name: DiTTO-TTS
  Embedding Dim: 1472
  Num Layers: 1
  Attention Heads: 1

Training Settings:
  Batch Size: 8
  Learning Rate: 0.0001
  Betas: [0.9, 0.999]
  Epochs: 10
  Nb samples: 10000
  Device: cuda

Data Settings:
  Train path: C:/Cours-Sorbonne/M2/UE_DEEP/AMAL/Projet/data/mls_french_opus/mls_french_opus/train
  Test path: C:/Cours-Sorbonne/M2/UE_DEEP/AMAL/Projet/data/mls_french_opus/mls_french_opus/test
  Dev path: C:/Cours-Sorbonne/M2/UE_DEEP/AMAL/Projet/data/mls_french_opus/mls_french_opus/dev


In [11]:
def _build_split(data_dir, shuffle):
    """Create the MLS dataset found under ``data_dir`` and a DataLoader over it.

    Every split shares the same tokenizer length, sampling rate, batch size and
    collate function (all read from ``Config`` / ``MLSDataset``); only the
    directory and the shuffling policy differ.

    Args:
        data_dir: Directory holding one MLS split (train, dev or test).
        shuffle: Whether the DataLoader reshuffles the samples at every epoch.

    Returns:
        A ``(dataset, loader)`` tuple.
    """
    dataset = MLSDataset(
        data_dir=data_dir,
        max_text_token_length=Config.MAX_TOKEN_LENGTH,
        sampling_rate=Config.SAMPLE_RATE,
    )
    loader = DataLoader(
        dataset,
        batch_size=Config.BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=MLSDataset.collate_fn,
    )
    return dataset, loader


# Only the training split is shuffled. Validation and test batches are served
# in a fixed order so evaluation runs are reproducible and comparable from one
# epoch (or one run) to the next.
train_set, train_loader = _build_split(Config.TRAIN_PATH, shuffle=True)
val_set, val_loader = _build_split(Config.DEV_PATH, shuffle=False)
test_set, test_loader = _build_split(Config.TEST_PATH, shuffle=False)

Tokenizing transcripts and saving results...


258213it [01:18, 3293.83it/s]


Tokenizing transcripts and saving results...


2416it [00:00, 2421.25it/s]


Tokenizing transcripts and saving results...


2426it [00:00, 2451.72it/s]


In [None]:
from tqdm import tqdm

# Smoke test: pull a handful of training batches and move their text tensors
# to the configured device, to check the data pipeline before training.
N_SMOKE_BATCHES = 11  # same number of batches as before (indices 0..10)

batch_iter = iter(train_loader)
for i in tqdm(range(N_SMOKE_BATCHES)):
    try:
        # The fetch must happen inside the try: a DataLoader builds each batch
        # when its iterator is advanced, so an error raised while a sample is
        # being read surfaces from next(), not from the .to() call below.
        # NOTE(review): presumably the UnicodeDecodeError comes from reading
        # transcripts inside MLSDataset -- confirm against utils/MLS.py.
        batch = next(batch_iter)
        text = {key: val.to(Config.DEVICE) for key, val in batch["text"].items()}
    except StopIteration:
        # The loader holds fewer than N_SMOKE_BATCHES batches; nothing left to check.
        break
    except UnicodeDecodeError as e:
        print(f"[ERROR] UnicodeDecodeError in batch {i}: {e}")
        break
    

In [None]:
# Instantiate the network from the configured max audio duration, attention
# head count and layer count (passed positionally, in that order), then move it
# to the configured device.
model = SLP(
    Config.MAX_AUDIO_DURATION,
    Config.NHEAD,
    Config.NUM_LAYERS,
).to(Config.DEVICE)
criterion = nn.CrossEntropyLoss()
# The optimizer *class* (not an instance) is handed to the trainer, together
# with the learning rate given to fit() below.
optimizer = torch.optim.AdamW

trainer = Trainer()
# Fluent configuration followed by the training run; a checkpoint interval of 1
# is requested.
(
    trainer
    .set_model(model, name="SLP")
    .set_criterion(criterion)
    .set_optimizer(optimizer)
    .fit(
        train_data=train_loader,
        validation_data=val_loader,
        epochs=Config.EPOCHS,
        learning_rate=Config.LEARNING_RATE,
        checkpoint_interval=1,
    )
)