In [None]:
from google.colab import drive
drive.mount('/content/drive')
# # Install nessecary libraries
!pip install transformers>=4.48.0
!pip install speechbrain


## Dataset preparation

In [None]:
import os

# Convert .m4a audio to wav
def _wav_stem_for(m4a_name):
    """Return the output file stem (without extension) for one source recording.

    The recordings are numbered in parentheses, e.g. ``"Recording (12).m4a"``.
    The text inside the last complete ``(...)`` pair is appended to the fixed
    ``"000000"`` prefix. A name without a complete pair counts as recording 1.
    """
    import re  # local import: the cell only imports `os` at its top

    labels = re.findall(r"\(([^()]+)\)", m4a_name)
    return "000000" + (labels[-1] if labels else "1")


def save_wav(path, f_path):
    """Convert every ``.m4a`` file in ``path`` to a mono 16 kHz wav in ``f_path``.

    Args:
        path: Directory holding the source ``.m4a`` recordings.
        f_path: Output directory; created if it does not exist.

    Each output is named ``000000<number>.wav`` where ``<number>`` is the text
    in parentheses in the source name (``1`` if there is none). One line is
    printed per file, stating whether the conversion succeeded.

    Raises:
        FileNotFoundError: if the ``ffmpeg`` executable is not installed.
    """
    import subprocess  # local import: the cell only imports `os` at its top

    os.makedirs(f_path, exist_ok=True)

    for m4a_file in sorted(os.listdir(path)):
        if not m4a_file.endswith(".m4a"):
            continue

        input_path = os.path.join(path, m4a_file)
        output_path = os.path.join(f_path, _wav_stem_for(m4a_file) + ".wav")

        # An argument list (no shell) keeps names containing quotes, "$" or
        # backticks intact. "-n" makes ffmpeg refuse to overwrite an existing
        # output instead of waiting for an interactive answer on re-runs.
        command = [
            "ffmpeg", "-n",
            "-i", input_path,
            "-ac", "1",
            "-ar", "16000",
            output_path,
        ]
        result = subprocess.run(command)

        # Report success only when ffmpeg actually succeeded.
        if result.returncode == 0:
            print(f"Transformed: {m4a_file} -> {os.path.basename(output_path)}")
        else:
            print(
                f"FAILED (ffmpeg exit code {result.returncode}): "
                f"{m4a_file} -> {os.path.basename(output_path)}"
            )

# Both recording sets are converted into the same wav folder.
f_path = "drive/My Drive/audiobooks/wav/"
for path in (
    "drive/My Drive/audiobooks/one_sentence/",
    "drive/My Drive/audiobooks/two_sentence/",
):
    save_wav(path, f_path)


In [None]:
import nltk
# Download the Punkt tokenizer resources; presumably these back sent_tokenize, which
# split_sentences calls below.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
import csv

# Convert data to dataframe - text (transcript), audio
def load_text(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def split_sentences(text, language='czech'):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    ``language`` is forwarded to the tokenizer unchanged and defaults to Czech.
    """
    sentence_list = sent_tokenize(text, language=language)
    return sentence_list

def group_sentences(sentences, group_size):
    """Join consecutive sentences into space-separated groups.

    Every group holds ``group_size`` sentences, except possibly the last one,
    which takes whatever remains.
    """
    grouped = []
    for start in range(0, len(sentences), group_size):
        chunk = sentences[start:start + group_size]
        grouped.append(' '.join(chunk))
    return grouped

def save_metadata(segments, output_path='drive/My Drive/audiobooks/metadata.csv',len_of_the_first_segments=0):
    """Write a ``filename|text`` metadata file with one row per segment.

    Args:
        segments: Transcript segments, in recording order.
        output_path: Destination CSV file; it is overwritten.
        len_of_the_first_segments: Offset added to the file numbering, so a
            second transcript can continue where the first one stopped.

    Row ``k`` (0-based) is named ``000000<k + 1 + offset>.wav``.
    """
    # Original author's note: I filtered out 40 between 133 and 172.wav
    rows = (
        [f"000000{position + 1 + len_of_the_first_segments}.wav", segment]
        for position, segment in enumerate(segments)
    )
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile, delimiter='|').writerows(rows)
    print(f"metadata.csv uloženo s {len(segments)} záznamy.")
def concatenate_non_uppercase(lst):
    """Merge fragments that do not start with an uppercase letter into their predecessor.

    The first item always opens the result. Each later non-empty item whose
    first character is not uppercase is appended to the previous entry,
    separated by one space. Empty items and items starting with an uppercase
    letter open a new entry. An empty or ``None`` input yields ``[]``.
    """
    if not lst:
        return []

    merged = [lst[0]]
    for fragment in lst[1:]:
        opens_new_entry = (not fragment) or fragment[0].isupper()
        if opens_new_entry:
            merged.append(fragment)
        else:
            merged[-1] = merged[-1] + ' ' + fragment
    return merged

txt_path = "drive/My Drive/audiobooks/one_sentence/transcript_part_1.txt"
group_size = 1         # one or more sentences per segment

# Flatten line breaks and drop quotation marks and ellipses before sentence splitting
text = (
    load_text(txt_path)
    .replace("\n", " ")
    .replace('“', '')
    .replace('„', '')
    .replace('...', '')
)
print(text)

sentences = split_sentences(text)
sentences_copy = concatenate_non_uppercase(sentences)
segments = group_sentences(sentences_copy, group_size)
save_metadata(segments, output_path='drive/My Drive/audiobooks/metadata_one.csv')

# The segment count is reused as the numbering offset of the second transcript
len_of_the_first_segments = len(segments)
print(len_of_the_first_segments)
segments

In [None]:
txt_path = "drive/My Drive/audiobooks/two_sentence/transcript_2_vety.txt"
group_size = 2         # one or more sentences per segment

# Flatten line breaks and drop quotation marks and ellipses before sentence splitting
text = (
    load_text(txt_path)
    .replace("\n", " ")
    .replace('“', '')
    .replace('„', '')
    .replace('...', '')
)
print(text)

sentences = split_sentences(text)
sentences_copy = concatenate_non_uppercase(sentences)
segments = group_sentences(sentences_copy, group_size)

# File numbering continues after the segments of the first transcript
save_metadata(
    segments,
    output_path='drive/My Drive/audiobooks/metadata_two.csv',
    len_of_the_first_segments=len_of_the_first_segments,
)
segments

In [None]:
from transformers import SpeechT5Processor
# Load the SpeechT5 processor; its tokenizer is used below
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)

# Collect the token strings of the pretrained vocabulary and display them
tokenizer = processor.tokenizer
tokenizer_vocab = set(tokenizer.get_vocab())
tokenizer_vocab

In [None]:
import pandas as pd
# Read both pipe-separated metadata files (no header row) and stack them into one frame
metadata_path = "drive/My Drive/audiobooks/metadata_one.csv"
df1 = pd.read_csv(metadata_path, sep='|', header=None).set_axis(
    ["filename", "text"], axis="columns"
)

metadata_path = "drive/My Drive/audiobooks/metadata_two.csv"
df2 = pd.read_csv(metadata_path, sep='|', header=None).set_axis(
    ["filename", "text"], axis="columns"
)

df = pd.concat([df1, df2], ignore_index=True)
df


In [None]:
# Print the characters that occur in the transcripts but are missing from the tokenizer vocabulary
l = [c for char in df["text"] for c in char]
print("All chars: ",set(l))
print("Not in vocab: ",set(l)-tokenizer_vocab)

In [None]:
# Map Czech letters (presumably those the tokenizer check reported as missing) to an
# English-like transcription. Every key is a single character; an empty value drops it.
# NOTE(review): only the uppercase letters Ř, Č, Š and Ž have entries, and 'Š' maps to
# mixed-case 'Sh' while the other three map to all-caps pairs — confirm this is intended.

map_czech_letters = {
    'í': 'ee', # as in "see"
    'ř': 'rz', # nothing similar in English
    'Ř': 'RZ', # nothing similar in English
    'ý': 'yy', # long y, written doubled
    'ě': 'ye', # as in "yes"
    'č': 'cz', # 'ch' would sound closer, but 'ch' is a letter of its own in Czech
    'Č': 'CZ', # 'ch' would sound closer, but 'ch' is a letter of its own in Czech
    'ů': 'uu',
    'ú': 'uu',
    'š': 'sh', # as in "shoe"
    'Š':'Sh',
    'ď': 'du', # as in "Duke"
    'ť': 'tu', # as in "tune"
    'ň': 'ny', # as in "canyon"
    'ž': 'zh', # 'zs' would also be possible
    'Ž': 'ZH', # uppercase counterpart of 'ž'
    'á': 'aa',
    'ö': 'o', # read as plain "o" in the recording (the name Jörgen occurs in the text)
    '«':'',
    '–':'-',
    '»':'',
}
def replace_czech_letters(text, mapping):
    """Return ``text`` with each character replaced by its ``mapping`` entry.

    Characters without an entry are kept unchanged. Lookup is per character,
    so only single-character keys can ever match.
    """
    pieces = []
    for character in text:
        pieces.append(mapping.get(character, character))
    return ''.join(pieces)
# Add the transliterated transcript as a new column next to the original text
df["text_phonetic"] = df["text"].apply(replace_czech_letters, args=(map_czech_letters,))
df

In [None]:
# Check again: after the mapping, the second set printed below should be empty
l = [c for char in df["text_phonetic"] for c in char]
print(set(l))
print(set(l)-tokenizer_vocab)

In [None]:
import numpy as np
# `wavfile` was previously imported only in the last cell of the notebook, so this
# cell raised NameError after Restart & Run All.
from scipy.io import wavfile

# Build the audio records in Hugging Face style (path / array / sampling_rate) so the
# processor can convert them later
audio_dir = "drive/My Drive/audiobooks/wav"
audio_list = []
transcriptions = []

for _, row in df.iterrows():
    filename = row["filename"]
    text = row["text_phonetic"]
    filename = filename.replace(".wav", "")
    audio_path = os.path.join(audio_dir, f"{filename}.wav")

    sampling_rate, array = wavfile.read(audio_path)
    # wavfile.read returns raw integer PCM for integer WAV formats (int16 for 16-bit
    # files). Scale it to floats in [-1, 1) instead of casting the raw integers to
    # float16, which kept the +/-32768 amplitude range and rounded large samples.
    # NOTE(review): 8-bit WAV is unsigned (uint8) and would need an offset as well;
    # assumed not to occur — confirm the wav files are 16-bit.
    if np.issubdtype(array.dtype, np.integer):
        full_scale = float(np.iinfo(array.dtype).max) + 1.0  # 32768.0 for int16
        array = array.astype(np.float32) / full_scale
    else:
        array = array.astype(np.float32)

    audio_list.append({
        "path": audio_path,
        "array": array,
        "sampling_rate": sampling_rate
    })
    transcriptions.append(text)

dataset_dict = {
    "audio": audio_list,
    "transcription": transcriptions
}

# Show only the clip count; echoing dataset_dict would dump every sample array into the output
len(audio_list)

In [None]:
import speechbrain
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# A second pretrained model is needed to create the embedding of my voice
model_name = "speechbrain/spkrec-xvect-voxceleb"


device = "cuda" if torch.cuda.is_available() else "cpu"

classifier = EncoderClassifier.from_hparams(
    source=model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", model_name)
)
wav_dir = "drive/My Drive/audiobooks/wav"      # e.g. "data/wavs"

# One embedding is enough for a single voice. Take the first recording in sorted order,
# so the choice does not depend on the arbitrary order of os.listdir.
wav_files = sorted(fname for fname in os.listdir(wav_dir) if fname.endswith(".wav"))
if not wav_files:
    # Previously the NaN-filled placeholder tensor was used silently in this case.
    raise FileNotFoundError(
        f"No .wav files found in {wav_dir!r}; cannot compute a speaker embedding."
    )
path = os.path.join(wav_dir, wav_files[0])

# Load the audio
signal, fs = torchaudio.load(path)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)

# Move it to the device
signal = signal.to(device)

# Create the embedding
with torch.no_grad():
    embedding = classifier.encode_batch(signal)
    embedding = torch.nn.functional.normalize(embedding, dim=2)
    embedding = embedding.squeeze().cpu()
    embedding = embedding.to(torch.float16)

# Store it in row 0. The old code indexed with the directory position, which raised
# IndexError on this one-row tensor whenever the first directory entry was not a .wav.
embeddings = torch.full((1,512), torch.nan)
embeddings[0] = embedding

# Repeat the same vector once per dataset row
embeddings = embeddings.repeat(len(df),1)
print(embeddings.shape)
dataset_dict["embeddings"] = embeddings


In [14]:
from datasets import Dataset
# Create the Hugging Face dataset.
# At this point "input_ids" still holds the transcription strings and "labels" the audio
# records (path / array / sampling_rate) built earlier; both are taken over unchanged.
dataset_dict_copy = dict(
    input_ids=dataset_dict["transcription"],
    labels=dataset_dict["audio"],
    speaker_embeddings=dataset_dict["embeddings"],
)
dataset = Dataset.from_dict(dataset_dict_copy)

In [None]:
# Preprocess the dataset (tokenize the text, convert the audio into target features)
def prepare_dataset(example):
    """Run one dataset row through the global ``processor``.

    ``example["input_ids"]`` is passed as ``text`` and the ``"array"`` of
    ``example["labels"]`` as ``audio_target`` at a fixed 16000 Hz sampling
    rate. The returned object is the processor output with ``"labels"``
    reduced to its first entry (as a ``torch.Tensor``) and the row's speaker
    embedding copied over unchanged.
    """
    speaker_emb = example["speaker_embeddings"]
    waveform = example["labels"]["array"]

    processed = processor(
        text=example["input_ids"],
        audio_target=waveform,
        sampling_rate=16000,
        return_attention_mask=False,
    )

    # strip off the batch dimension
    processed["labels"] = torch.Tensor(processed["labels"][0])
    processed["speaker_embeddings"] = speaker_emb
    return processed
# Apply prepare_dataset to every row; the original columns are removed, so the mapped
# dataset keeps only what prepare_dataset returns.
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [None]:
# Filter out sequences with 200 or more tokens (model not ok with that)
def is_not_too_long(input_ids):
    """Return ``True`` when ``input_ids`` has fewer than 200 entries."""
    return len(input_ids) < 200

# Keep only the rows whose "input_ids" pass is_not_too_long, then show how many remain
dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
len(dataset)

In [17]:
# Create test (=validation for this project) set.
# A fixed seed keeps the 90/10 split identical across kernel restarts, so evaluation
# losses from different runs are measured on the same held-out rows.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

## Training

In [18]:
from dataclasses import dataclass
from typing import Any  # annotates the field below; it was used without being imported

# Batch creator
@dataclass
class TTSDataCollatorWithPadding:
    """Collate preprocessed examples into one padded training batch.

    Attributes:
        processor: Object whose ``pad(input_ids=..., labels=..., return_tensors=...)``
            method pads the token ids and target features.

    Calling the collator with a list of examples (each having ``"input_ids"``,
    ``"labels"`` and ``"speaker_embeddings"``) returns the padded batch with:

    * ``"labels"`` set to -100 wherever the decoder attention mask is not 1,
    * ``"decoder_attention_mask"`` removed,
    * ``"labels"`` cut along dim 1 to the longest target length rounded down to a
      multiple of ``model.config.reduction_factor`` (when that factor is > 1),
    * ``"speaker_embeddings"`` stacked into one tensor.

    Note: ``model`` is read from the notebook's global namespace at call time.
    """

    processor: Any

    def __call__(self, features):
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # Collate the inputs and targets into a batch. Use the processor handed to this
        # instance, not whatever global named `processor` happens to exist.
        batch = self.processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # Padded target frames get -100.
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        del batch["decoder_attention_mask"]

        # Round the target lengths down to a multiple of the reduction factor and cut
        # the label tensor to the longest rounded length.
        reduction_factor = model.config.reduction_factor
        if reduction_factor > 1:
            rounded_lengths = [
                len(feature["input_values"]) - len(feature["input_values"]) % reduction_factor
                for feature in label_features
            ]
            max_length = max(rounded_lengths)
            batch["labels"] = batch["labels"][:, :max_length]

        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch
# Collator instance handed to the trainer; it pads with the SpeechT5 processor loaded earlier
data_collator = TTSDataCollatorWithPadding(processor)

In [None]:
# Finally load pretrained model which we are going to fine tune
from transformers import SpeechT5ForTextToSpeech
from functools import partial


# Start from the same pretrained checkpoint the processor was loaded from
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# disable cache during training
model.config.use_cache = False
# ...but keep it on for generation: every call made through this wrapper passes use_cache=True
model.generate = partial(model.generate, use_cache=True)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Set some parameters of the trainer

training_args = Seq2SeqTrainingArguments(
    output_dir="drive/My Drive/audiobooks",
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    max_steps=2100,
    gradient_checkpointing=True,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=4,
    # save_steps equals max_steps, so a single checkpoint ("checkpoint-2100") is written
    # at the very end of training.
    save_steps=2100,
    eval_steps=100,
    logging_steps=25,
    load_best_model_at_end=True,
    greater_is_better=False,
    report_to=[],
    # The collator needs every column produced by prepare_dataset, so keep them all.
    remove_unused_columns=False,
    label_names=["labels"],
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    # `processing_class` is the current name of the deprecated `tokenizer` argument in
    # the transformers versions this notebook installs (>=4.48.0).
    processing_class=processor,
)

In [None]:
# Finally train (fine-tune) the model with the settings configured on the trainer above
trainer.train()

In [None]:
# Test the model on a sentence to hear whether the result is similar to my voice

# The sentence gets the same letter mapping as the training transcripts before tokenizing
final_text = replace_czech_letters(
    "Chvílemi zněla píšťala tak, jako by krysař přece ještě někoho volal.",
    map_czech_letters,
)
inputs = processor(text=final_text, return_tensors="pt")

In [55]:
from transformers import SpeechT5HifiGan # Needed for transfering spectrogram to audio

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Load the fine-tuned model. The trainer in this notebook runs max_steps=2100 with
# save_steps=2100, so "checkpoint-2100" is the only checkpoint it writes;
# "checkpoint-2000" does not exist after a fresh Restart & Run All.
# NOTE(review): change the step number here if the training arguments are changed.
model = SpeechT5ForTextToSpeech.from_pretrained(
    "drive/My Drive/audiobooks/checkpoint-2100"
)

# The first row of the test split supplies the speaker embedding
example = dataset["test"]
speaker_embeddings = torch.tensor(example["speaker_embeddings"])[:1]
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder) # Generate my voice speech


## Test: AI-generated voice vs. my reference voice

In [None]:
from IPython.display import Audio

# My voice as generated by the neural network; played back at 16000 Hz
Audio(speech.numpy(), rate=16000)

In [None]:
from IPython.display import Audio
from scipy.io import wavfile

# Read the reference recording; the file supplies its own sampling rate
rate, data = wavfile.read("drive/My Drive/audiobooks/label.wav")
# Reference audio
Audio(data, rate=rate)
