In [None]:
!pip install transformers datasets soundfile accelerate speechbrain==0.5.16

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting speechbrain==0.5.16
  Downloading speechbrain-0.5.16-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain==0.5.16)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9->speechbrain==0.5.16)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Co

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_dataset, Audio
dataset = load_dataset("keithito/lj_speech")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'audio', 'file', 'text', 'normalized_text'],
        num_rows: 13100
    })
})

In [54]:
dataset = dataset["train"]
len(dataset)

13100

In [55]:
# Calculate the number of rows for half the dataset
half_size = len(dataset) //5

# Select the first half of the dataset
dataset = dataset.select(range(half_size))

print(dataset)

Dataset({
    features: ['id', 'audio', 'file', 'text', 'normalized_text'],
    num_rows: 2620
})


We are using just the 1/16th of the data

In [56]:
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [57]:
from transformers import SpeechT5Processor

checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)


In [58]:
tokenizer = processor.tokenizer

In [None]:
!pip install librosa soundfile



In [None]:
dataset[2:5]

{'id': ['LJ001-0003', 'LJ001-0004', 'LJ001-0005'],
 'audio': [{'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0003.wav',
   'array': array([-0.00404182, -0.00031247, -0.00158895, ...,  0.00022209,
           0.00031626, -0.00021122]),
   'sampling_rate': 16000},
  {'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0004.wav',
   'array': array([ 1.84258504e-04, -6.89462904e-05, -2.43774717e-04, ...,
          -5.48143697e-04, -6.62148464e-04, -6.06632268e-04]),
   'sampling_rate': 16000},
  {'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0005.wav',
   'array': array([0.00032222, 0.00043797, 0.00043948, ..., 0.00040887, 0.00078317,
          0.        ]),
   'sampling_rate': 16000}

Let's normalize the dataset, create a column called "normalized_text"

In [59]:
def extract_all_chars(batch):
    all_text = " ".join(batch["text"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}

Map:   0%|          | 0/2620 [00:00<?, ? examples/s]

In [60]:
dataset_vocab - tokenizer_vocab

{' ', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '£'}

In [61]:
def extract_all_chars(batch):
    all_text = " ".join(batch["normalized_text"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}


vocabs = dataset.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=dataset.column_names,
)

dataset_vocab = set(vocabs["vocab"][0])
tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}

Map:   0%|          | 0/2620 [00:00<?, ? examples/s]

In [62]:
dataset_vocab - tokenizer_vocab

{' '}

In [63]:
replacements = [
    ("â", "a"),  # Long a
    ("ç", "ch"),  # Ch as in "chair"
    ("ğ", "gh"),  # Silent g or slight elongation of the preceding vowel
    ("ı", "i"),   # Dotless i
    ("î", "i"),   # Long i
    ("ö", "oe"),  # Similar to German ö
    ("ş", "sh"),  # Sh as in "shoe"
    ("ü", "ue"),  # Similar to German ü
    ("û", "u"),   # Long u
]

def cleanup_text(inputs):
    for src, dst in replacements:
        inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst)
    return inputs

dataset = dataset.map(cleanup_text)

In [64]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

device = "cuda" if torch.cuda.is_available() else "cpu"
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    run_opts={"device": device},
    savedir=os.path.join("/tmp", spk_model_name),
)


def create_speaker_embedding(waveform):
    with torch.no_grad():
        speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
        speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
    return speaker_embeddings

In [65]:
def prepare_dataset(example):
    audio = example["audio"]

    example = processor(
        text=example["normalized_text"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # strip off the batch dimension
    example["labels"] = example["labels"][0]

    # use SpeechBrain to obtain x-vector
    example["speaker_embeddings"] = create_speaker_embedding(audio["array"])

    return example

In [66]:
processed_example = prepare_dataset(dataset[0])
list(processed_example.keys())

['input_ids', 'labels', 'speaker_embeddings']

In [67]:
processed_example["speaker_embeddings"].shape

(512,)

In [68]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

In [69]:
def is_not_too_long(input_ids):
    input_length = len(input_ids)
    return input_length < 200

dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
len(dataset)

2620

In [70]:
train_test = dataset.train_test_split(test_size=0.3)  # 70% train, 30% temp
val_test = train_test["test"].train_test_split(test_size=0.5)  # Split temp into 15% val, 15% test

dataset = {
    "train": train_test["train"],
    "validation": val_test["train"],
    "test": val_test["test"]
}
print(dataset)

{'train': Dataset({
    features: ['input_ids', 'labels', 'speaker_embeddings'],
    num_rows: 1834
}), 'validation': Dataset({
    features: ['input_ids', 'labels', 'speaker_embeddings'],
    num_rows: 393
}), 'test': Dataset({
    features: ['input_ids', 'labels', 'speaker_embeddings'],
    num_rows: 393
})}


In [71]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class TTSDataCollatorWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
        label_features = [{"input_values": feature["labels"]} for feature in features]
        speaker_features = [feature["speaker_embeddings"] for feature in features]

        # collate the inputs and targets into a batch
        batch = processor.pad(
            input_ids=input_ids, labels=label_features, return_tensors="pt"
        )

        # replace padding with -100 to ignore loss correctly
        batch["labels"] = batch["labels"].masked_fill(
            batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100
        )

        # not used during fine-tuning
        del batch["decoder_attention_mask"]

        # round down target lengths to multiple of reduction factor
        if model.config.reduction_factor > 1:
            target_lengths = torch.tensor(
                [len(feature["input_values"]) for feature in label_features]
            )
            target_lengths = target_lengths.new(
                [
                    length - length % model.config.reduction_factor
                    for length in target_lengths
                ]
            )
            max_length = max(target_lengths)
            batch["labels"] = batch["labels"][:, :max_length]

        # also add in the speaker embeddings
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch

In [72]:
data_collator = TTSDataCollatorWithPadding(processor=processor)

In [73]:
from transformers import SpeechT5ForTextToSpeech

model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

In [74]:
from functools import partial

# disable cache during training since it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache
model.generate = partial(model.generate, use_cache=True)

In [29]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="speecht5_finetuned_madhav",  # change to a repo name of your choice
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    warmup_steps=100,
    max_steps=229,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    greater_is_better=False,
    label_names=["labels"],
    push_to_hub=False,
)



In [27]:
num_epochs = 4  # Change this as needed
dataset_size = len(dataset["train"])  # Get number of training samples
effective_batch_size = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps

max_steps = (num_epochs * dataset_size) // effective_batch_size
print(f"Calculated max_steps: {max_steps}")


Calculated max_steps: 229


In [30]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_test["train"],
    eval_dataset=val_test["train"],
    data_collator=data_collator,
    tokenizer=processor,
)

  trainer = Seq2SeqTrainer(


In [31]:
trainer.train()

Step,Training Loss,Validation Loss
100,0.4555,0.413252
200,0.428,0.39303




TrainOutput(global_step=229, training_loss=0.47912290679315295, metrics={'train_runtime': 488.0467, 'train_samples_per_second': 15.015, 'train_steps_per_second': 0.469, 'total_flos': 879196245668928.0, 'train_loss': 0.47912290679315295, 'epoch': 4.017429193899782})

In [49]:
#Load Dataset from scratch for testing.

In [50]:
import os
import torch
import numpy as np
import librosa
import soundfile as sf
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan
from pesq import pesq  # Make sure to install pypesq for PESQ calculation
import torchaudio
import torchaudio.transforms as T

# Load the models
tts_model = SpeechT5ForTextToSpeech.from_pretrained("speecht5_finetuned_madhav/checkpoint-229")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Transcription function for WER computation using Wav2Vec2
def transcribe_audio(file_path):
    from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Resample if the sample rate is not 16000
    if sample_rate != 16000:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Transcribe the audio
    inputs = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000)
    with torch.no_grad():
        logits = model(input_values=inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)

    return transcription[0]

# Function to compute WER (Word Error Rate) manually
def compute_wer(reference_audio_path, predicted_audio_path):
    reference_transcription = transcribe_audio(reference_audio_path)
    predicted_transcription = transcribe_audio(predicted_audio_path)

    # Compute WER manually (Levenshtein distance)
    reference_words = reference_transcription.split()
    predicted_words = predicted_transcription.split()

    distance = levenshtein_distance(reference_words, predicted_words)
    wer_score = distance / len(reference_words)
    return wer_score

# Levenshtein distance for computing WER
def levenshtein_distance(ref, pred):
    m = len(ref) + 1
    n = len(pred) + 1
    matrix = np.zeros((m, n))
    for i in range(m):
        matrix[i][0] = i
    for j in range(n):
        matrix[0][j] = j
    for i in range(1, m):
        for j in range(1, n):
            cost = 0 if ref[i-1] == pred[j-1] else 1
            matrix[i][j] = min(matrix[i-1][j] + 1,
                               matrix[i][j-1] + 1,
                               matrix[i-1][j-1] + cost)
    return matrix[m-1][n-1]

# Function to compute Mel Cepstral Distortion (MCD)
import librosa.util

def compute_mcd(ref_wav, pred_wav):
    ref, _ = librosa.load(ref_wav, sr=16000)
    pred, _ = librosa.load(pred_wav, sr=16000)

    mel_ref = librosa.feature.melspectrogram(y=ref, sr=16000, n_mels=23)
    mel_pred = librosa.feature.melspectrogram(y=pred, sr=16000, n_mels=23)

    # Align shapes by trimming or padding
    min_frames = min(mel_ref.shape[1], mel_pred.shape[1])
    mel_ref = mel_ref[:, :min_frames]
    mel_pred = mel_pred[:, :min_frames]

    # Compute MCD
    mcd = np.sqrt(np.sum((mel_ref - mel_pred) ** 2) / mel_ref.size)
    return mcd

# Function to compute PESQ score
def compute_pesq(ref_wav, pred_wav):
    ref, _ = librosa.load(ref_wav, sr=16000, mono=True)
    pred, _ = librosa.load(pred_wav, sr=16000, mono=True)

    # PESQ requires float32 numpy arrays
    ref = ref.astype(np.float32)
    pred = pred.astype(np.float32)

    # Use narrowband PESQ (for 8kHz) or wideband PESQ (for 16kHz)
    score = pesq(16000, ref, pred, 'wb')  # 'wb' = Wideband PESQ for 16kHz audio

    return score


from transformers import SpeechT5Processor

# Load processor (same one used in training)
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

def evaluate_tts_model(test_set):
    total_wer = 0
    total_mcd = 0
    total_pesq = 0
    num_samples = len(test_set)

    results = []

    for idx, sample in enumerate(test_set):
        text_input = sample["normalized_text"]
        reference_audio_path = sample["file"]  # Get path to reference .wav file

        # Generate input_ids for the model
        inputs = processor(text=text_input, return_tensors="pt")

        # Use SpeechBrain to compute speaker embeddings
        speaker_embeddings = create_speaker_embedding(sample["audio"]["array"])
        speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)  # Ensure correct shape

        # Generate speech using the model
        generated_speech = tts_model.generate_speech(
            inputs["input_ids"], speaker_embeddings, vocoder=vocoder
        )

        # Save generated speech
        predicted_audio_path = f"predicted_{sample['id']}.wav"
        sf.write(predicted_audio_path, generated_speech.numpy(), 16000)

        # Compute evaluation metrics
        wer_score = compute_wer(reference_audio_path, predicted_audio_path)
        mcd_score = compute_mcd(reference_audio_path, predicted_audio_path)
        pesq_score = compute_pesq(reference_audio_path, predicted_audio_path)

        # Store results
        results.append({"ID": sample["id"], "WER": wer_score, "MCD": mcd_score, "PESQ": pesq_score})
        total_wer += wer_score
        total_mcd += mcd_score
        total_pesq += pesq_score

        print(f"Processed sample {idx + 1}/{num_samples}")

    # Compute average scores
    avg_wer = total_wer / num_samples
    avg_mcd = total_mcd / num_samples
    avg_pesq = total_pesq / num_samples

    print(f"Average WER: {avg_wer:.4f}")
    print(f"Average MCD: {avg_mcd:.4f}")
    print(f"Average PESQ: {avg_pesq:.4f}")

    return results, avg_wer, avg_mcd, avg_pesq
test_set = list(val_test["test"])[:10]
results, avg_wer, avg_mcd, avg_pesq = evaluate_tts_model(test_set)

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/50.6M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/162 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 1/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 2/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 3/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 4/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 5/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 6/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 7/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 8/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 9/10


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed sample 10/10
Average WER: 0.0222
Average MCD: 6.6273
Average PESQ: 1.0897


# Inference

In [51]:
from transformers import SpeechT5ForTextToSpeech
model = SpeechT5ForTextToSpeech.from_pretrained(
    "speecht5_finetuned_madhav/checkpoint-229"
)

In [75]:
example = val_test["test"][1]
speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

In [76]:
text = "A team of scientists has sent a camera down a seam in the Pacific Ocean in an attempt to see what marine life there is in the deep Pacific Ocean - and so far they've seen fish, jellyfish, mud volcanoes and deep sea coral, among other interesting creatures."

In [77]:
import re

# Dictionary for number-to-word conversion
number_words = {
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
    10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen", 15: "fifteen", 16: "sixteen",
    17: "seventeen", 18: "eighteen", 19: "nineteen", 20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety", 100: "hundred", 1000: "thousand"
}

def number_to_words(number):
    if number < 20:
        return number_words[number]
    elif number < 100:
        tens, unit = divmod(number, 10)
        return number_words[tens * 10] + ("-" + number_words[unit] if unit else "")
    elif number < 1000:
        hundreds, remainder = divmod(number, 100)
        return (number_words[hundreds] + " hundred" if hundreds > 0 else "") + (" " + number_to_words(remainder) if remainder else "")
    elif number < 1000000:
        thousands, remainder = divmod(number, 1000)
        return number_to_words(thousands) + " thousand" + (" " + number_to_words(remainder) if remainder else "")
    elif number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        return number_to_words(millions) + " million" + (" " + number_to_words(remainder) if remainder else "")
    elif number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        return number_to_words(billions) + " billion" + (" " + number_to_words(remainder) if remainder else "")
    else:
        return str(number)

def replace_numbers_with_words(text):
    def replace(match):
        number = int(match.group())
        return number_to_words(number)

    # Replace numbers with words
    result = re.sub(r'\b\d+\b', replace, text)

    return result


In [78]:
def normalize_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation (except apostrophes)
    text = re.sub(r'[^\w\s\']', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

In [79]:
# Function to clean up text using the replacement pairs
def cleanup_text(text):
    for src, dst in replacements:
        text = text.replace(src, dst)
    return text

In [80]:
converted_text = replace_numbers_with_words(text)
cleaned_text = cleanup_text(converted_text)
final_text = normalize_text(cleaned_text)
final_text

"a team of scientists has sent a camera down a seam in the pacific ocean in an attempt to see what marine life there is in the deep pacific ocean and so far they've seen fish jellyfish mud volcanoes and deep sea coral among other interesting creatures"

In [81]:
def split_text(text, max_length=15):
    words = text.split()
    chunks = []
    for i in range(0, len(words), max_length):
        chunk = " ".join(words[i:i+max_length])
        chunks.append(chunk)
    return chunks

In [82]:
text_chunks = split_text(final_text)

In [83]:
from transformers import SpeechT5HifiGan
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
for part_idx, chunk in enumerate(text_chunks):
    # Convert text to input tensors
    inputs = processor(text=chunk, return_tensors="pt")

    # Generate speech using the model and vocoder
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # Save each chunk separately
    filename = f'predicted_summary_part{part_idx}.wav'
    sf.write(filename, speech.numpy(), 16000)

    print(f"Generated speech for part {part_idx}: {chunk[:30]}...")

Generated speech for part 0: a team of scientists has sent ...
Generated speech for part 1: in an attempt to see what mari...
Generated speech for part 2: and so far they've seen fish j...
Generated speech for part 3: interesting creatures...


In [85]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [86]:
from pydub import AudioSegment

# Initialize an empty audio segment
final_audio = AudioSegment.silent(duration=0)

# Combine all generated speech parts
for part_idx in range(len(text_chunks)):  # Number of parts generated
    filename = f'predicted_summary_part{part_idx}.wav'
    audio_segment = AudioSegment.from_wav(filename)
    final_audio += audio_segment  # Append each part to the final output

# Export the merged audio to a single .wav file
final_audio.export("final_predicted_summary.wav", format="wav")

print("Final combined speech saved as 'final_predicted_summary.wav'")


Final combined speech saved as 'final_predicted_summary.wav'


In [88]:
!zip -r output.zip /content/speecht5_finetuned_madhav/checkpoint-229

  adding: content/speecht5_finetuned_madhav/checkpoint-229/ (stored 0%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/trainer_state.json (deflated 67%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/optimizer.pt (deflated 8%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/rng_state.pth (deflated 25%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/generation_config.json (deflated 34%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/training_args.bin (deflated 51%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/model.safetensors (deflated 7%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/tokenizer_config.json (deflated 78%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/scheduler.pt (deflated 56%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/spm_char.model (deflated 42%)
  adding: content/speecht5_finetuned_madhav/checkpoint-229/added_tokens.json (deflated 13%)
  adding: content

In [90]:
from google.colab import files
files.download("output.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [91]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [92]:
!mv output.zip /content/drive/MyDrive/