 **Environment Setup**

Install Required Libraries

In [None]:
!pip install transformers datasets torch torchaudio


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:0

**Model and Dataset Preparation**

Load the Pre-trained Model

In [None]:
from transformers import SpeechT5ForTextToSpeech, SpeechT5Processor

# Load pre-trained model and processor
model = SpeechT5ForTextToSpeech.from_pretrained('microsoft/speecht5_tts')
processor = SpeechT5Processor.from_pretrained('microsoft/speecht5_tts')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]



Dataset

In [None]:
import pandas as pd
from datasets import Dataset

# Load your dataset from Excel file using pd.read_excel
df = pd.read_excel('/content/drive/MyDrive/DATASET FOR TTS/TTS-English.xlsx')
dataset = Dataset.from_pandas(df)

Dataset Preprocessing

In [None]:
def preprocess_function(examples):
    # Ensure 'Text Example' is always a list of strings
    texts = examples['Text Example']

    # Check if 'texts' is already a list of strings
    if all(isinstance(t, str) for t in texts):
        pass  # Already in the correct format

    # If 'texts' is not a list of strings, attempt to convert it
    else:
        # Handle potential nested lists or empty lists
        texts = [str(t[0]) if isinstance(t, list) and len(t) > 0 else str(t) for t in texts]

        # If there are any non-string elements after conversion, raise an error with specifics
        non_string_elements = [t for t in texts if not isinstance(t, str)]
        if non_string_elements:
            raise ValueError(f"Non-string elements found in 'Text Example' after conversion: {non_string_elements}")

    inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True)

    # Convert each element in 'phonetic_transcription' to a string
    phonetic_transcriptions = [str(x) for x in examples['phonetic_transcription']]

    # Tokenize and add padding/truncation to 'phonetic_transcription'
    labels = processor.tokenizer(
        phonetic_transcriptions,
        return_tensors="pt",
        padding=True,  # Add padding to make all sequences the same length
        truncation=True  # Truncate sequences exceeding the maximum length
    )['input_ids']

    inputs['labels'] = labels
    return inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

**Fine-tuning the Model**

Set Training Arguments

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./speechT5_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,  # You can increase epochs if needed
    logging_dir='./logs',
    learning_rate=5e-5,
    save_steps=500,  # Saves the model every 500 steps
    evaluation_strategy="steps"
)




Initialize Trainer

In [None]:
import wandb
from transformers import Trainer, TrainingArguments

# Initialize WandB
wandb.login()

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    report_to="wandb",  # Enables logging to WandB
    ...

)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Optional: you can create a separate evaluation set
    tokenizer=processor.tokenizer
)

# Start training
trainer.train()


**Evaluation**

Generate Speech from Text

In [None]:
!pip install --upgrade torchaudio
!ffmpeg -i input_file.mp3 -ar 24000 -ac 1 -f s16 output_file.wav

Collecting torchaudio
  Downloading torchaudio-2.5.0-cp310-cp310-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting torch==2.5.0 (from torchaudio)
  Downloading torch-2.5.0-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.0->torchaudio)
  Downloading 

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [None]:
import torchaudio
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Test input: A sentence with technical terms
test_sentence = "The API for CUDA is very efficient."

# Load the processor and model
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Pass the sentence as a keyword argument 'text'
inputs = processor(text=test_sentence, return_tensors="pt")

# Create dummy speaker embeddings (replace with actual embeddings if available)
speaker_embeddings = torch.zeros((1, 512))  # Make sure this shape matches the model's expectations

# Generate speech, including speaker embeddings
with torch.no_grad():  # Disable gradient calculation for inference
    generated_speech = model.generate(
        **inputs, speaker_embeddings=speaker_embeddings
    )

# Check the shape of the generated speech
print("Shape of generated speech:", generated_speech.shape)

# Check the data type of the generated speech
print("Data type of generated speech:", generated_speech.dtype)

# Convert to int16 format and scale appropriately
#generated_speech = generated_speech.cpu().type(torch.int16) #No need to cast to int16

# Reshape the tensor if necessary
# Assuming generated_speech has shape (100, 80) and you want mono audio
#generated_speech = generated_speech.view(-1) # Reshape to a 1D tensor
# Instead of view(-1), Reshape using view(1, -1) to add the channel dimension:

generated_speech = generated_speech.view(1, -1)  # Reshape to [channels, time]

# Save the generated speech with the appropriate sample rate
torchaudio.save(
    'generated_speech.wav',
    generated_speech.type(torch.float32),  # Save as float32
    24000,
    format='wav',
    #bits_per_sample=16,
    channels_first=True  # Set channels_first to True
)

print("Audio saved as 'generated_speech.wav'")



Shape of generated speech: torch.Size([104, 80])
Data type of generated speech: torch.float32
Audio saved as 'generated_speech.wav'


In [None]:
!pip install git+https://github.com/suno-ai/bark

Collecting git+https://github.com/suno-ai/bark
  Cloning https://github.com/suno-ai/bark to /tmp/pip-req-build-p0vtnwnr
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark /tmp/pip-req-build-p0vtnwnr
  Resolved https://github.com/suno-ai/bark to commit f4f32d4cd480dfec1c245d258174bc9bde3c2148
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting boto3 (from suno-bark==0.0.1a0)
  Downloading boto3-1.35.43-py3-none-any.whl.metadata (6.7 kB)
Collecting encodec (from suno-bark==0.0.1a0)
  Downloading encodec-0.1.1.tar.gz (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting funcy (from suno-bark==0.0.1a0)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting botocore<1

Generate Audio from Text

In [None]:
from bark import generate_audio, SAMPLE_RATE
import numpy as np
import soundfile as sf

# Define the text you want to convert into speech
text_prompt = "Hello, this is a test using the Bark library to generate speech!"

# Generate audio from text
audio_array = generate_audio(text_prompt)

# Save the audio to a WAV file
sf.write("output.wav", audio_array, SAMPLE_RATE)

print("Audio generated and saved as output.wav")




text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

  checkpoint = torch.load(ckpt_path, map_location=device)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)
Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:01<00:00, 64.2MB/s]
100%|██████████| 527/527 [02:16<00:00,  3.85it/s]
100%|██████████| 27/27 [11:42<00:00, 26.01s/it]


Audio generated and saved as output.wav


In [None]:
import IPython.display as ipd
ipd.Audio("output.wav")

In [None]:
import pandas as pd
import re
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load the Excel file (update the file path if needed)
file_path = '/content/drive/MyDrive/DATASET FOR TTS/TTS-English.xlsx'
df = pd.read_excel(file_path)

# Extract the 'Text Example' column as a list
dataset = df['Text Example'].tolist()

# List of technical terms to look for in the dataset
technical_terms = ['API', 'CUDA', 'TensorFlow', 'PyTorch']

# Function to extract sentences with technical terms
def extract_sentences_with_terms(dataset, technical_terms):
    extracted_sentences = []
    for sentence in dataset:
        # Convert sentence to string if it's not already
        if not isinstance(sentence, str):
            sentence = str(sentence)
        for term in technical_terms:
            if re.search(r'\b' + term + r'\b', sentence):
                extracted_sentences.append(sentence)
                break
    return extracted_sentences

# Extract sentences containing technical terms
extracted_sentences = extract_sentences_with_terms(dataset, technical_terms)

# Display the extracted sentences
print(extracted_sentences)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['When you connect to an external service, make sure the API returns the correct status codes.', 'Developers need to understand the differences between REST and GraphQL for effective API design.', 'He explained how API gateways handle incoming requests and direct them to microservices.', ' The API call failed due to incorrect authentication.', 'In an interview, they asked me to design a RESTful API from scratch.', 'Understanding API rate limits is crucial when building applications that rely on external services.', 'Training the neural network on a GPU using CUDA reduced the training time by 50%.', ' The algorithm runs faster when parallelized across multiple CUDA cores.', 'To speed up the model inference, we moved from CPU computation to CUDA enabled GPUs.', 'The API returns a JSON response that needs to be parsed before displaying the data on the frontend.'

Generate Speech using Bark

In [None]:
!pip install soundfile




In [4]:
!pip install git+https://github.com/suno-ai/bark.git
!pip install soundfile
import pandas as pd
import re
from google.colab import drive
from bark import generate_audio, SAMPLE_RATE
import soundfile as sf

# Mount Google Drive
drive.mount('/content/drive')

# Load the Excel file (update the file path if needed)
file_path = '/content/drive/MyDrive/DATASET FOR TTS/TTS-English.xlsx'
df = pd.read_excel(file_path)

# Extract the 'Text Example' column as a list
dataset = df['Text Example'].tolist()

# List of technical terms to look for in the dataset
technical_terms = ['API', 'CUDA', 'TensorFlow', 'PyTorch']

# Function to extract sentences with technical terms
def extract_sentences_with_terms(dataset, technical_terms):
    extracted_sentences = []
    for sentence in dataset:
        # Convert sentence to string if it's not already
        if not isinstance(sentence, str):
            sentence = str(sentence)
        for term in technical_terms:
            if re.search(r'\b' + term + r'\b', sentence):
                extracted_sentences.append(sentence)
                break
    return extracted_sentences

# Extract sentences containing technical terms
extracted_sentences = extract_sentences_with_terms(dataset, technical_terms)

# Display the extracted sentences
print(extracted_sentences)

# Generate and save audio for up to 5 sentences
for i, sentence in enumerate(extracted_sentences[:5]):  # Limit to 5 sentences
    audio = generate_audio(sentence) # Generate audio for the sentence using bark
    file_name = f"sentence_{i+1}.wav" # Create a file name for the audio
    sf.write(file_name, audio, SAMPLE_RATE) # Save the audio to a file using soundfile
    print(f"Generated and saved: {file_name}") # Print a confirmation message

Collecting git+https://github.com/suno-ai/bark.git
  Cloning https://github.com/suno-ai/bark.git to /tmp/pip-req-build-ztiiqq10
  Running command git clone --filter=blob:none --quiet https://github.com/suno-ai/bark.git /tmp/pip-req-build-ztiiqq10
  Resolved https://github.com/suno-ai/bark.git to commit f4f32d4cd480dfec1c245d258174bc9bde3c2148
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['When you connect to an external service, make sure the API returns the correct status codes.', 'Developers need to understand the differences between REST and GraphQL for effective API design.', 'He explained how API gateways handle incoming requests and direct them to microservices.', ' The API call failed due to incorrect authentication.', 'In an interview, they asked me to design a RESTful API from scratch.', 'Understanding API rate limits is crucial when building applications that rely on external services.', 'Training the neural network on a GPU using CUDA reduced the training time by 50%.', ' The algorithm runs faster when parallelized across multiple CUDA cores.', 'To speed up the model inference, we moved from CPU computation to CUDA enabled GPUs.', 'The API returns a JSON response that needs to be parsed before displaying the data on the frontend.'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

  checkpoint = torch.load(ckpt_path, map_location=device)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]



coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)
Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:01<00:00, 86.6MB/s]
100%|██████████| 511/511 [01:36<00:00,  5.28it/s]
100%|██████████| 26/26 [08:51<00:00, 20.44s/it]


Generated and saved: sentence_1.wav


100%|██████████| 340/340 [01:04<00:00,  5.27it/s]
100%|██████████| 18/18 [05:40<00:00, 18.90s/it]


Generated and saved: sentence_2.wav


100%|██████████| 378/378 [01:09<00:00,  5.43it/s]
100%|██████████| 19/19 [06:23<00:00, 20.18s/it]


Generated and saved: sentence_3.wav


100%|██████████| 178/178 [00:33<00:00,  5.38it/s]
100%|██████████| 9/9 [02:26<00:00, 16.30s/it]


Generated and saved: sentence_4.wav


100%|██████████| 431/431 [01:18<00:00,  5.51it/s]
100%|██████████| 22/22 [07:23<00:00, 20.16s/it]


Generated and saved: sentence_5.wav


In [7]:
import pandas as pd
from google.colab import drive
import torchaudio
import IPython.display as ipd
import torch # Import torch explicitly

# Load the Excel file
file_path = '/content/drive/MyDrive/DATASET FOR TTS/TTS-English.xlsx'
df = pd.read_excel(file_path)

# Extract the 'Text Example' column as a list
dataset = df['Text Example'].tolist()

# Check if the 5th sentence exists in the dataset
sentence_index = 4  # Index for the 5th sentence (0-based indexing)
if len(dataset) > sentence_index:
    sentence = dataset[sentence_index]
    print(f"Selected sentence: {sentence}")

    # Load Bark and generate the audio
    from bark import generate_audio

    # Generate audio for the selected sentence
    audio_array = generate_audio(sentence)

    # Convert audio_array to a PyTorch tensor
    audio_array = torch.tensor(audio_array) # convert audio_array to tensor
    # If audio_array is 1D, add a dimension for channels
    if audio_array.dim() == 1:
        audio_array = audio_array.unsqueeze(0)  # Add channel dimension


    # Save the generated audio as a WAV file
    output_file = '/content/sentence_5.wav'
    torchaudio.save(output_file, audio_array, sample_rate=24000)

    # Play the generated audio
    ipd.display(ipd.Audio(output_file))

    print(f"Audio saved at: {output_file}")
else:
    print("The dataset doesn't have 5 sentences.")

Selected sentence: Integrating OAuth 2.0 with the existing system was a challenge, but it significantly improved security.


100%|██████████| 434/434 [01:25<00:00,  5.07it/s]
100%|██████████| 22/22 [07:20<00:00, 20.03s/it]


Audio saved at: /content/sentence_5.wav


**Evaluate Speech Quality**

Objective Metric: Mean Opinion Score (MOS)

In [8]:
# Example MOS collection
mos_scores = {
    'sentence_1.wav': [4, 5, 4],
    'sentence_2.wav': [3, 4, 4],
    # Add more scores for each audio sample
}

# Calculate the average MOS score for each audio
average_mos = {file: sum(scores)/len(scores) for file, scores in mos_scores.items()}
print(average_mos)

{'sentence_1.wav': 4.333333333333333, 'sentence_2.wav': 3.6666666666666665}
