In [1]:
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio, display
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from sklearn.metrics import accuracy_score
import numpy as np



In [2]:
# Load pre-trained models
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [3]:
# Load xvector containing speaker's voice characteristics
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

In [4]:
# Synthesize speech
text = "Hello, this is a test of text-to-speech synthesis using a pre-trained model."
inputs = processor(text=text, return_tensors="pt")

In [5]:
# Generate speech
speech = model.generate_speech(inputs["input_ids"], speaker_embedding, vocoder=vocoder)

In [6]:
# Save the audio
sf.write("output.wav", speech.numpy(), samplerate=16000)

In [7]:
# Display the audio
display(Audio("output.wav", autoplay=True))
print("Audio saved as 'output.wav' and should be playing now.")

Audio saved as 'output.wav' and should be playing now.


In [8]:
# Step 2: Load ASR Model for Transcription
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [9]:
# Transcribe the generated audio
audio_input, _ = sf.read("output.wav")
inputs = asr_processor(audio_input, return_tensors="pt", padding="longest")
with torch.no_grad():
    logits = asr_model(inputs.input_values).logits
predicted_ids = torch.argmax(logits, dim=-1)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
2024-10-15 15:41:11.261031: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-15 15:41:11.275889: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-15 15:41:11.280237: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-15 15:41:11.291964: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the follow

In [10]:
# Decode the predicted ids to text
transcribed_text = asr_processor.batch_decode(predicted_ids)[0]
print(f"Transcribed Text: {transcribed_text}")

Transcribed Text: HELLO THIS IS A TEST OF TEXT TO SPEECH SYNTHESIS USING A PRETRAINED MODEL


In [11]:
# Step 3: Calculate Accuracy
# Lowercase both original and transcribed texts for case-insensitive comparison
original_words = text.lower().split()
transcribed_words = transcribed_text.lower().split()

In [12]:
# Calculate accuracy as the ratio of correct words
correct_words = sum(1 for word in transcribed_words if word in original_words)
accuracy = correct_words / len(original_words) * 100 if original_words else 0


In [13]:
print(f"Original Text: '{text}'")
print(f"Transcribed Text: '{transcribed_text}'")
print(f"Accuracy: {accuracy:.2f}%")

Original Text: 'Hello, this is a test of text-to-speech synthesis using a pre-trained model.'
Transcribed Text: 'HELLO THIS IS A TEST OF TEXT TO SPEECH SYNTHESIS USING A PRETRAINED MODEL'
Accuracy: 66.67%
