In [None]:
from transformers import SpeechT5ForSpeechToText, SpeechT5Processor

# The processor and the model are both loaded from the same ASR checkpoint.
asr_checkpoint = "microsoft/speecht5_asr"
processor = SpeechT5Processor.from_pretrained(asr_checkpoint)
model = SpeechT5ForSpeechToText.from_pretrained(asr_checkpoint)

# Turn the raw audio into PyTorch model inputs.
inputs = processor(audio=array, sampling_rate=sampling_rate, return_tensors="pt")

# Inference only: no gradients are needed while generating token ids.
with torch.no_grad():
    predicted_ids = model.generate(**inputs, max_new_tokens=200)

# Decode the generated ids back to text, dropping special tokens.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

['chapter sixteen i might have told you of the beginning i might have told you of the beginning of the beginning of the beginning of the beginning of the beginning chapter sixteen']

In [None]:
from transformers import SpeechT5ForTextToSpeech

# Processor and model are loaded from the same text-to-speech checkpoint.
tts_checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)

inputs = processor(text="There are llamas all around.", return_tensors="pt")

# Take one speaker x-vector from the dataset and add a leading dimension.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
xvector = embeddings_dataset[7440]["xvector"]
speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

# Without a vocoder argument the call below yields a spectrogram tensor.
with torch.no_grad():
    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
spectrogram

tensor([[-3.6003, -3.6643, -3.7114,  ..., -4.5534, -4.5948, -4.7614],
        [-3.3392, -3.4059, -3.4096,  ..., -4.4603, -4.4640, -4.7025],
        [-2.9305, -3.0289, -3.0103,  ..., -4.1975, -4.2770, -4.5122],
        ...,
        [-3.2018, -3.4044, -3.5426,  ..., -4.5240, -4.5532, -4.7528],
        [-3.2666, -3.4596, -3.5791,  ..., -4.5361, -4.5633, -4.7589],
        [-3.2899, -3.4900, -3.6032,  ..., -4.5457, -4.5672, -4.7564]])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Convert spectrogram tensor to numpy array
spectrogram_np = spectrogram.cpu().numpy()

# Display the spectrogram using matplotlib.
# The spectrogram printed above is a 2-D tensor, so indexing it with [0] would
# leave a single 1-D row, which imshow cannot render as an image. Plot the
# whole array instead. It is transposed on the assumption that rows are time
# frames and columns are mel bins (the layout documented for
# SpeechT5 generate_speech), so that time runs along the x axis.
plt.figure(figsize=(10, 5))
plt.imshow(spectrogram_np.T, aspect='auto', origin='lower', cmap='inferno')
plt.colorbar(label='Intensity')
plt.title('Spectrogram')
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.show()

![Spectrogram of the generated speech](spectogram.png)

In [None]:
from transformers import SpeechT5HifiGan

# Load the pretrained HiFi-GAN vocoder checkpoint.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
with torch.no_grad():
    # Run the vocoder on the spectrogram produced earlier; gradients are not
    # needed for inference.
    # Alternatively, pass the vocoder straight to generate_speech:
    # model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    speech = vocoder(spectrogram)
# Bare last expression so the notebook displays the resulting tensor.
speech

tensor([-4.7037e-05,  2.6946e-05,  1.9352e-05,  ..., -1.9595e-04,
        -1.6578e-04, -2.3303e-04])

In [None]:
import numpy as np
import soundfile as sf

# Drop singleton dimensions and hand the waveform to NumPy on the CPU.
speech_numpy = speech.squeeze().cpu().numpy()

# Write the waveform to disk as a WAV file at 16 kHz.
sf.write("output.wav", speech_numpy, samplerate=16000)

# Render an inline player for the saved file (needs IPython).
from IPython.display import Audio
Audio("output.wav")

In [None]:
from transformers import VitsModel, VitsTokenizer, set_seed

# Tokenizer and model are loaded from the same MMS-TTS checkpoint.
mms_checkpoint = "facebook/mms-tts-eng"
tokenizer = VitsTokenizer.from_pretrained(mms_checkpoint)
model = VitsModel.from_pretrained(mms_checkpoint)

inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

# Fix the seed right before the forward pass so repeated runs match.
set_seed(555)
with torch.no_grad():
    outputs = model(inputs["input_ids"])

# Show the first generated waveform as the cell output.
outputs.waveform[0]

In [None]:
# Extract the waveform tensor and move it to CPU
waveform_tensor = outputs.waveform[0].cpu()

# Convert the tensor to a NumPy array
waveform_numpy = waveform_tensor.numpy()

# Save the NumPy array as a WAV file.
# The rate is read from the VITS model config instead of being hard-coded, so
# the file header always matches what the checkpoint generates. (The previous
# comment mentioned 22050 Hz while the code wrote 16000 Hz.)
# Assumes `model` is still the VitsModel loaded in the cell above.
sf.write("mms_output.wav", waveform_numpy, model.config.sampling_rate)

# Optionally, play the audio (requires IPython)
from IPython.display import Audio
Audio("mms_output.wav")

In [None]:
from transformers import AutoModel, AutoProcessor

# Processor and model are loaded from the same Bark checkpoint.
bark_checkpoint = "suno/bark-small"
processor = AutoProcessor.from_pretrained(bark_checkpoint)
model = AutoModel.from_pretrained(bark_checkpoint).to(device)

# A single prompt, passed to the processor as a one-element batch.
bark_prompt = "Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe."
inputs = processor(text=[bark_prompt], return_tensors="pt").to(device)

# Sampling is enabled, so the output differs from run to run.
speech_values = model.generate(**inputs, do_sample=True)

In [None]:
# Convert the speech values tensor to a NumPy array
speech_numpy = speech_values.squeeze().cpu().numpy()

# Save the NumPy array as a WAV file.
# Bark does not generate 16 kHz audio: writing the samples with a 16000 Hz
# header makes playback slow and low-pitched. Use the rate the model itself
# declares in its generation config.
# Assumes `model` is still the Bark model loaded in the cell above.
bark_sample_rate = model.generation_config.sample_rate
sf.write("suno_output.wav", speech_numpy, bark_sample_rate)

# Optionally, play the audio (requires IPython)
from IPython.display import Audio
Audio("suno_output.wav")

In [None]:
# Speaker preset that conditions the generated voice.
voice_preset = "v2/en_speaker_6"

inputs = processor("Hello, my dog is cute", voice_preset=voice_preset).to(
    device
)

audio_array = model.generate(**inputs)
audio_array = audio_array.cpu().numpy().squeeze()

# Save the NumPy array as a WAV file.
# The sample rate comes from Bark's generation config rather than a
# hard-coded 16000 Hz, which does not match Bark's output and distorts
# playback speed and pitch.
# Assumes `model` and `processor` are still the Bark objects loaded earlier.
bark_sample_rate = model.generation_config.sample_rate
sf.write("6_speaker_output.wav", audio_array, bark_sample_rate)

# Optionally, play the audio (requires IPython)
from IPython.display import Audio
Audio("6_speaker_output.wav")

In [None]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Model and processor are loaded from the same MusicGen checkpoint.
musicgen_checkpoint = "facebook/musicgen-small"
model = MusicgenForConditionalGeneration.from_pretrained(musicgen_checkpoint).to(device)
processor = AutoProcessor.from_pretrained(musicgen_checkpoint)

# One text prompt, padded and returned as PyTorch tensors on the target device.
inputs = processor(
    text=["an intense rock guitar solo"], padding=True, return_tensors="pt"
).to(device)

# Sampling is disabled for this run.
generation_kwargs = dict(do_sample=False, guidance_scale=3, max_new_tokens=256)
audio_values = model.generate(**inputs, **generation_kwargs)


In [None]:
# Convert the audio values tensor to a NumPy array
audio_numpy = audio_values.squeeze().cpu().numpy()

# Save the NumPy array as a WAV file.
# MusicGen's output rate is defined by its audio encoder config; writing the
# samples with an 8000 Hz header would play the clip far too slowly.
# Assumes `model` is still the MusicGen model loaded in the cell above.
musicgen_sample_rate = model.config.audio_encoder.sampling_rate
sf.write("musicgen_output.wav", audio_numpy, musicgen_sample_rate)

# Optionally, play the audio (requires IPython)
from IPython.display import Audio
Audio("musicgen_output.wav")

In [None]:
pip install --upgrade transformers torch

In [None]:
import torch
from transformers import pipeline

# Work out the device once, in both forms used below.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
int_device = 0 if cuda_available else -1  # pipeline index: 0 for GPU, -1 for CPU

# Build the text-to-audio pipeline and run one prompt; any failure is reported
# as a message instead of a traceback.
try:
    pipe = pipeline(
        "text-to-audio",
        model="facebook/musicgen-small",
        device=int_device,
    )

    # Generate with the pipeline's default settings.
    data = pipe("electric rock solo, very intense")

    # Show the generated audio data.
    print(data)
except Exception as exc:
    print(f"Error during pipeline execution: {exc}")


In [None]:
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model and processor manually to check for any issues.
try:
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to(device)
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
except Exception as e:
    print(f"Error loading model or processor: {e}")
    # Re-raise after reporting: continuing would run the following steps
    # against an undefined or stale `model`/`processor` left in the kernel
    # and bury the real failure under follow-up errors.
    raise

# Prepare inputs
try:
    inputs = processor(
        text=["electric rock solo, very intense"],
        padding=True,
        return_tensors="pt"
    ).to(device)
except Exception as e:
    print(f"Error preparing inputs: {e}")
    # Same reasoning: do not go on with `inputs` from an earlier cell.
    raise

# Function to validate tensors
def validate_tensors(inputs):
    """Reject inputs that contain NaN, infinite or negative values.

    Args:
        inputs: Object exposing ``.items()`` that yields ``(name, tensor)``
            pairs of torch tensors.

    Raises:
        ValueError: For the first tensor (in iteration order) that contains
            a NaN, an infinity, or a value below zero. The message names the
            offending key.
    """
    # Each check maps a tensor to a boolean mask of offending elements.
    checks = (torch.isnan, torch.isinf, lambda values: values < 0)
    for key, tensor in inputs.items():
        # Checks are evaluated lazily, in order, stopping at the first hit.
        if any(check(tensor).any() for check in checks):
            raise ValueError(f"Invalid values found in tensor: {key}")

# Generate audio: check the inputs first, then sample from the model.
# Any failure is reported as a message instead of a traceback.
generation_kwargs = dict(do_sample=True, guidance_scale=3, max_new_tokens=256)
try:
    validate_tensors(inputs)
    audio_values = model.generate(**inputs, **generation_kwargs)
    print(audio_values)
except Exception as exc:
    print(f"Error during audio generation: {exc}")