Quantization with PyTorch

In [None]:
import torch
from transformers import SpeechT5ForTextToSpeech

# Output location for the quantized weights
quantized_model_path = "quantized_speechT5_model.pth"

# Restore the fine-tuned SpeechT5 checkpoint
model = SpeechT5ForTextToSpeech.from_pretrained("path/to/fine-tuned-model")

# Dynamic quantization: only torch.nn.Linear layers are targeted, using the
# 8-bit torch.qint8 dtype. A new model object is returned.
quantized_model = torch.quantization.quantize_dynamic(
    model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8
)

# Persist the quantized weights (state dict only, not the full module)
quantized_state = quantized_model.state_dict()
torch.save(quantized_state, quantized_model_path)

print("Quantization completed. Model saved at:", quantized_model_path)


Pruning to Improve Inference Speed

In [1]:
import copy

import torch.nn.utils.prune as prune

# Prune a copy so the baseline `model` stays untouched for the other cells and
# re-running this cell always starts from the same weights.
pruned_model = copy.deepcopy(model)

# Zero out the 30% of weights with the smallest L1 magnitude in every linear layer
for module in pruned_model.modules():
    if isinstance(module, torch.nn.Linear):
        prune.l1_unstructured(module, name='weight', amount=0.3)
        # Make the pruning permanent. Without this the state dict stores both
        # `weight_orig` and `weight_mask` for each layer, which makes the saved
        # file larger and its keys incompatible with the stock architecture.
        prune.remove(module, 'weight')

# Save the pruned model.
# Note: unstructured pruning only zeroes entries; the tensors stay dense, so the
# checkpoint does not shrink on disk unless it is compressed or stored sparsely.
pruned_model_path = "pruned_speechT5_model.pth"
torch.save(pruned_model.state_dict(), pruned_model_path)

print("Pruning completed. Model saved at:", pruned_model_path)


NameError: name 'model' is not defined

Inference Speed Testing on CPU/GPU and Edge Devices

In [None]:
import time

# Load the saved quantized weights into the *quantized* architecture. A
# dynamically quantized state dict holds packed int8 parameters, so loading it
# into the float `model` fails with missing/unexpected keys.
quantized_model.load_state_dict(torch.load(quantized_model_path))

# Prepare input text
# NOTE(review): `processor`, `speaker_embeddings` and `vocoder` are not defined
# in the cells shown here; they must be created earlier in the notebook.
text = "Using CUDA and APIs efficiently improves GPU performance."
inputs = processor(text=text, return_tensors="pt")

# Dynamically quantized Linear layers only provide CPU kernels, so this
# benchmark is pinned to the CPU instead of moving the model to CUDA.
# Assumes `speaker_embeddings` and `vocoder` also live on the CPU - TODO confirm.
device = torch.device("cpu")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Measure inference time with a monotonic, high-resolution clock
start_time = time.perf_counter()
with torch.no_grad():
    speech = quantized_model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
end_time = time.perf_counter()

print(f"Inference time: {end_time - start_time:.4f} seconds")


Evaluation - Model Size and Quality Trade-off

In [None]:
import os

# Collect the on-disk footprint (bytes) of each checkpoint before reporting.
# NOTE(review): no visible cell writes "path/to/fine-tuned-model.pth" - confirm
# that this file exists before running.
original_size = os.path.getsize("path/to/fine-tuned-model.pth")
quantized_size = os.path.getsize(quantized_model_path)
pruned_size = os.path.getsize(pruned_model_path)

# Report sizes in decimal megabytes (bytes / 1e6)
size_lines = (
    f"Original Model Size: {original_size / 1e6:.2f} MB",
    f"Quantized Model Size: {quantized_size / 1e6:.2f} MB",
    f"Pruned Model Size: {pruned_size / 1e6:.2f} MB",
)
for line in size_lines:
    print(line)

# Sample MOS evaluation output: these values are hard-coded text, not computed
# anywhere in this cell.
mos_lines = (
    "MOS Scores (out of 5):",
    "Original Model: 4.2",
    "Quantized Model: 4.0",
    "Pruned Model: 3.9",
)
for line in mos_lines:
    print(line)
