**1. Install the required libraries**

In [1]:
!pip install --quiet git+https://github.com/huggingface/transformers sentencepiece

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Building wheel for transformers (pyproject.toml) ... done


**2. Load the Model**

In [2]:
# Dependencies for this step
import torch
from transformers import SeamlessM4TModel

# Pick the first CUDA device when one is available; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the pre-trained SeamlessM4T medium checkpoint and place it on the chosen device
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium").to(device)



config.json:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/5.35k [00:00<?, ?B/s]

**3. Load the Processor**

In [3]:
# AutoProcessor is used to load the processor that belongs to the checkpoint
from transformers import AutoProcessor

# Same SeamlessM4T medium checkpoint that the model was loaded from
checkpoint_id = "facebook/hf-seamless-m4t-medium"
processor = AutoProcessor.from_pretrained(checkpoint_id)

# Sampling rate as stored in the configuration of the already-loaded model
model_config = model.config
sample_rate = model_config.sampling_rate

preprocessor_config.json:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

**4. Speech-to-Text Translation**

In [4]:
import os
import zipfile

# Uploaded archive, and the folder its contents are unpacked into
zip_path = "audio.zip"
extract_dir = "audio"

# Unpack every member of the archive; the context manager closes it afterwards
with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(extract_dir)

# List the target folder to confirm what was unpacked
extracted_files = os.listdir(extract_dir)
print(f"Extracted Files: {extracted_files}")


Extracted Files: ['OSR_us_000_0061_8k.wav']


In [5]:
import torchaudio

# Location of the .wav file inside the extraction folder
audio_file_path = os.path.join(extract_dir, "OSR_us_000_0061_8k.wav")

# Read the waveform together with the sampling rate stored in the file
audio_sample, audio_sampling_rate = torchaudio.load(audio_file_path)

# Resample only when the file's rate differs from `sample_rate` (taken from the model config)
if audio_sampling_rate != sample_rate:
    audio_sample = torchaudio.functional.resample(
        audio_sample,
        orig_freq=audio_sampling_rate,
        new_freq=sample_rate,
    )

# Run the waveform through the processor, then move the resulting inputs to `device`
audio_inputs = processor(
    audios=audio_sample,
    return_tensors="pt",
    sampling_rate=sample_rate,
)
audio_inputs = audio_inputs.to(device)

# Generate text only (speech generation disabled) with English ("eng") as the target language
output_tokens = model.generate(**audio_inputs, tgt_lang="eng", generate_speech=False)

# Take the first entry of the generated token ids and decode it, dropping special tokens
first_sequence_ids = output_tokens[0].tolist()[0]
translated_text_from_audio = processor.decode(first_sequence_ids, skip_special_tokens=True)

# Show the decoded text
print(f"Translated Text: {translated_text_from_audio}")

Translated Text: The mute muffled the high tones of the horn. The old ring was covered with a hard fudge. The heap on the pierced stem was set on fire. The tin can was absent from store shelves.


Running the application locally takes a long time because the model is heavy. You can run the app locally with Streamlit by entering the following command in a terminal (cmd):

In [None]:
streamlit run app.py

Here is the full code of `app.py`:

In [None]:
"""Streamlit app: upload an audio file and transcribe it with a Seamless M4T ASR pipeline."""
import streamlit as st
from transformers import pipeline

# Checkpoint used by the ASR pipeline
MODEL_ID = "facebook/seamless-m4t-v2-large"


@st.cache_resource
def load_asr_pipeline():
    """Build the automatic-speech-recognition pipeline once and reuse it.

    Streamlit re-executes this script on every user interaction; caching the
    pipeline as a resource avoids reloading the checkpoint on each rerun.

    Returns:
        The `transformers` pipeline object for "automatic-speech-recognition".
    """
    return pipeline("automatic-speech-recognition", model=MODEL_ID)


# Create the ASR pipeline. This is the only place the checkpoint is loaded:
# the separate tokenizer/model copies were never used for inference.
asr_pipeline = load_asr_pipeline()

# Streamlit app
st.title("Seamless M4T ASR Streamlit App")

st.header("Upload an audio file for transcription")

# File uploader for audio files
audio_file = st.file_uploader("Choose an audio file", type=["wav", "mp3", "flac"])

if audio_file is not None:
    # Display an audio player using the MIME type reported for the upload
    # (the uploader also accepts mp3 and flac, not only wav)
    st.audio(audio_file, format=audio_file.type)

    # Perform ASR on the uploaded bytes. The upload lives in memory, so its
    # `.name` is not a path on disk the pipeline could open.
    with st.spinner('Transcribing...'):
        transcription = asr_pipeline(audio_file.getvalue())["text"]

    # Display the transcription
    st.subheader("Transcription")
    st.write(transcription)