In [1]:
import os

# Read the Hugging Face access token from the environment instead of embedding
# it in the notebook: a token stored in the file is exposed to everyone who can
# read it. When HF_TOKEN is unset this is None, i.e. anonymous access.
hf_token = os.environ.get("HF_TOKEN")

In [2]:
import os
import torch
import torchaudio
from transformers import (
    SeamlessM4TFeatureExtractor,
    SeamlessM4TTokenizer,
    SeamlessM4Tv2ForSpeechToText,
)

from lang_list import (
    LANGUAGE_NAME_TO_CODE,
)

# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# float32 on the CPU, bfloat16 on any other device.
torch_dtype = torch.float32 if device == "cpu" else torch.bfloat16



In [3]:
# Load model and processor
# The Hugging Face token comes from the environment so the secret is never
# stored in the notebook; None (variable unset) means anonymous access.
hf_token = os.environ.get("HF_TOKEN")

model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    "ai4bharat/indic-seamless", torch_dtype=torch_dtype, token=hf_token
).to(device)
processor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless", token=hf_token)
tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless", token=hf_token)

# 16 kHz matches the rate used by the combined pipeline cell later in this
# notebook and the SeamlessM4T feature extractor's documented default; the
# previous value of 20000 disagreed with both.
AUDIO_SAMPLE_RATE = 16000
MAX_INPUT_AUDIO_LENGTH = 60  # seconds



config.json:   0%|          | 0.00/2.76k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/139k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/9.91M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.7k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.17M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

In [None]:
def run_s2tt(input_audio_path: str, source_language: str, target_language: str) -> str:
    """Translate the speech in an audio file into text in ``target_language``.

    Args:
        input_audio_path: Path of the audio file, read with ``torchaudio.load``.
        source_language: Not used by this function.
        target_language: Key into ``LANGUAGE_NAME_TO_CODE``; the mapped code is
            passed to ``model.generate`` as ``tgt_lang``.

    Returns:
        The text decoded by ``tokenizer.decode`` with special tokens skipped.

    Raises:
        KeyError: If ``target_language`` is not in ``LANGUAGE_NAME_TO_CODE``.
    """
    tgt_code = LANGUAGE_NAME_TO_CODE[target_language]

    # Read the file and resample it to the rate used by the rest of the cell.
    waveform, native_rate = torchaudio.load(input_audio_path)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=native_rate, new_freq=AUDIO_SAMPLE_RATE
    )

    # Keep at most MAX_INPUT_AUDIO_LENGTH seconds worth of samples.
    sample_cap = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if waveform.shape[1] > sample_cap:
        waveform = waveform[:, :sample_cap]
        print(f"⚠️ Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")

    # Extract model inputs, then move them to the configured device and dtype.
    features = processor(waveform, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt")
    features = features.to(device=device, dtype=torch_dtype)

    # Take the first element returned by generate() and bring it to the host
    # as a squeezed NumPy array for decoding.
    first_output = model.generate(**features, tgt_lang=tgt_code)[0]
    token_ids = first_output.float().cpu().numpy().squeeze()

    return tokenizer.decode(token_ids, clean_up_tokenization_spaces=True, skip_special_tokens=True)


if __name__ == "__main__":
    # Demo: translate a Punjabi recording into Tamil text and print the result.
    audio_file, source_lang, target_lang = "/content/Punjabi.wav", "Punjabi", "Tamil"

    output = run_s2tt(
        input_audio_path=audio_file,
        source_language=source_lang,
        target_language=target_lang,
    )
    print("📝 Translated Text:", output)

📝 Translated Text: சோஃபி மற்றும் விதா ஆகியோர் தங்கள் நாட்டைப் பாதுகாக்கும் பணியை சிறப்பாகச் செய்ததால் இந்த மரியாதைக்குரிய விருதைப் பெற்றனர்


In [4]:
# Write the model, its tokenizer and its feature extractor into one local
# directory.
save_directory = "./indic-seamless-s2tt-local"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)
# Kept as the cell's final expression so its return value is displayed.
processor.save_pretrained(save_directory)

['./indic-seamless-s2tt-local/preprocessor_config.json']

In [None]:
import io
import os
import numpy as np
import torch
import nltk
from pydub import AudioSegment
from transformers import AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration

# Fetch the "punkt_tab" data used by nltk.sent_tokenize.
nltk.download("punkt_tab")

# Run on CUDA in bfloat16 when it is available, otherwise on the CPU in float32.
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.bfloat16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint identifiers. Only `repo_id` is loaded below; `finetuned_repo_id`
# is defined here but not used in this cell.
repo_id = "ai4bharat/indic-parler-tts-pretrained"
finetuned_repo_id = "ai4bharat/indic-parler-tts"

model = ParlerTTSForConditionalGeneration.from_pretrained(
    repo_id,
    attn_implementation="eager",
    torch_dtype=torch_dtype,
)
model = model.to(device)

# Prompt tokenizer and feature extractor come from the TTS checkpoint; the
# voice description is tokenized with the flan-t5-large tokenizer.
tokenizer = AutoTokenizer.from_pretrained(repo_id)
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Sample rate reported by the feature extractor, used when encoding the output.
sampling_rate = feature_extractor.sampling_rate


# Helper: Convert numpy audio to MP3
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a single-channel audio signal as MP3 and return the bytes.

    Floating-point input is scaled so its peak maps to 32767 and converted to
    int16; input of any other dtype is passed to pydub unchanged, with its
    item size used as the sample width.

    Args:
        audio_array: Array-like of audio samples, encoded as one channel.
        sampling_rate: Passed to ``AudioSegment`` as ``frame_rate``.

    Returns:
        bytes: The data written by ``AudioSegment.export`` (MP3, 320k bitrate).

    Raises:
        ValueError: If ``audio_array`` contains no samples.
    """
    # Accept lists and other array-likes, not only ndarrays.
    audio_array = np.asarray(audio_array)
    if audio_array.size == 0:
        # np.max() on an empty array fails with an opaque reduction error;
        # report the actual problem instead.
        raise ValueError("audio_array is empty; there is no audio to encode")

    if np.issubdtype(audio_array.dtype, np.floating):
        peak = np.max(np.abs(audio_array))
        if peak > 0:  # all-zero input stays all-zero (avoids division by zero)
            audio_array = (audio_array / peak) * 32767
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # The context manager releases the buffer even when the export raises.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()


# Main function: Generate audio
def _chunk_sentences(sentences, chunk_size):
    """Group consecutive sentences into chunks, starting a new chunk whenever
    adding the next sentence would reach ``chunk_size`` words. A sentence that
    reaches the limit on its own becomes a chunk by itself."""
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        candidate = (current_chunk + " " + sentence).strip()
        if len(candidate.split()) >= chunk_size:
            # Flush what has been collected so far. When the very first
            # sentence already reaches the limit nothing has been collected
            # yet, and emitting "" would send an empty prompt to the model.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = candidate
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks


# Main function: Generate audio
def generate_audio(text: str, description: str):
    """Synthesise speech for ``text`` and return it encoded by ``numpy_to_mp3``.

    The text is split with ``nltk.sent_tokenize``, the sentences are grouped
    into chunks, each chunk is generated separately with the tokenized
    ``description`` as ``input_ids``, and the per-chunk waveforms are
    concatenated before encoding.

    Args:
        text: Text to speak.
        description: Voice description, tokenized with ``description_tokenizer``.

    Returns:
        The value returned by ``numpy_to_mp3`` for the concatenated audio.

    Raises:
        ValueError: If no audio was produced, e.g. because ``text`` is empty.
    """
    chunk_size = 25
    inputs = description_tokenizer(description, return_tensors="pt").to(device)
    chunks = _chunk_sentences(nltk.sent_tokenize(text), chunk_size)

    all_audio = []
    for chunk in chunks:
        prompt = tokenizer(chunk, return_tensors="pt").to(device)

        # Sampling is enabled, so repeated calls can produce different audio.
        output = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            prompt_input_ids=prompt.input_ids,
            prompt_attention_mask=prompt.attention_mask,
            do_sample=True,
            return_dict_in_generate=True
        )

        # Outputs lacking `sequences` / `audios_length` are skipped.
        if hasattr(output, 'sequences') and hasattr(output, 'audios_length'):
            # Keep only the first `audios_length[0]` samples of the first item.
            audio = output.sequences[0, :output.audios_length[0]]
            audio_np = audio.to(torch.float32).cpu().numpy().squeeze()
            # reshape(-1) always yields a 1-D array, including for 0-d input.
            all_audio.append(audio_np.reshape(-1))

    if not all_audio:
        # np.concatenate([]) would raise an unhelpful "need at least one array".
        raise ValueError("No audio was generated: the text is empty or the model returned no audio.")

    final_audio = np.concatenate(all_audio)
    print(f"[INFO] Final audio length: {round(final_audio.shape[0] / sampling_rate, 2)} seconds")
    return numpy_to_mp3(final_audio, sampling_rate)


# Sample usage
if __name__ == "__main__":
    # Demo inputs: where to write the MP3, the voice description and the text.
    output_path = "output.mp3"
    sample_description = "A male narrator speaking in an enthusiastic tone."
    sample_text = "சோஃபி மற்றும் விதா ஆகியோர் தங்கள் நாட்டைப் பாதுகாக்கும் பணியை சிறப்பாகச் செய்ததால் இந்த மரியாதைக்குரிய விருதைப் பெற்றனர்"

    print("[INFO] Generating audio...")
    mp3_audio = generate_audio(text=sample_text, description=sample_description)

    # Write the encoded bytes to disk.
    with open(output_path, "wb") as f:
        f.write(mp3_audio)

    print(f"[SUCCESS] Audio saved to {output_path}")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "ylacombe/dac_44khz",
  "architectures": [
    "DacModel"
  ],
  "codebook_

[INFO] Generating audio...
[INFO] Final audio length: 7.53 seconds
[SUCCESS] Audio saved to output.mp3


In [5]:
# Run this before any cell that imports pydub. %pip installs into the running
# kernel's environment, and the version is pinned to the one this notebook was
# last run with so a re-run gets the same package.
%pip install pydub==0.25.1

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [6]:
# Run this before any cell that imports parler_tts. Pinned to the commit this
# notebook was last run with, so a re-run does not silently pick up a newer
# revision of the repository's default branch.
%pip install git+https://github.com/huggingface/parler-tts.git@d108732cd57788ec86bc857d99a6cabd66663d68


Collecting git+https://github.com/huggingface/parler-tts.git
  Cloning https://github.com/huggingface/parler-tts.git to /tmp/pip-req-build-vk5_yfql
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/parler-tts.git /tmp/pip-req-build-vk5_yfql
  Resolved https://github.com/huggingface/parler-tts.git to commit d108732cd57788ec86bc857d99a6cabd66663d68
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting descript-audiotools@ git+https://github.com/descriptinc/audiotools (from parler_tts==0.2.2)
  Cloning https://github.com/descriptinc/audiotools to /tmp/pip-install-m5zhlnu9/descript-audiotools_da5f075ba2b84b3f9959946815a1b04b
  Running command git clone --filter=blob:none --quiet https://github.com/descriptinc/audiotools /tmp/pip-install-m5zhlnu9/descript-audiotools_da5f075ba2b84b3f9959946815a1b04b
  Resolved https://github.com/d

In [3]:
import os
import io
import torch
import torchaudio
import nltk
import numpy as np
from pydub import AudioSegment
from transformers import (
    SeamlessM4TFeatureExtractor,
    SeamlessM4TTokenizer,
    SeamlessM4Tv2ForSpeechToText,
    AutoTokenizer,
    AutoFeatureExtractor
)
from parler_tts import ParlerTTSForConditionalGeneration
from lang_list import LANGUAGE_NAME_TO_CODE

# Download NLTK data (used by nltk.sent_tokenize)
nltk.download("punkt_tab")

# Device and data type config: CUDA, then Apple MPS, then CPU; bfloat16 on
# anything other than the CPU.
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
torch_dtype = torch.bfloat16 if device != "cpu" else torch.float32

# Constants
AUDIO_SAMPLE_RATE = 16000
MAX_INPUT_AUDIO_LENGTH = 60  # seconds
# The Hugging Face token is read from the environment so the secret is never
# stored in the notebook; None (variable unset) means anonymous access.
hf_token = os.environ.get("HF_TOKEN")

# Load SeamlessM4T model and processor. local_files_only=True restricts loading
# to files already present locally.
s2t_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
    "ai4bharat/indic-seamless", torch_dtype=torch_dtype, token=hf_token, local_files_only=True
).to(device)
s2t_processor = SeamlessM4TFeatureExtractor.from_pretrained(
    "ai4bharat/indic-seamless", token=hf_token, local_files_only=True
)
s2t_tokenizer = SeamlessM4TTokenizer.from_pretrained(
    "ai4bharat/indic-seamless", token=hf_token, local_files_only=True
)

# Load Parler-TTS: model, prompt tokenizer and feature extractor come from the
# TTS checkpoint; the voice description uses the flan-t5-large tokenizer.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
    "ai4bharat/indic-parler-tts-pretrained", attn_implementation="eager", torch_dtype=torch_dtype
).to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts-pretrained")
desc_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
feature_extractor = AutoFeatureExtractor.from_pretrained("ai4bharat/indic-parler-tts-pretrained")
# Sample rate reported by the feature extractor, used when encoding the output.
sampling_rate = feature_extractor.sampling_rate


# 1️⃣ Speech-to-Text with SeamlessM4T
def run_speech_to_text(input_audio_path: str, target_language: str) -> str:
    """Translate the speech in an audio file into text in ``target_language``.

    Args:
        input_audio_path: Path of the audio file, read with ``torchaudio.load``.
        target_language: Key into ``LANGUAGE_NAME_TO_CODE``; the mapped code is
            passed to ``s2t_model.generate`` as ``tgt_lang``.

    Returns:
        The text decoded by ``s2t_tokenizer.decode`` with special tokens skipped.

    Raises:
        KeyError: If ``target_language`` is not in ``LANGUAGE_NAME_TO_CODE``.
    """
    tgt_code = LANGUAGE_NAME_TO_CODE[target_language]

    # Read the file and resample it to AUDIO_SAMPLE_RATE.
    waveform, native_rate = torchaudio.load(input_audio_path)
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=native_rate, new_freq=AUDIO_SAMPLE_RATE
    )

    # Keep at most MAX_INPUT_AUDIO_LENGTH seconds worth of samples.
    sample_cap = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if waveform.shape[1] > sample_cap:
        waveform = waveform[:, :sample_cap]
        print(f"⚠️ Input audio too long. Truncated to {MAX_INPUT_AUDIO_LENGTH} seconds.")

    # Extract model inputs, then move them to the configured device and dtype.
    features = s2t_processor(waveform, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt")
    features = features.to(device=device, dtype=torch_dtype)

    # Take the first element returned by generate() and bring it to the host
    # as a squeezed NumPy array for decoding.
    first_output = s2t_model.generate(**features, tgt_lang=tgt_code)[0]
    token_ids = first_output.float().cpu().numpy().squeeze()

    return s2t_tokenizer.decode(token_ids, clean_up_tokenization_spaces=True, skip_special_tokens=True)


# 2️⃣ Text-to-Speech with Parler-TTS
def _chunk_sentences(sentences, chunk_size):
    """Group consecutive sentences into chunks, starting a new chunk whenever
    adding the next sentence would reach ``chunk_size`` words. A sentence that
    reaches the limit on its own becomes a chunk by itself."""
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        candidate = (current_chunk + " " + sentence).strip()
        if len(candidate.split()) >= chunk_size:
            # Flush what has been collected so far. When the very first
            # sentence already reaches the limit nothing has been collected
            # yet, and emitting "" would send an empty prompt to the model.
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk = candidate
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks


def generate_audio(text: str, description: str = "A female doctor speaking normally in a soft neutral tone."):
    """Synthesise speech for ``text`` and return it encoded by ``numpy_to_mp3``.

    The text is split with ``nltk.sent_tokenize``, the sentences are grouped
    into chunks, each chunk is generated separately with the tokenized
    ``description`` as ``input_ids``, and the per-chunk waveforms are
    concatenated before encoding.

    Args:
        text: Text to speak.
        description: Voice description, tokenized with ``desc_tokenizer``.

    Returns:
        The value returned by ``numpy_to_mp3`` for the concatenated audio.

    Raises:
        ValueError: If no audio was produced, e.g. because ``text`` is empty.
    """
    chunk_size = 25
    desc_inputs = desc_tokenizer(description, return_tensors="pt").to(device)

    # Chunking
    chunks = _chunk_sentences(nltk.sent_tokenize(text), chunk_size)

    # Audio synthesis
    all_audio = []
    for chunk in chunks:
        prompt = tts_tokenizer(chunk, return_tensors="pt").to(device)
        # Sampling is enabled, so repeated calls can produce different audio.
        output = tts_model.generate(
            input_ids=desc_inputs.input_ids,
            attention_mask=desc_inputs.attention_mask,
            prompt_input_ids=prompt.input_ids,
            prompt_attention_mask=prompt.attention_mask,
            do_sample=True,
            return_dict_in_generate=True
        )
        # Outputs lacking `sequences` / `audios_length` are skipped.
        if hasattr(output, 'sequences') and hasattr(output, 'audios_length'):
            # Keep only the first `audios_length[0]` samples of the first item.
            audio = output.sequences[0, :output.audios_length[0]]
            audio_np = audio.to(torch.float32).cpu().numpy().squeeze()
            all_audio.append(audio_np.flatten())

    if not all_audio:
        # np.concatenate([]) would raise an unhelpful "need at least one array".
        raise ValueError("No audio was generated: the text is empty or the model returned no audio.")

    final_audio = np.concatenate(all_audio)
    return numpy_to_mp3(final_audio, sampling_rate)


# Helper: Convert numpy audio to MP3
def numpy_to_mp3(audio_array, sampling_rate):
    """Encode a single-channel audio signal as MP3 and return the bytes.

    Floating-point input is scaled so its peak maps to 32767 and converted to
    int16; input of any other dtype is passed to pydub unchanged, with its
    item size used as the sample width.

    Args:
        audio_array: Array-like of audio samples, encoded as one channel.
        sampling_rate: Passed to ``AudioSegment`` as ``frame_rate``.

    Returns:
        bytes: The data written by ``AudioSegment.export`` (MP3, 320k bitrate).

    Raises:
        ValueError: If ``audio_array`` contains no samples.
    """
    # Accept lists and other array-likes, not only ndarrays.
    audio_array = np.asarray(audio_array)
    if audio_array.size == 0:
        # np.max() on an empty array fails with an opaque reduction error;
        # report the actual problem instead.
        raise ValueError("audio_array is empty; there is no audio to encode")

    if np.issubdtype(audio_array.dtype, np.floating):
        peak = np.max(np.abs(audio_array))
        if peak > 0:  # all-zero input stays all-zero (avoids division by zero)
            audio_array = (audio_array / peak) * 32767
        audio_array = audio_array.astype(np.int16)

    audio_segment = AudioSegment(
        audio_array.tobytes(),
        frame_rate=sampling_rate,
        sample_width=audio_array.dtype.itemsize,
        channels=1,
    )

    # The context manager closes the buffer, which the original never did, and
    # does so even when the export raises.
    with io.BytesIO() as mp3_io:
        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
        return mp3_io.getvalue()


# 🔁 Full Pipeline: Speech ➝ Text ➝ Translation ➝ Speech
def speech_to_speech(input_audio_path, source_lang, target_lang, output_mp3_path="translatednorm_output.mp3"):
    """Translate an audio file to text, synthesise that text and save an MP3.

    Args:
        input_audio_path: Audio file handed to ``run_speech_to_text``.
        source_lang: Not used by this function.
        target_lang: Target language handed to ``run_speech_to_text``.
        output_mp3_path: File the bytes from ``generate_audio`` are written to.

    Returns:
        None. Progress messages are printed and the MP3 is written to disk.
    """
    # Stage 1: speech in, translated text out.
    print("🔊 Transcribing and translating...")
    text_in_target = run_speech_to_text(input_audio_path, target_lang)
    print(f"📝 Translated Text: {text_in_target}")

    # Stage 2: translated text in, encoded audio out (default voice description).
    print("🎙 Generating audio in target language...")
    encoded_audio = generate_audio(text_in_target)

    # Stage 3: persist the encoded bytes.
    with open(output_mp3_path, "wb") as sink:
        sink.write(encoded_audio)
    print(f"✅ Audio saved to {output_mp3_path}")


# Example usage
if __name__ == "__main__":
    # Demo: run the full pipeline on an English recording, targeting Tamil.
    # `input_audio` is the source-language audio file.
    input_audio, source_language, target_language = "/content/harvard.wav", "English", "Tamil"

    speech_to_speech(
        input_audio_path=input_audio,
        source_lang=source_language,
        target_lang=target_language,
    )


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

  "_name_or_path": "ylacombe/dac_44khz",
  "architectures": [
    "DacModel"
  ],
  "codebook_dim": 8,
  "codebook_loss_weight": 1.0,
  "codebook_size": 1024,
  "commitment_loss_weight": 0.25,
  "decoder_hidden_si

🔊 Transcribing and translating...
📝 Translated Text: பழைய பீர் நொறுங்கிய வாசனை நீடிக்கும் அது வாசனையை வெளிப்படுத்த வெப்பத்தை எடுக்கிறது ஒரு குளிர்ந்த நீராட்டம் ஆரோக்கியத்தையும் உற்சாகத்தையும் மீட்டெடுக்கிறது ஒரு உப்பு ஊறுகாய் ஹாம் உடன் நன்றாக சுவைக்கிறது டகோஸ் ஆல் பாஸ்டோர் எனக்கு மிகவும் பிடித்தமானது ஒரு உற்சாகமான உணவு சூடான குறுக்கு ரொட்டி
🎙 Generating audio in target language...




✅ Audio saved to translatednorm_output.mp3


In [None]:
# Save the Parler-TTS artefacts in a directory of their own. The previous
# target, "./indic-seamless-s2tt-local", is where the SeamlessM4T model is
# saved earlier in this notebook, so writing here would overwrite its files.
save_directory = "./indic-parler-tts-local"
# The prompt tokenizer and the description tokenizer are different tokenizers;
# saved into the same directory they would overwrite each other's files, so
# the description tokenizer gets its own subdirectory.
description_tokenizer_directory = "./indic-parler-tts-local/description_tokenizer"

tts_model.save_pretrained(save_directory)
tts_tokenizer.save_pretrained(save_directory)
desc_tokenizer.save_pretrained(description_tokenizer_directory)
# Kept as the cell's final expression so its return value is displayed.
feature_extractor.save_pretrained(save_directory)