In [22]:
!pip install -q transformers gradio datasets librosa sentencepiece aksharamukha OmegaConf

In [24]:
!sudo apt-get install libportaudio2

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  libportaudio2
0 upgraded, 1 newly installed, 0 to remove and 15 not upgraded.
Need to get 65.3 kB of archives.
After this operation, 223 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudio2 amd64 19.6.0-1.1 [65.3 kB]
Fetched 65.3 kB in 1s (56.0 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 120500 fi

### 1. ASR Model (English Speech to Text)
> Link: https://huggingface.co/openai/whisper-small ~1GB <br>
> WER: 11.3%

In [40]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [42]:
# Load the Whisper processor and the generation model from the same checkpoint.
asr_checkpoint = "openai/whisper-small"
asr_processor = WhisperProcessor.from_pretrained(asr_checkpoint)
asr_model = WhisperForConditionalGeneration.from_pretrained(asr_checkpoint)

# Clear any forced decoder ids stored in the model config.
asr_model.config.forced_decoder_ids = None

In [43]:
# Absolute path of the English recording used for the ASR demo.
wav_file: str = "/content/hello_english.wav"
# Sampling rate (Hz) requested when the recording is loaded.
sample_rate: int = 16000

In [44]:
import librosa
# Load the recording at the requested rate; librosa returns the samples together with
# the sampling rate, which is rebound to `sample_rate`.
audio_input, sample_rate = librosa.load(wav_file, sr=sample_rate)

In [45]:
# Convert the waveform into the model's input features (PyTorch tensors).
asr_inputs = asr_processor(audio_input, sampling_rate=sample_rate, return_tensors="pt")
input_features = asr_inputs.input_features

# Generate token ids, then decode them to text with special tokens stripped.
predicted_ids = asr_model.generate(input_features)
transcription = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)



In [46]:
# Show the decoded transcription (a list of strings).
print(transcription)

[' Hello, my name is Byshal Sergol.']


### 2. Machine Translation (English to Hindi)
> Link: https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt ~2.5GB <br>
> BLEU Score: 35.8


In [47]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

In [48]:
# Load the mBART-50 one-to-many tokenizer (source language: English) and model
# from the same checkpoint.
mt_checkpoint = "facebook/mbart-large-50-one-to-many-mmt"
mt_tokenizer = MBart50TokenizerFast.from_pretrained(mt_checkpoint, src_lang="en_XX")
mt_model = MBartForConditionalGeneration.from_pretrained(mt_checkpoint)

In [49]:
# English sample sentence used to exercise the translation model.
article_en: str = "The head of the United Nations says there is no military solution in Syria"

In [50]:
# Tokenize the English sentence, requesting PyTorch tensors ("pt").
model_inputs = mt_tokenizer(article_en, return_tensors="pt")

In [51]:
# Forcing the first generated token to the Hindi language code selects the target language.
hindi_lang_id = mt_tokenizer.lang_code_to_id["hi_IN"]
generated_tokens = mt_model.generate(**model_inputs, forced_bos_token_id=hindi_lang_id)



In [52]:
# Decode the generated token ids back to text, dropping special tokens; the result is a list of strings.
generated_text = mt_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [53]:
# Show the Hindi translation.
print(generated_text)

['संयुक्त राष्ट्र के नेता कहते हैं कि सीरिया में कोई सैन्य समाधान नहीं है']


### 3. TTS Model (Hindi Text to Speech)
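> Model: Silero TTS (`snakers4/silero-models`, `v3_indic`, loaded via `torch.hub`); the Devanagari text is first romanized to ISO with Aksharamukha <br>
> Link: https://aksharamukha.appspot.com/python ~55MB <br>
> MOS Score: 4.03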

In [10]:
import torch
from aksharamukha import transliterate

In [54]:
# Fetch the Silero TTS entry point through torch.hub; the call returns the model
# together with an example text.
tts_model, example_text = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language='indic',
    speaker='v3_indic',
)

Using cache found in /root/.cache/torch/hub/snakers4_silero-models_master


In [55]:
# Hindi sample sentence (Devanagari script) used to exercise the TTS model.
orig_text: str = "प्रसिद्द कबीर अध्येता, पुरुषोत्तम अग्रवाल का यह शोध आलेख, उस रामानंद की खोज करता है"

In [56]:
# Transliterate the Devanagari text into the ISO romanization before synthesis.
roman_text = transliterate.process('Devanagari', 'ISO', orig_text)

In [57]:
# Show the romanized text that will be passed to the TTS model.
print(roman_text)

prasidda kabīra adhyētā, puruṣōttama agravāla kā yaha śōdha ālēkha, usa rāmānaṁda kī khōja karatā hai


In [58]:
# Synthesize speech from the romanized text using the Hindi male voice.
tts_speaker = 'hindi_male'
audio = tts_model.apply_tts(roman_text, speaker=tts_speaker)

In [59]:
import tensorflow as tf  # not used by this cell; kept so the `tf` name stays defined for other cells
import numpy as np
import soundfile as sf

# `audio` is a torch tensor, so convert it directly to a float32 NumPy array
# instead of routing it through TensorFlow just for a dtype cast.
audio_samples = audio.detach().cpu().numpy().astype(np.float32)

# Write the waveform at 48 kHz.
sf.write("audio.wav", audio_samples, samplerate=48000)

In [60]:
# Import the display submodule explicitly, then render an inline player for the written file.
import IPython.display

IPython.display.Audio("audio.wav")

## Complete Pipeline

In [61]:
def translate(wav_file, output_path="audio.wav"):
    """Translate English speech in an audio file into synthesized Hindi speech.

    Runs three stages using the models loaded in earlier cells:
    Whisper ASR (speech -> English text), mBART-50 (English -> Hindi text),
    and Silero TTS (romanized Hindi text -> waveform).

    Args:
        wav_file: Path of the English recording to translate.
        output_path: Where the synthesized Hindi speech is written as a WAV
            file. Defaults to "audio.wav", the path this pipeline has always
            written to.

    Returns:
        The synthesized waveform exactly as returned by
        ``tts_model.apply_tts`` (a torch tensor).
    """
    asr_sample_rate = 16000  # rate the recording is loaded at for the ASR processor
    tts_sample_rate = 48000  # rate used both for synthesis and for the written file

    # 1. ASR Model (English)
    audio_input, _ = librosa.load(wav_file, sr=asr_sample_rate)
    input_features = asr_processor(
        audio_input, sampling_rate=asr_sample_rate, return_tensors="pt"
    ).input_features
    predicted_ids = asr_model.generate(input_features)
    english_text = asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # 2. Machine Translation (English -> Hindi)
    model_inputs = mt_tokenizer(english_text, return_tensors="pt")
    generated_tokens = mt_model.generate(
        **model_inputs,
        forced_bos_token_id=mt_tokenizer.lang_code_to_id["hi_IN"],
    )
    hindi_text = mt_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    # 3. TTS Model (Hindi) — the model is fed ISO-romanized text, not Devanagari.
    roman_text = transliterate.process('Devanagari', 'ISO', hindi_text)
    # Pass the sample rate explicitly so the synthesized audio and the file header agree.
    audio = tts_model.apply_tts(roman_text, speaker='hindi_male', sample_rate=tts_sample_rate)

    # Convert the torch tensor directly to float32 NumPy samples; no TensorFlow
    # round-trip is needed for the dtype cast.
    audio_samples = audio.detach().cpu().numpy().astype(np.float32)
    sf.write(output_path, audio_samples, samplerate=tts_sample_rate)
    return audio

In [62]:
# Run the full pipeline on the sample recording and display the returned waveform.
translated_audio = translate("/content/hello_english.wav")
translated_audio



tensor([ 0.0015,  0.0015,  0.0008,  ..., -0.0129, -0.0387,  0.0303])

In [63]:
# Import the display submodule explicitly, then render an inline player for the pipeline's output file.
import IPython.display

IPython.display.Audio("audio.wav")