In [49]:
# Note: to use this code, please follow the instructions at
# https://tts.readthedocs.io/en/latest/installation.html to install the TTS package.

import os
import sys
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path
import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

import pyaudio
import sys
import speech_recognition as sr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Machine Translation
import librosa
from IPython.display import Audio
from transformers import MarianTokenizer, MarianMTModel

 

In [50]:
# Speech Recognition
def speechRecognition(wav = None, src = None):
    """Transcribe speech to text with the Google Web Speech recognizer.

    Args:
        wav: Optional path to an audio file to transcribe. When ``None``
            (the default) audio is captured from the microphone and a copy
            of the recording is written to ``microphone-results.wav``.
        src: Optional language code forwarded to ``recognize_google`` as
            ``language``. When ``None`` the recognizer's own default is used.

    Returns:
        The recognized text as returned by ``recognize_google``.

    Both parameters are optional so that the existing zero-argument call
    keeps working, while ``speechRecognition(wav, src)`` (as used by
    ``speechtranslate``) no longer raises ``TypeError``.
    """
    r = sr.Recognizer()

    if wav is None:
        # Hold the microphone only for as long as the capture takes.
        with sr.Microphone() as source:
            print("Please say something")
            audio = r.listen(source)
        with open("microphone-results.wav", "wb") as f:
            f.write(audio.get_wav_data())
    else:
        with sr.AudioFile(wav) as source:
            audio = r.record(source)

    # Only pass ``language`` when the caller asked for one, so the default
    # call behaves exactly as before.
    if src is None:
        output = r.recognize_google(audio)
    else:
        output = r.recognize_google(audio, language=src)

    print(f"Are you trying to say: \"{output}\"?")
    return output

In [51]:
# Machine Translation
def machineTranslation(output, src = "en", trg = "ja"):
    """Translate ``output`` from ``src`` to ``trg`` with a Marian MT model.

    The ``Helsinki-NLP/opus-tatoeba-{src}-{trg}`` checkpoint is tried first;
    if its tokenizer cannot be loaded, ``Helsinki-NLP/opus-mt-{src}-{trg}``
    is used instead. The model is loaded from whichever name succeeded.

    Args:
        output: Source-language text to translate (a single string).
        src: Source language code used to build the model name.
        trg: Target language code used to build the model name.

    Returns:
        The decoded translation of ``output`` with special tokens stripped.

    Raises:
        Whatever ``from_pretrained`` raises if the fallback model cannot be
        loaded either.
    """
    model_name = f"Helsinki-NLP/opus-tatoeba-{src}-{trg}"
    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
    except Exception:
        # Catch ordinary load failures only: a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit and start a second download.
        model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
        tokenizer = MarianTokenizer.from_pretrained(model_name)

    model = MarianMTModel.from_pretrained(model_name)

    batch = tokenizer([output], return_tensors="pt")
    generated_ids = model.generate(**batch)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [52]:
# Create Synth

# Files needed to build the synthesizer; all paths are relative to the
# notebook's working directory.
model_path = 'checkpoint_20000.pth'
config_path = 'config.json'
speakers_file_path = 'speakers.json'
language_ids_file_path = 'language_ids.json'
# No separate vocoder is configured.
vocoder_path = None
vocoder_config_path = None
encoder_path = 'model_se.pth.tar'
encoder_config_path = 'config_se.json'
use_cuda = False

# Pass everything by keyword: with nine positional arguments a change in the
# constructor's parameter order would silently bind values to the wrong
# parameter (e.g. ``use_cuda`` ending up in a checkpoint slot).
synth = Synthesizer(
        tts_checkpoint=model_path,
        tts_config_path=config_path,
        tts_speakers_file=speakers_file_path,
        tts_languages_file=language_ids_file_path,
        vocoder_checkpoint=vocoder_path,
        vocoder_config=vocoder_config_path,
        encoder_checkpoint=encoder_path,
        encoder_config=encoder_config_path,
        use_cuda=use_cuda,
    )

 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_siz

In [53]:
# Transliterator 

import pykakasi
kks = pykakasi.kakasi()

def transliterate(kks,item):
    """Return the Hepburn romanisation of ``item`` as space-separated chunks.

    Args:
        kks: Converter object whose ``convert(text)`` yields mappings that
            contain a ``'hepburn'`` entry (e.g. a ``pykakasi.kakasi()``).
        item: Text to convert.

    Returns:
        The ``'hepburn'`` values of every converted chunk joined by spaces.
    """
    romanised = []
    for chunk in kks.convert(item):
        romanised.append(chunk['hepburn'])
    return " ".join(romanised)



In [54]:
def speechtranslate(wav = '../TomSawyerTest.wav',synth = synth,src = 'en',trg = 'ja', text = "Hello guy"):
    """Translate text (or recognized speech) and synthesize it in ``trg``.

    Args:
        wav: Path passed to ``synth.tts`` as ``speaker_wav``; also handed to
            ``speechRecognition`` when ``text`` is ``None``.
        synth: Synthesizer used to produce the audio. The default is the
            module-level ``synth`` as it existed when this cell was run.
        src: Source language code.
        trg: Target language code; also passed to ``synth.tts`` as
            ``language_name``.
        text: Source text to translate. Pass ``None`` to obtain it from
            ``speechRecognition(wav, src)`` instead.

    Returns:
        Whatever ``synth.tts`` returns for the translated text.
    """
    source_text = speechRecognition(wav, src) if text is None else text

    translated = machineTranslation(source_text, src, trg)
    print(translated)

    # Japanese output is converted to Hepburn romanisation before synthesis.
    if trg == 'ja':
        spoken = transliterate(pykakasi.kakasi(), translated)
    else:
        spoken = translated

    return synth.tts(text=spoken, speaker_wav=wav, language_name=trg)
    
    
    
    

In [57]:
# Demo run: translate a fixed English sentence to Japanese and synthesize it,
# passing '../TomSawyerTest.wav' as the speaker reference. Because `text` is
# given, the speech-recognition step is skipped.
audio = speechtranslate(wav = '../TomSawyerTest.wav',src = 'en',trg = 'ja', text = "This is an amazing walk, what!")

これは素晴らしい散歩だ!
 > Text splitted to sentences.
['koreha subarashii sanpo da!']




 > Processing time: 2.5640065670013428
 > Real-time factor: 0.9101904746188649


In [58]:
# Play the synthesized waveform. 16000 matches the `sample_rate:16000` that
# the audio processor reported when the synthesizer was created.
Audio(audio,rate = 16000)


In [37]:
# NOTE(review): `translation` is not assigned in any cell visible in this
# notebook, and this cell's execution count (37) predates the cells above, so
# it presumably relies on leftover kernel state — confirm or remove.
translation

'ハイ、元気?'