In [1]:
# Note: to use this code, please follow the instructions in
# https://tts.readthedocs.io/en/latest/installation.html to install the TTS package.

import os
import sys
# pylint: disable=redefined-outer-name, unused-argument
from pathlib import Path
import TTS
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer
import tkinter as tk
from tkinter import filedialog
from os import path
import pyaudio
import sys
import speech_recognition as sr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM # Machine Translation
import librosa
from IPython.display import Audio
from transformers import MarianTokenizer, MarianMTModel

 

In [2]:
def speechRecognition(AUDIO_FILE = 'untitled.wav',src = 'en',event = None):
    """Transcribe an audio file to text with the Google Web Speech API.

    Args:
        AUDIO_FILE: Path to the audio file to transcribe.
        src: Language tag of the spoken audio (e.g. 'en', 'ja'); forwarded to
            the recognizer so that non-English recordings are decoded in the
            right language.
        event: Unused; kept so existing callers that pass it keep working.

    Returns:
        The recognized transcript as a string.

    Raises:
        Whatever ``recognize_google`` raises when the audio is unintelligible
        or the service cannot be reached.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(AUDIO_FILE) as source:
        # record() reads the file to its end; listen() is meant for live input
        # and stops at the first detected pause, truncating longer recordings.
        audio = recognizer.record(source)
    # The audio data is fully in memory, so the file can be closed before the
    # (network-bound) recognition request is made.
    return recognizer.recognize_google(audio, language=src)

In [3]:
# Machine Translation
# Already-loaded (tokenizer, model) pairs keyed by (src, trg), so repeated
# translations for the same language pair do not reload the weights.
_MARIAN_CACHE = {}


def _load_marian(src, trg):
    """Return the (tokenizer, model) pair for src->trg, loading it on first use."""
    key = (src, trg)
    if key not in _MARIAN_CACHE:
        try:
            # Prefer the Tatoeba-trained checkpoint for this language pair.
            model_name = f"Helsinki-NLP/opus-tatoeba-{src}-{trg}"
            tokenizer = MarianTokenizer.from_pretrained(model_name)
        except Exception:
            # Fall back to the opus-mt checkpoint. Catching Exception (not a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
            tokenizer = MarianTokenizer.from_pretrained(model_name)
        _MARIAN_CACHE[key] = (tokenizer, MarianMTModel.from_pretrained(model_name))
    return _MARIAN_CACHE[key]


def machineTranslation(output, src = "en", trg = "ja"):
    """Translate a single string from ``src`` to ``trg`` with a MarianMT model.

    Args:
        output: The text to translate (typically the speech-recognition result).
        src: Source language code used to build the checkpoint name.
        trg: Target language code used to build the checkpoint name.

    Returns:
        The translated text as a string.

    Raises:
        Whatever ``from_pretrained`` raises if neither the ``opus-tatoeba`` nor
        the ``opus-mt`` checkpoint for the language pair can be loaded.
    """
    tokenizer, model = _load_marian(src, trg)

    batch = tokenizer([output], return_tensors="pt")
    generated_ids = model.generate(**batch)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

In [4]:
# Create Synth

# Selects which set of checkpoint files is loaded below:
# 1 -> the files under GermanModel/, anything else -> the files in the
# current working directory.
model = 1

if model == 1:
    model_path = 'GermanModel/best_model_96618.pth'
    config_path = 'GermanModel/config.json'
    speakers_file_path = 'GermanModel/speakers.json'
    language_ids_file_path = 'GermanModel/language_ids.json'
else:
    model_path = 'checkpoint_20000.pth'
    config_path = 'config.json'
    speakers_file_path = 'speakers.json'
    language_ids_file_path = 'language_ids.json'


# No separate vocoder is configured.
vocoder_path = None
vocoder_config_path = None
# Speaker-encoder checkpoint and config passed to the synthesizer.
encoder_path = 'model_se.pth.tar'
encoder_config_path = 'config_se.json'
use_cuda = False

# Keyword arguments are used on purpose: binding nine values by position is
# fragile if the constructor's parameter list differs between TTS releases,
# whereas names either match or fail loudly.
synth = Synthesizer(
        tts_checkpoint=model_path,
        tts_config_path=config_path,
        tts_speakers_file=speakers_file_path,
        tts_languages_file=language_ids_file_path,
        vocoder_checkpoint=vocoder_path,
        vocoder_config=vocoder_config_path,
        encoder_checkpoint=encoder_path,
        encoder_config=encoder_config_path,
        use_cuda=use_cuda,
    )

 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_siz

In [5]:
# Transliterator 

import pykakasi
# Module-level kakasi converter instance. Note that speechtranslate builds its
# own instance instead of reusing this one.
kks = pykakasi.kakasi()

def transliterate(kks,item):
    """Romanize ``item`` and return the pieces joined by single spaces.

    Args:
        kks: Converter object whose ``convert(text)`` returns an iterable of
            mappings that each contain a ``'hepburn'`` entry.
        item: The text to convert.

    Returns:
        The ``'hepburn'`` value of every converted piece, space-separated.
    """
    romanized = []
    for token in kks.convert(item):
        romanized.append(token['hepburn'])
    return " ".join(romanized)



In [6]:
def speechtranslate(wav = 'untitled.wav',synth = synth,src = 'en',trg = 'ja', text = None,refwav = None):
    """Run the speech-to-speech pipeline: recognize, translate, synthesize.

    Args:
        wav: Path of the input recording. It is transcribed when ``text`` is
            not given, and passed as ``speaker_wav`` when ``refwav`` is not given.
        synth: Synthesizer whose ``tts`` method produces the output audio.
        src: Source language code.
        trg: Target language code.
        text: Optional source text; when provided, speech recognition is skipped.
        refwav: Optional recording passed as ``speaker_wav`` instead of ``wav``.

    Returns:
        Whatever ``synth.tts`` returns for the final text.
    """
    # 1. Source text: use the caller's text, otherwise transcribe the recording.
    if text is None:
        text = speechRecognition(wav,src)
    print(text)

    # 2. Translate only when the languages actually differ.
    needs_translation = src != trg
    if needs_translation:
        text = machineTranslation(text,src,trg)
    print(text)

    # 3. Japanese output is romanized before synthesis.
    if trg == 'ja':
        text = transliterate(pykakasi.kakasi(),text)

    # 4. Map the language code to the name handed to the synthesizer
    #    (presumably the ids in the model's language file; verify).
    if trg == 'fr':
        language_name = 'fr-fr'
    elif trg == 'de':
        language_name = 'de_de'
    else:
        language_name = trg

    # 5. Synthesize, using the input recording unless a reference was supplied.
    reference = wav if refwav is None else refwav
    return synth.tts(text=text,speaker_wav=reference,language_name=language_name)
    
    
    

In [8]:
# Run the whole pipeline on 'Gasby.wav', English in and Japanese out.
# text=None means the transcript comes from speechRecognition; refwav=None
# means the input recording itself is passed to the synthesizer as speaker_wav.
audio = speechtranslate(wav = 'Gasby.wav',src = 'en',trg = 'ja', text = None ,refwav = None)

 So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her. 
それで,彼女は自分の心で(また出来るほどに)考へた。 暑い日には、とても眠くて愚かな感じがした。 デージー・チャインを作る楽しみは、起き上ってダサイを拾うことに苦労する価値があるかどうか、と。 突然、ピンクの眼を持つ白のウサギが彼女の近くに走って来た。
 > Text splitted to sentences.
['sorede, kanojo ha jibun no kokoro de ( mata dekiru hodoni) kangahe ta.', 'atsui nichi niha, totemo nemuku te oroka na kanji gashita.', 'deejii・chain wo tsukuru tanoshimi ha, okiatsu te dasai wo hirou kotoni kurou suru kachi gaarukadouka, to.', 'totsuzen, pinku no me wo motsu shiro no usagi ga kanojo no chikaku ni hashitsu te kita.']
['<BLNK>', 'd', '<BLNK>', 'e', '<BLNK>', 'e', '<BLNK>', 'j', '<BLNK>', 'i', '<BLNK>', 'i', '<BLNK>', '・', '<BLNK>', 'c', '<BLNK>', 'h', '<BLNK>', 'a', '<BLNK>', 'i', '<BLNK>', 'n', '<BLNK>', ' ', '<BLNK>', 'w', '<

In [73]:
# Write the synthesized audio to a WAV file.
# NOTE(review): the file name 'frtest.wav' suggests French, but the preceding
# call targets 'ja'; the output shown beneath this cell also appears to come
# from a different run. Confirm the intended name and re-run from a fresh kernel.
synth.save_wav(audio,'frtest.wav')

 > Text splitted to sentences.
['soreha nazo no kotae da']




 > Processing time: 2.9315192699432373
 > Real-time factor: 1.0773683461753905
