# Jarvis API Example

This notebook shows how to use the Jarvis API for automatic speech recognition (ASR) and text-to-speech (TTS) only. Find other examples and more details at https://docs.nvidia.com/deeplearning/jarvis/user-guide/docs/

## Install Jarvis API

You can get the Jarvis API Python wheel at https://ngc.nvidia.com/catalog/collections/nvidia:jarvis

In [1]:
!pip install jarvis_api-1.1.0b0-py3-none-any.whl

/bin/bash: /usr/local/bin/pip: /usr/local/opt/python/bin/python3.7: bad interpreter: No such file or directory


In [5]:
import io
import librosa
import time
import numpy as np
import IPython.display as ipd
import grpc
import requests

# ASR proto
import jarvis_api.jarvis_asr_pb2 as jasr
import jarvis_api.jarvis_asr_pb2_grpc as jasr_srv

# TTS proto
import jarvis_api.jarvis_tts_pb2 as jtts
import jarvis_api.jarvis_tts_pb2_grpc as jtts_srv
import jarvis_api.audio_pb2 as ja

## Create Jarvis clients and connect to Jarvis Speech API server

In [6]:
# Address (host:port) of the Jarvis Speech API server to talk to.
server_uri = 'localhost:50051'

# One unencrypted gRPC channel is opened and shared by both service stubs.
channel = grpc.insecure_channel(server_uri)

jarvis_asr = jasr_srv.JarvisASRStub(channel)   # speech-recognition client
jarvis_tts = jtts_srv.JarvisTTSStub(channel)   # speech-synthesis client

## ASR

In [8]:
# Render an inline player for the clip that is sent to the ASR service below.
asr_clip_path = 'common_voice_th_clean/clips/common_voice_th_23646618_trim.wav'
ipd.Audio(asr_clip_path)

In [30]:
# Send one audio clip to the Jarvis ASR service with a single Recognize call,
# then print the elapsed time (seconds) and the top transcript.
asr_clip_path = 'common_voice_th_clean/clips/common_voice_th_23646618_trim.wav'

# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

with open(asr_clip_path, 'rb') as fh:
    content = fh.read()

req = jasr.RecognizeRequest()
req.audio = content                                   # raw bytes
req.config.encoding = ja.AudioEncoding.LINEAR_PCM     # Supports LINEAR_PCM, FLAC, MULAW and ALAW audio encodings
# NOTE(review): presumably this should match the clip's real sample rate — confirm the file is 22.05 kHz.
req.config.sample_rate_hertz = 22050                  # Audio will be resampled if necessary
req.config.language_code = "th-TH"                    # Ignored, will route to correct model in future release
req.config.max_alternatives = 1                       # How many top-N hypotheses to return
req.config.enable_automatic_punctuation = False       # Automatic punctuation is switched off for this request
req.config.audio_channel_count = 1                    # Mono channel

response = jarvis_asr.Recognize(req)

# Guard against an empty response: indexing results[0].alternatives[0]
# directly raises IndexError when nothing was recognized.
if response.results and response.results[0].alternatives:
    asr_best_transcript = response.results[0].alternatives[0].transcript
else:
    asr_best_transcript = ""

print(time.perf_counter() - start_time)
print("ASR Transcript:", asr_best_transcript)

2.3895559310913086
ASR Transcript: พวก#เรา#อาจจะ#ต้องการ#ความ#ช่วยเหลือ#จาก#เธอ#ใน#ที่สุด 


## TTS

In [32]:
# Synthesize a short Thai sentence with Jarvis TTS, print the elapsed time
# (seconds), and render an inline player for the returned audio.
tts_sample_rate_hz = 22050   # used for both the request and the playback rate

# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()

req = jtts.SynthesizeSpeechRequest()
req.text = "ทดสอบ ระบบ ภาษาไทย".strip()
req.language_code = "th-TH"
# Request uncompressed PCM: the response bytes are reinterpreted below as raw
# float32 samples, which is only meaningful for uncompressed audio. Asking for
# FLAC here would be inconsistent with that decoding step.
req.encoding = ja.AudioEncoding.LINEAR_PCM     # Supports LINEAR_PCM, FLAC, MULAW and ALAW audio encodings
req.sample_rate_hz = tts_sample_rate_hz        # ignored, audio returned will be 22.05KHz
req.voice_name = "tsync2"

resp = jarvis_tts.Synthesize(req)
# View the returned byte buffer as float32 samples (no copy is made).
audio_samples = np.frombuffer(resp.audio, dtype=np.float32)

print(time.perf_counter() - start_time)

# Last expression of the cell: displays the audio player widget.
ipd.Audio(audio_samples, rate=tts_sample_rate_hz)

0.0882577896118164
