In [2]:
# One-time setup: uncomment to install the Google Cloud client libraries used below.
# `speech_v1` ships in google-cloud-speech; `texttospeech_v1` ships in google-cloud-texttospeech.
# %pip install google-cloud-speech google-cloud-texttospeech

In [3]:
import os
import io

from IPython.display import Audio
from google.cloud import speech_v1
from google.cloud import texttospeech_v1

In [4]:
# Export the GCP project id so that shell cells (e.g. %%bash) can read it as $PROJECT_ID.
os.environ.update(PROJECT_ID="text-analysis-323506")

### Create a new Google Cloud service account to access the APIs programmatically.

__Don't run the following cell if you have already set up a cloud service account and have its associated key__.

The following command creates a Google Cloud service account, which is required to access the APIs. This command should be run only once. When run for the first time, it creates a service account, details of which can be found in the IAM & Admin section of the GCP console.

After that, it creates a Google Cloud service account key for the newly created account. The key is stored in the /home/jupyter directory. Download this key, and upload it again the next time this notebook is run. __The following cell should not be run more than once, unless we want to create a new cloud service account__.

In [4]:
%%bash

# One-time setup: create the `my-api-sa` service account and download a JSON key
# for it to ~/key.json. No IAM roles are granted by these commands.
# Run this only once; see the surrounding notes before re-running.

# Stop at the first failing command so that a failed `create` does not go on
# to mint (and overwrite ~/key.json with) another key.
set -euo pipefail

# PROJECT_ID is exported from the notebook via os.environ; without it the
# service-account e-mail below would be malformed, so fail early with a clear message.
: "${PROJECT_ID:?PROJECT_ID must be set before running this cell}"

gcloud iam service-accounts create my-api-sa --display-name "api account"
gcloud iam service-accounts keys create ~/key.json \
    --iam-account "my-api-sa@${PROJECT_ID}.iam.gserviceaccount.com"

created key [a0d2ba2a2b6e608290c9ce00a3a0e93f3fcf2f2e] of type [json] as [/home/jupyter/key.json] for [my-api-sa@text-analysis-323506.iam.gserviceaccount.com]


The following command stores the path to the JSON key file in an environment variable. This is required to access the API endpoints from Python.

In [209]:
# Point the Google client libraries at the service-account key file.
# The key is created as ~/key.json, so resolve it from the current user's home
# directory instead of hard-coding /home/jupyter (abspath on an already
# absolute path was a no-op).
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.expanduser("~/key.json")

#### Create a Google Cloud speech api client object

In [210]:
# Client object used for the Speech-to-Text requests further down.
# NOTE(review): presumably authenticates via GOOGLE_APPLICATION_CREDENTIALS set earlier — confirm.
speech_client = speech_v1.SpeechClient()

### Important: 
#### Enable Cloud Speech-to-Text API and Cloud Text-to-Speech API for this project in Google Cloud Platform console.

## Transcribe speech

Here I'm using an audio file I recorded. I converted it from m4a to FLAC because FLAC is a lossless format.

In [5]:
# Location of the FLAC recording to transcribe, relative to the notebook's working directory.
speech_file = "./data/recording.flac"

In [6]:
# Render an inline audio player for the recording so it can be checked by ear.
Audio(speech_file)

In [108]:
# Load the whole recording into memory as raw bytes for the API request.
with open(speech_file, mode="rb") as flac_handle:
    content = flac_handle.read()

In [109]:
# Wrap the raw audio bytes in the request message expected by `recognize`.
# The module is imported as `speech_v1`; the bare name `speech` is never bound
# in this notebook and raised NameError on a fresh kernel.
audio = speech_v1.RecognitionAudio(content=content)

In [110]:
# Recognition settings passed to the API: language code, FLAC encoding and a
# two-channel audio layout.
speech_config = dict(
    language_code="en-In",
    encoding=speech_v1.RecognitionConfig.AudioEncoding.FLAC,
    audio_channel_count=2,
)

In [111]:
# Send the audio together with its recognition settings to the Speech-to-Text API.
response = speech_client.recognize(config=speech_config, audio=audio)

#### Transcribed speech

In [127]:
# The response holds one result per consecutive portion of the audio, so join
# the top alternative of every result. Indexing results[0] alone raised
# IndexError when nothing was recognised and dropped everything after the
# first portion of longer recordings.
" ".join(
    result.alternatives[0].transcript.strip()
    for result in response.results
    if result.alternatives
)

'hi I am Subramaniam Joshi I am a data scientist at Hewlett Packard enterprise I love machine learning and artificial intelligence'

The API successfully converted the speech to text. It did not add punctuation, but the transcription is accurate.

#### Complete Response

In [129]:
# Display the complete recognition response (transcript, confidence and billed time).
response

results {
  alternatives {
    transcript: "hi I am Subramaniam Joshi I am a data scientist at Hewlett Packard enterprise I love machine learning and artificial intelligence"
    confidence: 0.8581835627555847
  }
}
total_billed_time {
  seconds: 15
}

## Text-to-Speech Conversion 

In [202]:
# Client object used for the Text-to-Speech requests below.
tts_client = texttospeech_v1.TextToSpeechClient()

In [203]:
# Sentence to synthesize; the adjacent literals are concatenated into one string.
text = (
    "hi I'm Subrahmanya Joshi. "
    "I'm a data scientist at Hewlett Packard Enterprise. "
    "I love machine learning and artificial intelligence."
)

In [204]:
# Wrap the plain-text string in the input message used by the synthesis request.
input_text = texttospeech_v1.SynthesisInput(text=text)

#### The following cell lists all available voice configurations

In [None]:
# Ask the Text-to-Speech API for its catalogue of voices; the listing can be long.
tts_client.list_voices()

In [194]:
# Select the voice for synthesis: the "en-IN-Wavenet-D" voice under the en-In language code.
voice = texttospeech_v1.VoiceSelectionParams(
    name="en-IN-Wavenet-D",
    language_code="en-In",
)

In [195]:
# Response audio configuration: request MP3-encoded output.
audio_config = texttospeech_v1.AudioConfig(
    audio_encoding=texttospeech_v1.AudioEncoding.MP3,
)

In [196]:
# Synthesize the text with the chosen voice and audio settings, passing the
# request fields as keyword arguments rather than a request dict.
response = tts_client.synthesize_speech(
    input=input_text,
    voice=voice,
    audio_config=audio_config,
)

#### Playing response

In [197]:
# Play the synthesized audio bytes returned by the Text-to-Speech call inline.
Audio(response.audio_content)