### Test Google ASR (Speech-to-Text), with a Text-to-Speech sample as input

In [6]:
import argparse
from google.cloud import speech


def transcribe_file_with_auto_punctuation(path: str) -> speech.RecognizeResponse:
    """Run synchronous Speech-to-Text on a local audio file and print each transcript.

    The request is configured for LINEAR16 audio, language code ``yue-HK``,
    with automatic punctuation switched on.

    Args:
        path: Location of the audio file; it is read fully into memory.

    Returns:
        The response object returned by ``SpeechClient.recognize``.
    """
    speech_client = speech.SpeechClient()

    with open(path, "rb") as wav_handle:
        audio_bytes = wav_handle.read()

    # sample_rate_hertz is deliberately left unset here.
    # NOTE(review): presumably the service takes the rate from the WAV header —
    # confirm before feeding headerless audio.
    recognition_config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code="yue-HK",
        enable_automatic_punctuation=True,
    )
    recognition_audio = speech.RecognitionAudio(content=audio_bytes)

    response = speech_client.recognize(
        config=recognition_config, audio=recognition_audio
    )

    # Only the first alternative of every result is reported.
    divider = "-" * 20
    for index, result in enumerate(response.results):
        top_alternative = result.alternatives[0]
        print(divider)
        print(f"First alternative of result {index}")
        print(f"Transcript: {top_alternative.transcript}")

    return response

In [7]:
# NOTE: "test.wav" is the file written by the synthesize_text_with_audio_profile
# call at the end of this notebook; unless the file already exists, that call
# has to be executed before this cell.
response = transcribe_file_with_auto_punctuation("test.wav")

--------------------
First alternative of result 0
Transcript: 你好你今日過得點呀


In [8]:
# Display the raw recognition results (transcript, confidence, end time and
# detected language code) as this cell's output.
response.results

[alternatives {
  transcript: "你好你今日過得點呀"
  confidence: 0.933110237
}
result_end_time {
  seconds: 2
  nanos: 520000000
}
language_code: "yue-hant-hk"
]

In [9]:
def list_voices():
    """Print a summary of every available Text-to-Speech voice.

    For each voice the name, supported language codes, SSML gender name and
    natural sample rate are printed, followed by a blank line.

    Returns:
        The response object returned by ``TextToSpeechClient.list_voices``.
    """
    # Imported inside the function so this cell is self-contained.
    from google.cloud import texttospeech

    listing = texttospeech.TextToSpeechClient().list_voices()

    for entry in listing.voices:
        # Voice name, e.g. tpc-vocoded
        print(f"Name: {entry.name}")

        # One line per supported language code, e.g. "en-US"
        for code in entry.language_codes:
            print(f"Supported language: {code}")

        # Convert the raw gender value into the enum to get its readable name.
        gender_name = texttospeech.SsmlVoiceGender(entry.ssml_gender).name
        print(f"SSML Voice Gender: {gender_name}")

        # Natural sample rate, e.g. 24000; the trailing newline separates voices.
        print(f"Natural Sample Rate Hertz: {entry.natural_sample_rate_hertz}\n")

    return listing

In [10]:
from google.cloud import texttospeech


def synthesize_text_with_audio_profile(text, output, effects_profile_id: str = ""):
    """Synthesize speech for ``text`` and write the audio bytes to ``output``.

    The request uses the ``yue-HK-Standard-A`` voice (language ``yue-HK``,
    female) and LINEAR16 audio encoding.

    Args:
        text: Plain text to synthesize.
        output: Path of the file to write. It is opened in binary write mode,
            so an existing file is overwritten.
        effects_profile_id: Optional audio effects profile id, e.g.
            ``"telephony-class-application"``. When empty (the default) no
            effects profile is requested.

    Returns:
        ``output``, unchanged, after the file has been written.
    """
    client = texttospeech.TextToSpeechClient()

    input_text = texttospeech.SynthesisInput(text=text)

    # Note: the voice can also be specified by name.
    # Names of voices can be retrieved with client.list_voices().
    voice = texttospeech.VoiceSelectionParams(
        language_code="yue-HK",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
        name="yue-HK-Standard-A",
    )

    # Only forward a profile when one was actually supplied. Wrapping the
    # default "" in a list would send [""], i.e. a request for a profile with
    # an empty id, rather than "no profile".
    effects_profiles = [effects_profile_id] if effects_profile_id else []

    # Note: multiple effects profile ids may be passed to the API; they are
    # applied in the same order they are provided.
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.LINEAR16,
        effects_profile_id=effects_profiles,
    )

    response = client.synthesize_speech(
        input=input_text, voice=voice, audio_config=audio_config
    )

    # The response's audio_content is binary.
    with open(output, "wb") as out:
        out.write(response.audio_content)
        print('Audio content written to file "%s"' % output)
    return output

In [11]:
# Generate the sample audio file, using the telephony effects profile.
synthesize_text_with_audio_profile(
    text="你好，你今日过的点啊",
    output="test.wav",
    effects_profile_id="telephony-class-application",
)

Audio content written to file "test.wav"


'test.wav'