# Using the open-source SpeechRecognition library (Google Web Speech API)

In [37]:
import speech_recognition as sr

# Show the installed speech_recognition version (the last expression of the cell is displayed).
sr.__version__

'3.8.1'

In [38]:
# Recognizer instance shared by the recording and transcription cells that follow.
rec = sr.Recognizer()

In [43]:
# Open the FLAC file as an audio source and capture it into `audio`,
# which the transcription cell passes to rec.recognize_google().
aud_data = sr.AudioFile('cv2.flac')
with aud_data as source:
    # Background-noise calibration on a short 0.2 s window; the window is kept
    # small so that the first spoken word is not cut out of the recording.
    # NOTE(review): calibration presumably reads from the same stream that
    # record() continues from, so the order of these two calls matters - confirm.
    rec.adjust_for_ambient_noise(source,duration = 0.2)
    audio = rec.record(source)
# Displayed as the cell output to confirm what kind of object was captured.
type(audio)

speech_recognition.AudioData

In [44]:
# Transcribe the captured audio with the recognizer's Google backend and print
# the text. Only the library's own failure modes are handled; anything else
# (KeyboardInterrupt, a NameError from a skipped cell, ...) is allowed to
# surface instead of being hidden behind a generic message.
try:
    print('Text output: \n'+rec.recognize_google(audio))
except sr.UnknownValueError:
    # The service could not produce a transcript for this audio.
    print('Audio file cannot be transcripted')
except sr.RequestError as err:
    # The request itself failed (e.g. no connectivity or a service-side error).
    print('Audio file cannot be transcripted')
    print('Speech service request failed: ' + str(err))
# this neural net has a confidence of around 70% (figure quoted by the notebook author)

Text output: 
hi my name is Abhijit Kulkarni postgraduate International Hospitality Management from Auckland University of Technology New Zealand in this industry is my passion for food and beverage department education is supported by experience 6566 of industrial training in five star properties like Taj residency and sons and both in India currently I am working with Rendezvous hospitality group New Zealand 5th may 2009 and focused and dedicated towards any job given to me also a very fast learner personal speaking I like to work under pressure because I think it is false confidence with me for example the club Lounge I am handling at the moment is very busy for both morning breakfast as well as evening drink but I'm unable to handle this very effectively I tell that is Because lot of us have given positive feedback about my service to the management also whenever a restaurant get busy mysterious tense me to give them a heart I take this as a reward my student maximum experience and

# Using the Google Cloud Speech-to-Text API

In [1]:
# Imports the Google Cloud client library

import io
import os
from google.cloud import speech_v1p1beta1
from google.cloud import speech_v1
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from gcloud import storage
from oauth2client.service_account import ServiceAccountCredentials
import audioread

In [57]:
# configuration setup
#
# Notes on the expected input:
#   - input formats are FLAC (lossless) or WAV only
#   - a sample rate above 16000 is preferred
#   - for multiple speakers, each person's voice should be in a different channel
#   - for a single speaker, a mono audio channel is preferred (not a compulsion)

# --- where the audio lives ---
file_name = "cv2.flac"
path_to_directory = "/home/subbu/PRML/Internship/p1/"
path_to_bucket = "gs://rankingtranscript1/"

# --- recognition options ---
# english (giving a regional variant such as en-US, en-IN or en-GB increases accuracy)
primary_language_code = "en"

# True -> recognise every audio channel separately (one channel per speaker)
separation = False


In [58]:
# feeding in the configurations
#
# Builds everything the processing cell needs:
#   client  - Speech API client
#   audio   - either {"uri": ...} (long file, uploaded to the bucket first)
#             or {"content": ...} (short file, raw bytes sent inline)
#   config  - recognition settings derived from the file's own header
#   lengthy - True when the long-running API has to be used
#   look    - True while an uploaded blob is waiting to be deleted

client = speech_v1.SpeechClient() # use speech_v1p1beta1.SpeechClient() to detect more languages automatically

# Local path of the audio file, built once and reused for probing, upload and reading.
file_path = os.path.join(path_to_directory, file_name)

with audioread.audio_open(file_path) as ft:
    # detecting audio format for better accuracy
    print(ft.channels, ft.samplerate, ft.duration)
    num_channels = ft.channels
    sample_bit_rate = ft.samplerate  # fed to "sample_rate_hertz" below
    duration = ft.duration
# The with-statement closes the file, so no explicit close() is needed.

# Files longer than 58 s are routed through cloud storage and the long-running API.
# NOTE(review): 58 looks like a safety margin under the synchronous request
# limit - confirm against the current API quota.
lengthy = duration > 58

# Always define the flag so the clean-up cell can test it even when nothing
# was uploaded (short files never reach the upload branch).
look = False

if lengthy:
    # uploading file into bucket(associated with service account) with same file name
    # NOTE(review): the key-file path and bucket name are hard-coded to one
    # machine/project; consider moving them to the configuration cell.
    print('starting upload')
    storage_client = storage.Client.from_service_account_json(
        '/home/subbu/PRML/Internship/p1/rankingtranscript-96316cee08ba.json')
    bucket = storage_client.get_bucket('rankingtranscript1')
    blob = bucket.blob(file_name)
    blob.upload_from_filename(file_path)
    look = True
    print('uploaded')
    # using api on uploaded file
    storage_uri = path_to_bucket+file_name
    audio = {"uri": storage_uri}
else:
    # short files dont require cloud storage
    with io.open(file_path, "rb") as f:
        content = f.read()
    audio = {"content": content}


config = {
        "sample_rate_hertz": sample_bit_rate,
        "audio_channel_count": num_channels,
        "enable_separate_recognition_per_channel": separation,
        "language_code": primary_language_code,
        "enable_automatic_punctuation": True,
        "use_enhanced": True,
    }

2 44100 101.0
starting upload
uploaded


In [59]:
# processing (takes time)
#
# Short clips are sent through the direct recognize() call. Long clips go
# through long_running_recognize(), and the response is taken from the
# returned operation's result().

if not lengthy:
    response = client.recognize(config, audio)
else:
    operation = client.long_running_recognize(config, audio)
    response = operation.result()

In [60]:
# printing output
#
# Prints the top alternative of every result (prefixed with the channel tag
# when per-channel recognition is on) and then the mean confidence of those
# alternatives as a whole-number percentage.

confidences = []
for result in response.results:
    if not result.alternatives:
        # Nothing recognised for this segment; skip it instead of failing on [0].
        continue
    alternative = result.alternatives[0]
    confidences.append(alternative.confidence)
    if separation:
        print(str(result.channel_tag)+" : "+alternative.transcript)
    else:
        print(alternative.transcript)

if confidences:
    print("confidence = "+str(int((sum(confidences)/len(confidences))*100)))
else:
    # An empty response used to end in a ZeroDivisionError here.
    print("no transcription results returned")

My name is Abhishek Kulkarni postgraduate International Hospitality Management from Auckland University of Technology New Zealand industry is my passion for food and education is supported by my experience in statistics of industrial training in 5 star properties like Taj residency and son and front both in India currently working with from the letter group New Zealand 2009 and focused and dedicated towards any job given to me also a very fast learner speaking. I like to work under pressure because I think it is false confidence with me. I am handling at the moment is very busy for both morning breakfast as well as evening drink, but I'm unable to handle this very effectively. I cancel this because lot of us have given positive feedback also whenever Bar and Restaurant get busy mysterious tense me to give the my heart. I take this as a reward student maximum experience and knowledge with fnb department.
Take me to the responsible position within the same department and I am sure that m

In [61]:
# Deleting uploaded file
#
# Remove the uploaded blob as soon as the response has been obtained.
# (Author's note: the blob is expected to be short-lived after creation, so
# do not postpone this step.)

if look:
    # Runs only once per upload: the flag is cleared after the blob is removed.
    blob.delete()
    print('file deleted')
    look = False

file deleted


### Addons

In [None]:
# code to add speaker diarization instead of channels
#
# speaker diarization is in beta stage so we dont use this for our project presently
# the audio file input should be mono audio channel

# add this to feeding cell
num_speakers = 2 # 3 at max
separation = True

client = speech_v1p1beta1.SpeechClient() # beta version

config = dict(
    sample_rate_hertz=sample_bit_rate,
    audio_channel_count=num_channels,
    enable_speaker_diarization=separation,
    diarization_speaker_count=num_speakers,
    language_code=primary_language_code,
)

# add this to printing cell
if separation:
    # Every word carries its own speaker tag, so a new output line is started
    # whenever the tag differs from the previous word's tag.
    tag = None  # no speaker seen yet
    print()
    for word in alternative.words:
        if word.speaker_tag != tag:
            if tag is not None:
                # close the previous speaker's line
                print()
            # Switch to the new speaker BEFORE printing the label; previously
            # the old tag was printed in front of the new speaker's words.
            tag = word.speaker_tag
            print(str(tag)+" :",end = " ")
        print(word.word,end=" ")

In [None]:
# code to add auto language detection
#
# This feature is also in beta stage, hence we are not using it presently

# add this to feeding cell
client = speech_v1p1beta1.SpeechClient() # beta version

# we can add upto 3 languages to detect automatically
alternative_language_codes = ["es", "fr", "hi"]  # spanish, french, hindi

config = dict(
    sample_rate_hertz=sample_bit_rate,
    audio_channel_count=num_channels,
    enable_separate_recognition_per_channel=separation,
    language_code=primary_language_code,
    enable_automatic_punctuation=True,
    use_enhanced=True,
    alternative_language_codes=alternative_language_codes,
)

# add to printing cell to know the language
print(result.language_code)