# Using Google Speech To Text

In [5]:
import io
import os

# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

# os.system("export GOOGLE_APPLICATION_CREDENTIALS=./innate-solution-180816-a14c92025269.json")

%env GOOGLE_APPLICATION_CREDENTIALS=./innate-solution-180816-a14c92025269.json

# Build the Speech client used for the request below
client = speech.SpeechClient()

# Local audio file to transcribe (path is relative to the notebook's working directory)
file_name = "test1.wav"

# Read the whole file into memory and wrap the raw bytes as the request's audio payload
with io.open(file_name, 'rb') as audio_file:
    content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

# Recognition settings: LINEAR16 encoding, US English.
# The sample-rate line is left commented out, so no sample rate is passed here.
config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
#     sample_rate_hertz=8000,
    language_code='en-US')

# Run recognition on the in-memory audio
response = client.recognize(config, audio)

# Print only the first alternative of each result
for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))

env: GOOGLE_APPLICATION_CREDENTIALS=./innate-solution-180816-a14c92025269.json
Transcript: the Birch canoes lid on the smooth planks
Transcript:  glue the seat to the dark blue background
Transcript:  it is easy to tell the death of a well.
Transcript:  These days a chicken leg as a word dish.
Transcript:  Rice is often served in round bowls.
Transcript:  Did use of lemon snakes find punch.
Transcript:  The box was down beside the park truck.
Transcript:  the Hogs of the popcorn and garbage
Transcript:  4 hours of study work face to us
Transcript:  a large size in stockings is hard to sell.


# Uploading Larger Files

In [7]:
import io
import os

# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from scipy.io import wavfile

def createURI(bucket_name, file_path):
    """Return the ``gs://<bucket_name>/<file_path>`` URI for an object in a bucket."""
    uri_template = "gs://{bucket}/{path}"
    return uri_template.format(bucket=bucket_name, path=file_path)

def transcribe_gcs(gcs_uri, timeout=90):
    """Transcribe the audio file at ``gcs_uri`` with a long-running recognize request.

    Args:
        gcs_uri: URI of the audio object, passed as ``uri`` to ``RecognitionAudio``.
        timeout: Seconds to wait for the operation's result. Defaults to 90,
            the value that was previously hard-coded.

    Returns:
        A single string: the first alternative's transcript of every result,
        concatenated in order with no separator.

    Side effects:
        Prints a progress line, then the transcript and confidence of each result.
    """
    # `speech`, `enums` and `types` come from the imports at the top of this cell.
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        # sample_rate_hertz=8000,
        language_code='en-US')

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=timeout)

    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    transcripts = []
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print(u'Transcript: {}'.format(best.transcript))
        print('Confidence: {}'.format(best.confidence))
        transcripts.append(best.transcript)

    # join() builds the final string once instead of re-copying it per chunk.
    return "".join(transcripts)
        
def cut_data(wav_file):
    """Write a single-channel copy of ``wav_file`` and return the new file's name.

    The first channel of multi-channel audio is kept. Audio that is already
    single-channel (a 1-D sample array) is written unchanged; previously that
    case failed on the ``data[:, 0]`` index.

    Args:
        wav_file: Path of the WAV file to read. It is inserted verbatim into
            ``'mono_{}.wav'``, so ``'a.wav'`` produces ``'mono_a.wav.wav'``.

    Returns:
        The name of the file that was written.
    """
    fs, data = wavfile.read(wav_file)
    # A 1-D array has no channel axis to slice.
    first_channel = data if data.ndim == 1 else data[:, 0]
    mono_name = 'mono_{}.wav'.format(wav_file)
    wavfile.write(mono_name, fs, first_channel)
    return mono_name

def generate_analysis(bucket_name,filepath):
    """Transcribe the object ``filepath`` stored in ``bucket_name``.

    Builds the object's URI with ``createURI`` and returns whatever
    ``transcribe_gcs`` returns for it.
    """
    gcs_uri = createURI(bucket_name, filepath)
    return transcribe_gcs(gcs_uri)

In [8]:
# Keep the transcript in `output`: the cell that writes hello.txt reads this
# name, and it was previously never assigned (it only worked from stale kernel state).
output = generate_analysis('sumy','mono_test2_1.wav')
output

Waiting for operation to complete...
Transcript: sometimes math and physics conspire in ways that just feel too good to be true let's play a strange sort of mathematical croquet we're going to have to Sliding blocks and a wall the first block Stars by coming in at some velocity from the right while the second one starts out stationary being overly idealistic physicists let's assume that there's no friction and all of the collisions are perfectly elastic which means no energy is lost, you might complain that such collisions would make no sound but your goal here is going to be to count how many collisions take place so it's like conflict with that assumption I want to leave a little quack sound to better draw your attention to that count the simplest case is when both blocks have the same mass the first block Hits II transferring all of its momentum than the second one down says off the wall and then transfers all of its momentum back to the first which then sales up towards Infinity 3 

["sometimes math and physics conspire in ways that just feel too good to be true let's play a strange sort of mathematical croquet we're going to have to Sliding blocks and a wall the first block Stars by coming in at some velocity from the right while the second one starts out stationary being overly idealistic physicists let's assume that there's no friction and all of the collisions are perfectly elastic which means no energy is lost, you might complain that such collisions would make no sound but your goal here is going to be to count how many collisions take place so it's like conflict with that assumption I want to leave a little quack sound to better draw your attention to that count the simplest case is when both blocks have the same mass the first block Hits II transferring all of its momentum than the second one down says off the wall and then transfers all of its momentum back to the first which then sales up towards Infinity 3 total plaques",
 "if you all the relevant physi

## Checking the contents of the file

In [None]:
# Save the transcript text to hello.txt, the file the summarization cell reads back.
# Expects `output` to already hold the transcript string (e.g. the value
# returned by generate_analysis) in the current kernel.
with open("hello.txt","w+") as f:
    f.write(output)

In [None]:
! cat hello.txt

# Cloud Storage

In [109]:
from google.cloud import storage

class Cloud():
    """Small helper for uploading to and downloading from Cloud Storage buckets.

    Attributes:
        APIKEY: Path of the service-account JSON file used to build the client.
        projectID: Project identifier (stored only; not read by the methods here).
        dataset: Dataset name (stored only; not read by the methods here).
    """

    # Defaults keep the values that used to be hard-coded in __init__, so
    # `Cloud()` and `Cloud(dataset)` behave exactly as before.
    DEFAULT_KEY_PATH = "innate-solution-180816-a14c92025269.json"
    DEFAULT_PROJECT_ID = "innate-solution-180816"

    def __init__(self, dataset = "simulations", key_path=None, project_id=None):
        """Store the connection settings.

        Args:
            dataset: Dataset name to remember on the instance.
            key_path: Service-account JSON path; ``None`` selects DEFAULT_KEY_PATH.
            project_id: Project identifier; ``None`` selects DEFAULT_PROJECT_ID.
        """
        self.APIKEY = self.DEFAULT_KEY_PATH if key_path is None else key_path
        self.projectID = self.DEFAULT_PROJECT_ID if project_id is None else project_id
        self.dataset = dataset

    def _get_bucket(self, bucket_name):
        """Build a client from the service-account file and fetch ``bucket_name``."""
        storage_client = storage.Client.from_service_account_json(self.APIKEY)
        return storage_client.get_bucket(bucket_name)

    def upload_blob(self, bucket_name, path, destination_name):
        """Uploads the local file ``path`` to the bucket as ``destination_name``."""
        blob = self._get_bucket(bucket_name).blob(destination_name)
        blob.upload_from_filename(path)

        print('File {} uploaded to {}.'.format(
            path,
            destination_name))

    def download_blob(self, bucket_name, source_name, destination_name):
        """Downloads blob ``source_name`` from the bucket to the local file ``destination_name``."""
        blob = self._get_bucket(bucket_name).blob(source_name)
        blob.download_to_filename(destination_name)

        print('Blob {} downloaded to {}.'.format(
            source_name,
            destination_name))

sometimes math and physics conspire in ways that just feel too good to be true let's play a strange sort of mathematical croquet we're going to have to Sliding blocks and a wall the first block Stars by coming in at some velocity from the right while the second one starts out stationary being overly idealistic physicists let's assume that there's no friction and all of the collisions are perfectly elastic which means no energy is lost, you might complain that such collisions would make no sound but your goal here is going to be to count how many collisions take place so it's like conflict with that assumption I want to leave a little quack sound to better draw your attention to that count the simplest case is when both blocks have the same mass the first block Hits II transferring all of its momentum than the second one down says off the wall and then transfers all of its momentum back to the first which then sales up towards Infinity 3 total plaques.if you all the relevant physics in 

# Performing Summarization

In [2]:
import argparse

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
    

def parse_arguments(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before; passing a list lets the
            parser be used from a notebook, where ``sys.argv`` belongs to the kernel.

    Returns:
        The parsed namespace with ``filepath`` (str) and ``length`` (int).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('filepath', help='File name of text to summarize')
    # type=int: without it a user-supplied length arrives as a string, while
    # the default is the integer 4.
    parser.add_argument('-l', '--length', type=int, default=4, help='Number of sentences to return')
    return parser.parse_args(argv)

def read_file(path):
    """Read and return the text of the file at ``path``.

    Raises:
        IOError: If the file cannot be opened or read. A message is printed
            first, then the original exception is re-raised. Previously the
            error was swallowed and ``None`` was returned.
    """
    try:
        with open(path, 'r') as file:
            return file.read()

    except IOError:
        print("Fatal Error: File ({}) could not be located or is not readable.".format(path))
        raise

def sanitize_input(data):
    """
    Normalize whitespace control characters in ``data``.

    Form feeds, tabs and newlines each become a single space; carriage
    returns are dropped. Nothing else is changed, so broader sanitization
    and encoding handling are still open.
    """
    # Third argument of maketrans lists the characters to delete.
    whitespace_table = str.maketrans("\f\t\n", "   ", "\r")
    return data.translate(whitespace_table)

def tokenize_content(content):
    """
    Split ``content`` into sentences and into filtered lowercase words.

    Returns a two-item list: the sentence tokens of the original text, and
    the word tokens of the lowercased text with NLTK English stop words and
    the characters of ``string.punctuation`` removed.
    """
    ignored = set(stopwords.words('english'))
    ignored.update(punctuation)

    kept_words = []
    for token in word_tokenize(content.lower()):
        if token not in ignored:
            kept_words.append(token)

    sentences = sent_tokenize(content)
    return [sentences, kept_words]

def score_tokens(filterd_words, sentence_tokens):
    """
    Score each sentence by the summed frequency of its known words.

    A frequency distribution is built from ``filterd_words``. Each sentence is
    lowercased and tokenized, and every token present in the distribution adds
    its frequency to that sentence's score. The result maps sentence index to
    score; a sentence with no matching token gets no entry.
    """
    frequencies = FreqDist(filterd_words)
    scores = defaultdict(int)

    for position, sentence in enumerate(sentence_tokens):
        for token in word_tokenize(sentence.lower()):
            if token not in frequencies:
                continue
            scores[position] += frequencies[token]

    return scores

def summarize(ranks, sentences, length):
    """
    Join the highest-ranked sentences, in document order, into one string.

    Args:
        ranks: Mapping of sentence index to score, as produced by score_tokens.
        sentences: List of sentence strings indexed by those keys.
        length: Number of sentences to keep; an int or a numeric string.

    Returns:
        The selected sentences joined with single spaces.

    Raises:
        SystemExit: After printing an error, when more sentences are requested
            than ``sentences`` contains.
    """
    # Convert once: the size check already used int(length), but the raw value
    # was handed to nlargest, which fails on a string.
    count = int(length)

    if count > len(sentences):
        print("Error, more sentences requested than available. Use --l (--length) flag to adjust.")
        # exit() is an interactive helper injected by the site module; raising
        # SystemExit directly gives the same exception without relying on it.
        raise SystemExit()

    top_indexes = nlargest(count, ranks, key=ranks.get)
    # Sorting the indexes restores the sentences' original order.
    return ' '.join(sentences[j] for j in sorted(top_indexes))


In [7]:
# Load the transcript saved earlier and flatten its whitespace
content = read_file("hello.txt")
content = sanitize_input(content)

# tokenize_content returns [sentences, filtered words]; unpack both
sentence_tokens, word_tokens = tokenize_content(content)  
# Score every sentence by the frequency of the filtered words it contains
sentence_ranks = score_tokens(word_tokens, sentence_tokens)

# Display the single highest-scoring sentence as the cell's output
summarize(sentence_ranks, sentence_tokens, 1)

"okay 314 clax if the first block was 1 million times the mass of the other then again with all of our crazy idealistic conditions almost all of the klax happened in one big burst this time resulting in a total of 3141 collisions perhaps you see the pattern here though it's forgivable if you don't since it defies all expectation when the mass of that first block is some power of 100 times the mass of the second the total number of collisions have the same digits as pie this absolutely blew my mind when it was first shared with me read it to the viewer Henry Cavill for introducing me to this fact which was originally discovered by the mathematician in 1995 and published in 2003. what is what I love about this is that if ever they were Olympic Games for algorithms that compute Pi this one would have to win medals both for being the most elegant and for being the most commonly inefficient I mean think about the actual algorithm here Step One Employment to physics engine step to choose the