# Pseudocode for steps to follow:
1. Take audio and get transcription results from speech to text API.
2. Use the time difference between utterances to check for chunks of silence.
3. Using that, split the audio into speech chunks and silence chunks (or, later, into silence and speech-filler chunks).
4. Go through the silence chunks one after the other, dropping/shortening the silence between words while getting user feedback:
    * #### Ideas to pursue for this
        1. Play short clip of audio (before silence chunk + silence chunk + after silence chunk) and get feedback about how it sounds.

In [47]:
from audio_utilities import transcribe_audio_file, record_audio, play_audio
import IPython, numpy as np, scipy as sp, matplotlib.pyplot as plt, matplotlib, sklearn, librosa

In [48]:
# Transcribe the recording and keep the raw response in RESULTS; a later cell
# reads it and passes it to get_transcript_and_timestamps.
RESULTS = transcribe_audio_file('./reading_article_with_silences.wav')

In [30]:
class WordTimeStamp:
    """A single transcribed word together with its start and end times."""

    # Attribute annotations; the values themselves are assigned in __init__.
    word : str
    start_time : float
    end_time : float

    def __init__(self, word, start, end):
        """Store the word and the two time values exactly as given."""
        self.word, self.start_time, self.end_time = word, start, end

    def __str__(self):
        """Render the entry as ``[word, start_time, end_time]``."""
        fields = (self.word, str(self.start_time), str(self.end_time))
        return "[" + ", ".join(fields) + "]"

    def __repr__(self):
        """Reuse the ``__str__`` text so lists of timestamps print readably."""
        return self.__str__()

def get_transcript_and_timestamps(results):
    """Collect the transcript text and per-word timestamps from a transcription result.

    Only the first alternative of every entry in ``results['results']`` is
    used.  Each of its timestamp entries is read positionally as
    (word, start, end) and wrapped in a WordTimeStamp.

    Returns a ``(transcript, timestamps)`` tuple: the transcripts concatenated
    in order (no separator is inserted) and the list of WordTimeStamp objects.
    """
    pieces = []
    word_stamps = []
    for entry in results['results']:
        best = entry['alternatives'][0]
        pieces.append(best['transcript'])
        word_stamps.extend(
            WordTimeStamp(item[0], item[1], item[2]) for item in best['timestamps']
        )

    return "".join(pieces), word_stamps


def get_hesitation_timestamps_from_timestamps(timestamps):
    """Return the entries of *timestamps* whose word is the ``%HESITATION`` marker.

    Order is preserved and the original objects are returned, not copies.
    """
    return [stamp for stamp in timestamps if stamp.word == "%HESITATION"]


def get_time_differences_between_each_utterance(timestamps):
    """Compute the gap between each utterance and the one before it.

    Entry ``i`` of the result (for ``i >= 1``) is
    ``timestamps[i].start_time - timestamps[i-1].end_time``.  The first entry
    is always 0 because the first utterance has no predecessor, so an empty
    input still yields ``[0]``.
    """
    consecutive = zip(timestamps, timestamps[1:])
    return [0] + [later.start_time - earlier.end_time for earlier, later in consecutive]


def get_utterances_with_threshold_silence_between_them(time_diffs, timestamps, threshold=.5):
    """Return the neighbouring utterance pairs selected by the silence rule.

    ``time_diffs[i]`` is treated as the gap in front of ``timestamps[i]``
    (the layout produced by get_time_differences_between_each_utterance), so
    index 0 is skipped.

    A pair ``[timestamps[i-1], timestamps[i]]`` is selected when either
      * the gap is strictly greater than *threshold*, or
      * neither of the two words is the ``%HESITATION`` marker.

    *threshold* defaults to .5, the value that used to be hard-coded here.

    NOTE(review): because of the ``or``, every pair of ordinary words is
    selected no matter how small the gap is.  Given the function name, ``and``
    may have been intended -- confirm before relying on this.  The condition
    is kept as it was so existing results do not change.
    """
    special_utterances = []
    for i in range(1, len(time_diffs)):
        before, current = timestamps[i - 1], timestamps[i]
        involves_hesitation = before.word == '%HESITATION' or current.word == '%HESITATION'
        if time_diffs[i] > threshold or not involves_hesitation:
            special_utterances.append([before, current])
    return special_utterances


def augment_audio(audio, sr, timestamps):
    """Split *audio* into one speech chunk per word and the silence before each word.

    Time values on the timestamps are turned into sample indices by
    multiplying with *sr* and truncating to int.

    Returns ``(speech_chunks, silence_chunks)``, each holding one slice of
    *audio* per timestamp:
      * ``speech_chunks[i]`` spans word ``i`` from its start_time to its end_time;
      * ``silence_chunks[i]`` spans from the end of the previous word (or the
        start of the audio for the first word) to the start of word ``i``.
    Audio after the last word is not part of either list.
    """
    speech_chunks = []
    silence_chunks = []
    # The first silence chunk begins at the very start of the audio.
    prev_end = 0

    for t in timestamps:
        start_index = int(t.start_time * sr)
        end_index = int(t.end_time * sr)
        silence_chunks.append(audio[int(prev_end * sr): start_index])
        # Both bounds must be scaled by sr: the timestamps are times, not sample indices.
        speech_chunks.append(audio[start_index: end_index])
        prev_end = t.end_time

    return speech_chunks, silence_chunks


def augment_audio_with_threshold(audio, sr, timestamps, threshold):
    """Split *audio* into speech and silence chunks, cutting only at long gaps.

    A gap between two consecutive words counts as silence only when it is
    strictly longer than *threshold* (same unit as the timestamp times).
    Words separated by shorter gaps stay together in one speech chunk.  Time
    values are converted to sample indices by multiplying with *sr* and
    truncating to int.

    Returns ``(speech_chunks, silence_chunks)`` as lists of slices of *audio*.
    Points worth knowing:
      * silence before the first word is kept only if it is longer than
        *threshold*; otherwise it is dropped and appears in neither list;
      * the last speech chunk runs to the end of *audio*, so any trailing
        silence is part of it;
      * with no timestamps there is nothing to split on and both lists are
        empty (this used to raise IndexError).
    """
    speech_chunks = []
    silence_chunks = []

    if len(timestamps) == 0:
        return speech_chunks, silence_chunks

    # NB: If the silence is too short it gets thrown out
    if timestamps[0].start_time > threshold:
        silence_chunks.append(audio[:int(timestamps[0].start_time * sr)])

    # Index of the first word of the speech chunk currently being accumulated.
    start = 0
    for end in range(1, len(timestamps)):
        gap_start = timestamps[end - 1].end_time
        gap_end = timestamps[end].start_time
        if gap_end - gap_start > threshold:
            speech_chunks.append(audio[int(timestamps[start].start_time * sr): int(gap_start * sr)])
            silence_chunks.append(audio[int(gap_start * sr): int(gap_end * sr)])
            start = end

    # Whatever is left after the last long gap becomes the final speech chunk.
    speech_chunks.append(audio[int(timestamps[start].start_time * sr):])

    return speech_chunks, silence_chunks
        

def remove_long_pauses_from_audio(audio, sr, pauses):
    """Return the samples of *audio* as a list with the given pauses cut out.

    Each element of *pauses* is a pair ``[word_before, word_after]``; the
    samples from ``word_before.end_time`` up to ``word_after.start_time``
    (both multiplied by *sr* and truncated to int) are dropped.  The pairs
    are processed in the order given, so they are expected to be
    chronological and non-overlapping.

    Audio after the last pause is kept; with no pauses the whole input is
    returned (as a list).
    """
    final_audio = []

    # Start of the next stretch of audio to keep.
    keep_from = 0
    for p in pauses:
        pause_start = int(p[0].end_time * sr)
        pause_end = int(p[1].start_time * sr)
        final_audio.extend(audio[keep_from: pause_start])
        keep_from = pause_end

    # Keep the tail after the final pause; without this the output was
    # truncated at the last pause (and empty when there were no pauses).
    final_audio.extend(audio[keep_from:])

    return final_audio
            

In [42]:
from IPython.display import Audio
# RESULTS is the raw transcription stored by the earlier transcribe_audio_file
# cell.  It is a module-level name already, so no `global` statement is needed
# to read it here.
results_1 = RESULTS
transcript, timestamps = get_transcript_and_timestamps(results_1)
# print(transcript)
# hesitations = get_hesitation_timestamps_from_timestamps(timestamps)
# print(hesitations)
# time_diff = get_time_differences_between_each_utterance(timestamps)
# words_with_long_silence_after = get_utterances_with_threshold_silence_between_them(time_diff, timestamps)
# print(words_with_long_silence_after)
# Load the same recording that was transcribed; sr is used below to convert
# timestamp times into sample indices.
own_voice, sr = librosa.load('./reading_article_with_silences.wav')
# final_audio = remove_long_pauses_from_audio(own_voice, sr, words_with_long_silence_after)

# Split at gaps longer than 1 (same unit as the timestamp times).
_speech, _silence = augment_audio_with_threshold(own_voice, sr, timestamps, 1)

In [43]:
# Number of speech chunks, number of word timestamps, number of silence chunks
# (one value per line).
print(len(_speech), len(timestamps), len(_silence), sep="\n")

3
60
3


In [45]:
# Show a player for the full recording first, then for every speech chunk,
# printing each chunk's length before its player.
IPython.display.display(IPython.display.Audio(own_voice, rate=sr))
for chunk in _speech:
    print(len(chunk))
    IPython.display.display(IPython.display.Audio(chunk, rate=sr))

249165


96359


165088
