# Pseudocode for steps to follow:
1. Take audio and get transcription results from speech to text API.
2. Use the time difference between utterances to check for chunks of silence.
3. Using those gaps, split the audio into alternating speech and silence chunks (later, possibly split the non-speech parts further into silence and speech fillers).
4. Go through the silence chunks one after the other, dropping or shortening the silence between words while getting user feedback:
    * #### Ideas to pursue for this
        1. Play short clip of audio (before silence chunk + silence chunk + after silence chunk) and get feedback about how it sounds.
        2. Refine where the preview clip is cut so that it starts at the beginning of a word.

In [7]:
from audio_utilities import transcribe_audio_file, record_audio, play_audio
from IPython.display import clear_output
from typing import Optional
from enum import Enum, auto
import IPython, numpy as np, scipy as sp, matplotlib.pyplot as plt, matplotlib, sklearn, librosa, copy, scipy.io.wavfile

In [66]:
# Transcribe the test recording once and keep the returned value in RESULTS,
# which a later cell reads to build the transcript and word timestamps.
RESULTS = transcribe_audio_file('./hesitations_test2.wav')
# Print the returned structure so its layout can be inspected.
print(RESULTS)

{'results': [{'alternatives': [{'timestamps': [['hi', 0.25, 0.65]], 'confidence': 0.83, 'transcript': 'hi '}], 'final': True}, {'alternatives': [{'timestamps': [['my', 2.21, 2.47], ['name', 2.47, 2.84], ['is', 2.84, 3.22]], 'confidence': 0.93, 'transcript': 'my name is '}], 'final': True}, {'alternatives': [{'timestamps': [['%HESITATION', 4.38, 5.04]], 'confidence': 0.99, 'transcript': '%HESITATION '}], 'final': True}, {'alternatives': [{'timestamps': [['Amir', 6.82, 7.3], ['saja', 7.3, 7.9]], 'confidence': 0.19, 'transcript': 'Amir saja '}], 'final': True}, {'alternatives': [{'timestamps': [['I', 9.66, 10.11], ['am', 10.17, 10.69]], 'confidence': 0.31, 'transcript': 'I am '}], 'final': True}, {'alternatives': [{'timestamps': [['our', 11.91, 12.46]], 'confidence': 0.45, 'transcript': 'our '}], 'final': True}, {'alternatives': [{'timestamps': [['our', 14.58, 14.75], ['first', 14.75, 15.37], ['year', 15.41, 15.72], ['PhD', 15.72, 16.27], ['student', 16.27, 17], ['at', 17.4, 17.87]], 'con

In [9]:
class WordTimeStamp:
    """A transcribed word together with the times at which it starts and ends.

    Instances are plain mutable records; the constructor stores its arguments
    unchanged. Both ``str()`` and ``repr()`` produce ``[word, start, end]``.
    """

    word: str
    start_time: float
    end_time: float

    def __init__(self, word, start, end):
        # Store the three values exactly as given; no conversion is applied.
        self.word, self.start_time, self.end_time = word, start, end

    def __str__(self):
        # The word is joined as-is, the two times through str().
        fields = (self.word, str(self.start_time), str(self.end_time))
        return "[" + ", ".join(fields) + "]"

    def __repr__(self):
        # Reuse the str() form so lists of timestamps print readably.
        return self.__str__()


class NonSpeechType(Enum):
    """Tells whether a stretch of non-speech audio is silence or not silence."""

    # The stretch is silent.
    SILENCE = auto()

    # The stretch is non-speech audio that is not silent.
    NOT_SILENCE = auto()

    
class NonSpeechData:
    """A chunk of non-speech audio paired with its classification.

    Parameters
    ----------
    d :
        The audio samples of the chunk. Stored unchanged.
    st : NonSpeechType
        Whether the chunk is silence or some other non-speech sound.
    """

    # Classification of the chunk.
    sound_type: "NonSpeechType"
    # Samples of the chunk. NOTE(review): presumably a NumPy array slice of
    # the loaded audio rather than a list of ints -- confirm against callers.
    data: "np.ndarray"

    def __init__(self, d, st: "NonSpeechType"):
        self.data = d
        self.sound_type = st

    def __repr__(self):
        # Summarise instead of dumping every sample so chunk lists stay readable.
        return f"NonSpeechData(sound_type={self.sound_type}, samples={len(self.data)})"


# get the transcript and timestamps
def get_transcript_and_timestamps(results):
    """Collect the full transcript and every word timestamp from a transcription result.

    Only the first alternative of each entry in ``results['results']`` is used.

    Returns a ``(transcript, timestamps)`` pair: the transcripts concatenated
    in order, and a flat list with one ``WordTimeStamp`` per
    ``[word, start, end]`` triple.
    """
    pieces = []
    stamps = []

    for entry in results['results']:
        best = entry['alternatives'][0]
        pieces.append(best['transcript'])
        stamps.extend(
            WordTimeStamp(triple[0], triple[1], triple[2])
            for triple in best['timestamps']
        )

    return "".join(pieces), stamps


# gets the timestamps where a hesitation is indicated on the transcript
def get_hesitation_timestamps_from_timestamps(timestamps):
    """Return, in their original order, the timestamps whose word is ``%HESITATION``."""
    return [stamp for stamp in timestamps if stamp.word == "%HESITATION"]



# Disabled variant (commented out below): splits audio into speech, silence, and hesitation chunks with a specified threshold for silence length
# def augment_audio_with_threshold_new(audio, sr, timestamps, threshold):
#     hesitation_timestamps = get_hesitation_timestamps_from_timestamps(timestamps)
#     speech_timestamps = [x for x in timestamps if x not in get_hesitation_timestamps_from_timestamps(timestamps)]
    
#     speech_chunks = []
#     non_speech_chunks = []
    
#     silence_and _chunks = []  
#     hesitation_chunks = []
#     start = 0
#     end = 1
    
#     # NB: If the silence is too short it gets thrown out
#     if speech_timestamps[start].start_time > threshold:
#         silence_chunks.append(SilenceOrFiller(audio[:int(speech_timestamps[start].start_time * sr)], False))
    
    
#     while end < len(speech_timestamps):
#         if speech_timestamps[end].start_time - speech_timestamps[end-1].end_time > threshold:
#             h_timestamp = check_if_hesitation_in_between(hesitation_timestamps,
#                                                          speech_timestamps[end-1].end_time,
#                                                         speech_timestamps[end].start_time)
#             speech_chunks.append(audio[int(speech_timestamps[start].start_time * sr): int(speech_timestamps[end-1].end_time * sr)])
#             if h_timestamp is not None:
# #                 speech_chunks.append(audio[int(speech_timestamps[start].start_time * sr): int(speech_timestamps[end-1].end_time * sr)])
#                 hesitation_chunks.append(SilenceOrFiller(audio[int(speech_timestamps[end-1].end_time * sr): int(speech_timestamps[end].start_time * sr)], False))
#             else:
# #                 speech_chunks.append(audio[int(speech_timestamps[start].start_time * sr): int(speech_timestamps[end-1].end_time * sr)])
#                 silence_chunks.append(SilenceOrFiller(audio[int(speech_timestamps[end-1].end_time * sr): int(speech_timestamps[end].start_time * sr)], True))
#             start = end
        
#         end += 1
        
#     speech_chunks.append(audio[int(speech_timestamps[start].start_time * sr) :])
#     return speech_chunks, silence_chunks, hesitation_chunks

# splits audio into speech and silence chunks with a specified threshold for silence length
def augment_audio_with_threshold(audio, sr, timestamps, threshold):
    """Split audio into speech chunks and the non-speech chunks around them.

    Consecutive words stay in one speech chunk until the gap before the next
    word exceeds ``threshold``. Each such gap becomes one non-speech chunk:

    * a gap that fully contains a hesitation is kept whole and labelled
      ``NonSpeechType.NOT_SILENCE``;
    * any other gap is cut down to ``threshold`` seconds and labelled
      ``NonSpeechType.SILENCE``.

    The audio before the first word is handled the same way: if a hesitation
    starts at or before the first word it is kept whole as ``NOT_SILENCE``;
    otherwise it is cut to at most ``threshold`` seconds and labelled
    ``SILENCE``. If the first word starts at time 0, no leading chunk is
    produced.

    Parameters
    ----------
    audio :
        Sliceable sequence of samples.
    sr :
        Sampling rate; times are turned into sample indices with
        ``int(seconds * sr)``.
    timestamps :
        Word timestamps (objects with ``word``, ``start_time``, ``end_time``),
        hesitations included.
    threshold :
        Minimum gap, in the same time unit as the timestamps, that splits two
        speech chunks.

    Returns
    -------
    (speech_chunks, non_speech_chunks)
        ``speech_chunks`` is a list of slices of ``audio``;
        ``non_speech_chunks`` is a list of ``NonSpeechData``. Both are empty
        when ``timestamps`` holds no speech words.
    """
    hesitation_timestamps = get_hesitation_timestamps_from_timestamps(timestamps)
    speech_timestamps = [t for t in timestamps if t not in hesitation_timestamps]

    speech_chunks = []
    non_speech_chunks = []

    # With no speech words there is nothing to split around.
    if not speech_timestamps:
        return speech_chunks, non_speech_chunks

    def to_sample(seconds):
        # Convert a time into an index into `audio`.
        return int(seconds * sr)

    # --- audio before the first word ---
    first_start = speech_timestamps[0].start_time
    if first_start > 0:
        begins_with_hesitation = (
            bool(hesitation_timestamps)
            and hesitation_timestamps[0].start_time <= first_start
        )
        if begins_with_hesitation:
            # Non-speech sound before the first word: keep the whole lead-in.
            lead_in = audio[:to_sample(first_start)]
            non_speech_chunks.append(NonSpeechData(lead_in, NonSpeechType.NOT_SILENCE))
        else:
            # Plain silence: keep at most `threshold` worth of it.
            lead_in = audio[:to_sample(min(first_start, threshold))]
            non_speech_chunks.append(NonSpeechData(lead_in, NonSpeechType.SILENCE))

    # --- gaps between words ---
    start = 0  # index of the first word of the speech chunk being built
    for end in range(1, len(speech_timestamps)):
        gap_start = speech_timestamps[end - 1].end_time
        gap_end = speech_timestamps[end].start_time
        if gap_end - gap_start <= threshold:
            continue

        speech_chunks.append(
            audio[to_sample(speech_timestamps[start].start_time):to_sample(gap_start)]
        )

        gap = audio[to_sample(gap_start):to_sample(gap_end)]
        hesitation = check_if_hesitation_in_between(hesitation_timestamps, gap_start, gap_end)
        if hesitation is not None:
            non_speech_chunks.append(NonSpeechData(gap, NonSpeechType.NOT_SILENCE))
        else:
            non_speech_chunks.append(
                NonSpeechData(gap[:to_sample(threshold)], NonSpeechType.SILENCE)
            )

        start = end

    # Final speech chunk. The slice runs to the end of `audio`, so whatever
    # follows the last word stays attached to it.
    # TODO: split trailing silence / non-speech audio into its own chunk.
    speech_chunks.append(audio[to_sample(speech_timestamps[start].start_time):])

    return speech_chunks, non_speech_chunks

def check_if_hesitation_in_between(h_timestamps, start_time, end_time) -> Optional["WordTimeStamp"]:
    """Return the first hesitation lying entirely within ``[start_time, end_time]``.

    A hesitation qualifies when it starts at or after ``start_time`` and ends
    at or before ``end_time``; one that only overlaps a boundary does not.
    Returns ``None`` when no hesitation qualifies.
    """
    return next(
        (
            h for h in h_timestamps
            if h.start_time >= start_time and h.end_time <= end_time
        ),
        None,
    )

def wavwrite(filepath, data, sr, norm=True, dtype='int16'):
    '''
    Write wave file using scipy.io.wavfile.write, converting from a float (-1.0 : 1.0) array to an integer array

    The input is copied before scaling, so the caller's array is never
    modified, and plain Python sequences of floats are accepted too.

    Parameters
    ----------
    filepath : str
        The path of the output .wav file
    data : array-like
        The float-type audio samples
    sr : int
        The sampling rate
    norm : bool
        If True, normalize the audio to -1.0 to 1.0 before converting integer.
        Audio whose peak is 0 (all silence) is written as-is.
    dtype : str
        The output type. Typically leave this at the default of 'int16'.
    '''
    # np.array (not asarray) always copies, so the in-place division below
    # cannot leak back into the caller's buffer.
    samples = np.array(data, dtype=float)

    if norm and samples.size:
        peak = np.max(np.abs(samples))
        # A zero peak would turn every sample into NaN.
        if peak > 0:
            samples /= peak

    samples = samples * np.iinfo(dtype).max
    scipy.io.wavfile.write(filepath, sr, samples.astype(dtype))
            

In [63]:
def simple_demo(speech_chunks, silence_chunks, audio, sr):
    """Interactively shorten the pauses between speech chunks and save the result.

    For every pause the user hears the tail of the audio assembled so far and
    the pause itself, enters the fraction of the pause to keep, hears a
    preview (tail + shortened pause + start of the next speech chunk), and
    either confirms or tries another fraction. Each attempt is applied to the
    original pause, so retries do not compound. The assembled audio is
    written to ``./edited_audio.wav``.

    Parameters
    ----------
    speech_chunks :
        Speech sample sequences, in playback order.
    silence_chunks :
        Pauses in playback order; each item is either a sample sequence or an
        object exposing its samples as ``.data``. When there are as many
        pauses as speech chunks, the first pause is taken to precede the first
        speech chunk and is kept in full.
    audio :
        Unused; kept so existing calls keep working.
    sr :
        Sampling rate, used for playback, the 5-second previews and the
        written file.
    """
    window = int(5 * sr)  # samples in a 5-second preview
    final_audio = []
    speech_counter = 0
    silence_counter = 0

    if len(silence_chunks) == len(speech_chunks): # silence starts the audio clip
        print('This audio file begins with a silent period, would you like to remove it (1), '
              'add it in full (2), or edit it (3).')
        # for now assume the silence is added:
        final_audio.extend(_chunk_samples(silence_chunks[0]))
        silence_counter += 1

    final_audio.extend(speech_chunks[speech_counter])

    # Every iteration consumes one pause and the speech chunk after it, so
    # both must still be available.
    while speech_counter < len(speech_chunks) - 1 and silence_counter < len(silence_chunks):
        original_silence = _chunk_samples(silence_chunks[silence_counter])
        next_speech = speech_chunks[speech_counter + 1]

        while True:
            print('Playing the last 5 seconds of the current final audio...')
            # play 5 seconds of audio or all of it if currently not longer than 5
            if len(final_audio) > window:
                temp_audio = list(final_audio[-window:])
            else:
                temp_audio = list(final_audio)

            IPython.display.display(IPython.display.Audio(temp_audio, rate=sr))
            input('Please press any key when done listening to continue: ')
            clear_output()

            print('Playing the silence chunk we are currently editing...')
            IPython.display.display(IPython.display.Audio(original_silence, rate=sr))
            input('Please press any key when done listening to continue: ')
            clear_output()

            # Always cut from the untouched pause so a retry starts fresh.
            shorten_ratio = _prompt_keep_fraction()
            cur_silence_chunk = original_silence[: int(len(original_silence) * shorten_ratio)]

            # preview = tail of current audio + shortened pause + start of next speech
            temp_audio.extend(cur_silence_chunk)
            temp_audio.extend(next_speech[:window])

            print('\n\nPlaying how the audio file now sounds at this point...')
            IPython.display.display(IPython.display.Audio(temp_audio, rate=sr))
            input('Press any key when done listening to continue: ')
            clear_output()

            response = input('Press (y) to confirm this edit or (n) to change the amount to shorten by: ')
            if response.strip().lower() == 'y':
                final_audio.extend(cur_silence_chunk)
                final_audio.extend(next_speech)
                speech_counter += 1
                silence_counter += 1
                break

    # The samples were gathered in a Python list; hand wavwrite a float array.
    wavwrite('./edited_audio.wav', np.asarray(final_audio, dtype=float), sr)
    print('Exiting....')


def _chunk_samples(chunk):
    """Return the raw samples of a pause given either as samples or as an object with ``.data``."""
    return getattr(chunk, 'data', chunk)


def _prompt_keep_fraction():
    """Ask until the user enters a number between 0 and 1, and return it."""
    while True:
        raw = input('Please enter how much would you like to shorten the silence chunk by (fraction)?: ')
        try:
            fraction = float(raw)
        except ValueError:
            print('Please enter a number between 0 and 1.')
            continue
        # The chained comparison also rejects NaN.
        if 0 <= fraction <= 1:
            return fraction
        print('Please enter a number between 0 and 1.')

In [73]:
from IPython.display import Audio
# RESULTS is a module-level name, so it is read directly (a `global`
# statement has no effect outside a function).
results_1 = RESULTS

# Flatten the transcription result into one transcript string and a list of
# per-word timestamps, then pick out the hesitation markers.
transcript, timestamps = get_transcript_and_timestamps(results_1)
hesitation_timestamps = get_hesitation_timestamps_from_timestamps(timestamps)
print(transcript)

# print(list(set(timestamps) - set(hesitation_timestamps)))
# Load the same recording that was transcribed; librosa.load returns the
# samples together with the sampling rate.
own_voice, sr = librosa.load('./hesitations_test2.wav')
# _speech, _silence = augment_audio_with_threshold(own_voice, sr, timestamps, 1)

hi my name is %HESITATION Amir saja I am our our first year PhD student at %HESITATION Northwestern University my research is related to accessibility %HESITATION I am taking the mission perception 


In [75]:
# simple_demo(_speech, _silence, own_voice, sr)
# print(RESULTS)
# NOTE(review): augment_audio_with_threshold_new appears in this file only as
# commented-out code -- confirm it is defined before running this cell.
_speech, _silence, _hesitation = augment_audio_with_threshold_new(own_voice, sr, timestamps, 0.2)

# Report how many chunks of each kind were produced.
print(len(_hesitation))
print(len(_silence))
print(len(_speech))

print('this is the full audio')
IPython.display.display(IPython.display.Audio(own_voice, rate=sr))

# Play each non-empty silence chunk.
print('these are the silences')
for silence in _silence:
    if len(silence.data) > 0:
        IPython.display.display(IPython.display.Audio(silence.data, rate=sr))

# Play each non-empty hesitation chunk.
print('these are the hesitations')
for hesitation in _hesitation:
    if len(hesitation.data) > 0:
        IPython.display.display(IPython.display.Audio(hesitation.data, rate=sr))


[[%HESITATION, 4.38, 5.04], [%HESITATION, 19.92, 20.57], [%HESITATION, 36.59, 37.43]]
3
9
12
this is the full audio


these are the silences


these are the hesitations
