In [35]:
from faster_whisper import WhisperModel
from transformers import AutoTokenizer

model_size = "distil-large-v2"

# Run on GPU with FP16
model = WhisperModel(model_size, device="cuda", compute_type="float16")
stt_tokenizer = AutoTokenizer.from_pretrained("distil-whisper/distil-large-v2")

import pyaudio
import numpy as np

def capture_audio_continuous(sample_rate=16000, chunk_size=1024, channels=1):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16,
                    channels=channels,
                    rate=sample_rate,
                    input=True,
                    frames_per_buffer=chunk_size)
    
    try:
        while True:
            data = stream.read(chunk_size)
            yield np.frombuffer(data, dtype=np.int16)
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()

def longest_common_prefix(str1, str2):
    # Find the minimum length of the two strings
    min_length = min(len(str1), len(str2))
    
    # Iterate through characters up to the minimum length
    for i in range(min_length):
        # If we find a mismatch, return the prefix up to this point
        if str1[i] != str2[i]:
            return str1[:i]
    
    # If we reached the end of the loop without finding a mismatch,
    # the entire shorter string is a prefix of the longer one
    return str1[:min_length]

def longest_common_prefix_word_level(word_list_1, word_list_2):
    # Find the minimum length of the two strings
    min_length = min(len(word_list_1), len(word_list_2))
    
    # Iterate through characters up to the minimum length
    for i in range(min_length):
        # If we find a mismatch, return the prefix up to this point
        if word_list_1[i].word != word_list_2[i].word:
            if len(word_list_1[:i]) > 0:
                return word_list_1[:i], word_list_1[:i][0].start, word_list_1[i].start
            else:
                return [], None, None
    
    # If we reached the end of the loop without finding a mismatch,
    # the entire shorter string is a prefix of the longer one
    lcp = word_list_1[:min_length]
    return lcp, lcp[0].start, lcp[-1].end



In [None]:
final_text = ""
previous_txt_seg = []
for segment in segments_list:
    if previous_txt_seg != []:
        confirmed_txt_seg, start, end = longest_common_prefix_word_level(previous_txt_seg, segment)
        final_text += "".join([i.word for i in confirmed_txt_seg])
        previous_txt_seg = segment[len(confirmed_txt_seg):]
    else:
        previous_txt_seg = segment
    print(final_text)

In [38]:

def get_text_segment(words):
    return "".join([i.word for i in words])

sample_rate = 16000
chunk_size = 1024
chunk_duration = chunk_size / sample_rate
sec_duration_to_capture = 2

total_chunks = int(sec_duration_to_capture * sample_rate / chunk_size)
collected_chunks = []
curr_window_chunk_count = 0

final_text = ""
previous_txt_seg = []
transcribed_bits = []
for audio_output in capture_audio_continuous(sample_rate, chunk_size):
        
    if curr_window_chunk_count < total_chunks:
        collected_chunks.append(audio_output)
        curr_window_chunk_count += 1
    
    elif curr_window_chunk_count == total_chunks:
        audio_snippet = np.concatenate(collected_chunks)
        segments, info = model.transcribe(audio_snippet, initial_prompt=final_text, word_timestamps=True, language="en")
        segs = [i for i in segments]
        transcribed_bits.append(segs)
        
        curr_chunk_words = []
        for seg in segs:
            curr_chunk_words.extend(seg.words)
        print("--"*10)
        print("previous_segment:", get_text_segment(previous_txt_seg))
        print("current_segment:", get_text_segment(curr_chunk_words))
        if previous_txt_seg != []:
            confirmed_txt_seg, st, end = longest_common_prefix_word_level(previous_txt_seg, curr_chunk_words)
            print("segment_overlap:", get_text_segment(confirmed_txt_seg))           
            if confirmed_txt_seg != []:
                final_text += get_text_segment(confirmed_txt_seg)
                previous_txt_seg = curr_chunk_words[len(confirmed_txt_seg):]
                
                num_chunks_to_remove = int(end / chunk_duration)
                collected_chunks = collected_chunks[num_chunks_to_remove:]
            else:
                previous_txt_seg = curr_chunk_words
        else:
            previous_txt_seg = curr_chunk_words

        curr_window_chunk_count = 0
    
        print("final_text:", final_text)
        print("--"*10) 
        

ALSA lib pcm_dsnoop.c:567:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2721:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2721:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2721:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave
Cannot connect to server socket err = No such file or directory
Cannot connect to server request channel
jack server is not running or cannot be started
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock
JackShmReadWritePtr::~JackShmReadWritePtr - Init not done for -1, skipping unlock


--------------------
previous_segment: 
current_segment:  Thank you
final_text: 
--------------------
--------------------
previous_segment:  Thank you
current_segment:  When the world war broke out he joined the way.
segment_overlap: 
final_text: 
--------------------
--------------------
previous_segment:  When the world war broke out he joined the way.
current_segment:  When World War II broke out, he joined the Royal Navy, finishing it.
segment_overlap:  When
final_text:  When
--------------------
--------------------
previous_segment:  World War II broke out, he joined the Royal Navy, finishing it.
current_segment:  The World War broke up, he joined the Royal Navy, finishing his five-year stint as a lieutenant.
segment_overlap: 
final_text:  When
--------------------
--------------------
previous_segment:  The World War broke up, he joined the Royal Navy, finishing his five-year stint as a lieutenant.
current_segment:  The World War broke up. He joined the Royal Navy, finishing hi

KeyboardInterrupt: 

In [None]:
"When World War II broke He joined the Royal Navy finishing He has five-year stint as a You tend in command of a rocket vessel. He has seen battleships, submarines and aircraft and participated in He earned the reputation of men when they notice that in moments His facial muscles would contract Bye-bye!"

"When War II broke up, he tried to We may be finishing his five-year stint as a a attendant in command of the rocket vessel. He has seen any combat duty against battleships, submarines, and aircraft. participated in the eBay operations. He earned among his men when they Notice that in moments of stress during combat, his facial muscles would contract violently causing a broad"


In [4]:
chunk_1 = np.concatenate(chunks_lists[0])
print(len(chunk_1))
chunk_2 = np.concatenate(chunks_lists[1])
len(chunk_2)

79872


159744

In [7]:
transcribed_bits

[[Segment(id=1, seek=0, start=np.float64(0.0), end=np.float64(4.52), text=' William Golding was born in Cornwall, England in 1911.', tokens=[50364, 6740, 6731, 278, 390, 4232, 294, 21590, 16256, 11, 8196, 294, 1294, 5348, 13, 50614], avg_logprob=-0.19025735294117646, compression_ratio=0.9310344827586207, no_speech_prob=0.034027099609375, words=[Word(start=np.float64(0.0), end=np.float64(0.94), word=' William', probability=np.float64(0.71533203125)), Word(start=np.float64(0.94), end=np.float64(1.56), word=' Golding', probability=np.float64(0.53564453125)), Word(start=np.float64(1.56), end=np.float64(1.8), word=' was', probability=np.float64(0.96484375)), Word(start=np.float64(1.8), end=np.float64(2.06), word=' born', probability=np.float64(0.8974609375)), Word(start=np.float64(2.06), end=np.float64(2.3), word=' in', probability=np.float64(0.9326171875)), Word(start=np.float64(2.3), end=np.float64(2.96), word=' Cornwall,', probability=np.float64(0.742431640625)), Word(start=np.float64(2.

In [8]:
text_segs = []
for seg in transcribed_bits:
    x = " ".join([i.text for i in seg])
    text_segs.append(x)
    print(x)
    print("--"*20)

 William Golding was born in Cornwall, England in 1911.
----------------------------------------
 William Golding was born in Cornwall, England in 1911 and is a graduate of Oxford University.  He states that he was brought up to be assigned to
----------------------------------------
 and is a graduate of Oxford University. He states that he was brought up to be a scientist  or rebelled and after three years of Oxford changed his plans and
----------------------------------------
 a rebel and after three years of Oxford changed his plans and devoted himself to English literature.  When World War II broke out, he didn't do it.
----------------------------------------


In [7]:
seg_words = []
for seg in transcribed_bits:
    chunk_words = []
    for s in seg:
        seg_words.append(s.words)
        chunk_words.extend(s.words)
    break

In [8]:
chunk_words

[Word(start=np.float64(0.0), end=np.float64(0.52), word=' Hello,', probability=np.float64(0.9091796875)),
 Word(start=np.float64(1.18), end=np.float64(1.4), word=" I'm", probability=np.float64(0.836181640625)),
 Word(start=np.float64(1.4), end=np.float64(1.76), word=' doing', probability=np.float64(0.96337890625)),
 Word(start=np.float64(1.76), end=np.float64(2.18), word=' good.', probability=np.float64(0.92822265625)),
 Word(start=np.float64(2.38), end=np.float64(2.6), word=' My', probability=np.float64(0.9775390625)),
 Word(start=np.float64(2.6), end=np.float64(2.76), word=' name', probability=np.float64(0.9150390625)),
 Word(start=np.float64(2.76), end=np.float64(2.92), word=' is', probability=np.float64(0.9560546875)),
 Word(start=np.float64(2.92), end=np.float64(3.3), word=' William', probability=np.float64(0.900390625)),
 Word(start=np.float64(3.3), end=np.float64(4.02), word=' Colding', probability=np.float64(0.6488037109375)),
 Word(start=np.float64(4.02), end=np.float64(4.42),

In [44]:
def longest_common_prefix_word_level(word_list_1, word_list_2):
    # Find the minimum length of the two strings
    min_length = min(len(word_list_1), len(word_list_2))
    
    # Iterate through characters up to the minimum length
    for i in range(min_length):
        # If we find a mismatch, return the prefix up to this point
        if word_list_1[i].word != word_list_2[i].word:
            return word_list_1[:i], word_list_1[:i][0].start, word_list_1[:i][-1].end
    
    # If we reached the end of the loop without finding a mismatch,
    # the entire shorter string is a prefix of the longer one
    lcp = word_list_1[:min_length]
    return lcp, lcp[0].start. lcp[-1].end

In [34]:
print(seg_words[0])
print("".join([i.word for i in seg_words[0]]))
print(seg_words[1])
print("".join([i.word for i in seg_words[1]]))
com_pre = longest_common_prefix_word_level(seg_words[0], seg_words[1])
print(com_pre)
print("".join([i.word for i in com_pre]))

[Word(start=np.float64(0.0), end=np.float64(0.94), word=' William', probability=np.float64(0.71533203125)), Word(start=np.float64(0.94), end=np.float64(1.56), word=' Golding', probability=np.float64(0.53564453125)), Word(start=np.float64(1.56), end=np.float64(1.8), word=' was', probability=np.float64(0.96484375)), Word(start=np.float64(1.8), end=np.float64(2.06), word=' born', probability=np.float64(0.8974609375)), Word(start=np.float64(2.06), end=np.float64(2.3), word=' in', probability=np.float64(0.9326171875)), Word(start=np.float64(2.3), end=np.float64(2.96), word=' Cornwall,', probability=np.float64(0.742431640625)), Word(start=np.float64(2.96), end=np.float64(3.34), word=' England', probability=np.float64(0.9375)), Word(start=np.float64(3.34), end=np.float64(3.8), word=' in', probability=np.float64(0.75732421875)), Word(start=np.float64(3.8), end=np.float64(4.52), word=' 1911.', probability=np.float64(0.96826171875))]
 William Golding was born in Cornwall, England in 1911.
[Word(

In [58]:
segments_list = []
for seg in transcribed_bits:
    for i in seg:
        segments_list.append(i.words)

In [59]:
segments_list[0]

[Word(start=np.float64(0.0), end=np.float64(0.94), word=' William', probability=np.float64(0.71533203125)),
 Word(start=np.float64(0.94), end=np.float64(1.56), word=' Golding', probability=np.float64(0.53564453125)),
 Word(start=np.float64(1.56), end=np.float64(1.8), word=' was', probability=np.float64(0.96484375)),
 Word(start=np.float64(1.8), end=np.float64(2.06), word=' born', probability=np.float64(0.8974609375)),
 Word(start=np.float64(2.06), end=np.float64(2.3), word=' in', probability=np.float64(0.9326171875)),
 Word(start=np.float64(2.3), end=np.float64(2.96), word=' Cornwall,', probability=np.float64(0.742431640625)),
 Word(start=np.float64(2.96), end=np.float64(3.34), word=' England', probability=np.float64(0.9375)),
 Word(start=np.float64(3.34), end=np.float64(3.8), word=' in', probability=np.float64(0.75732421875)),
 Word(start=np.float64(3.8), end=np.float64(4.52), word=' 1911.', probability=np.float64(0.96826171875))]

In [None]:
final_text = ""
previous_txt_seg = []
for segment in segments_list:
    if previous_txt_seg != []:
        confirmed_txt_seg, start, end = longest_common_prefix_word_level(previous_txt_seg, segment)
        final_text += "".join([i.word for i in confirmed_txt_seg])
        previous_txt_seg = segment[len(confirmed_txt_seg):]
    else:
        previous_txt_seg = segment
    print(final_text)


 William Golding was born in Cornwall, England in


IndexError: list index out of range

In [50]:
for seg in transcribed_bits:
    seg_words = [i.words for i in seg]
    print(seg_words)

[[Word(start=np.float64(0.0), end=np.float64(0.94), word=' William', probability=np.float64(0.71533203125)), Word(start=np.float64(0.94), end=np.float64(1.56), word=' Golding', probability=np.float64(0.53564453125)), Word(start=np.float64(1.56), end=np.float64(1.8), word=' was', probability=np.float64(0.96484375)), Word(start=np.float64(1.8), end=np.float64(2.06), word=' born', probability=np.float64(0.8974609375)), Word(start=np.float64(2.06), end=np.float64(2.3), word=' in', probability=np.float64(0.9326171875)), Word(start=np.float64(2.3), end=np.float64(2.96), word=' Cornwall,', probability=np.float64(0.742431640625)), Word(start=np.float64(2.96), end=np.float64(3.34), word=' England', probability=np.float64(0.9375)), Word(start=np.float64(3.34), end=np.float64(3.8), word=' in', probability=np.float64(0.75732421875)), Word(start=np.float64(3.8), end=np.float64(4.52), word=' 1911.', probability=np.float64(0.96826171875))]]
[[Word(start=np.float64(0.0), end=np.float64(0.92), word=' W

In [51]:
seg_words

[[Word(start=np.float64(0.0), end=np.float64(0.24), word=' a', probability=np.float64(0.6787109375)),
  Word(start=np.float64(0.24), end=np.float64(0.72), word=' rebel', probability=np.float64(0.796875)),
  Word(start=np.float64(0.72), end=np.float64(1.3), word=' and', probability=np.float64(0.94677734375)),
  Word(start=np.float64(1.3), end=np.float64(1.6), word=' after', probability=np.float64(0.943359375)),
  Word(start=np.float64(1.6), end=np.float64(2.0), word=' three', probability=np.float64(0.814208984375)),
  Word(start=np.float64(2.0), end=np.float64(2.46), word=' years', probability=np.float64(0.93505859375)),
  Word(start=np.float64(2.46), end=np.float64(2.68), word=' of', probability=np.float64(0.81201171875)),
  Word(start=np.float64(2.68), end=np.float64(3.22), word=' Oxford', probability=np.float64(0.86962890625)),
  Word(start=np.float64(3.22), end=np.float64(4.24), word=' changed', probability=np.float64(0.88818359375)),
  Word(start=np.float64(4.24), end=np.float64(4.

In [15]:
segments_results = []
for segments, _ in results_list:
    segments = [i for i in segments]
    segments_results.append(segments)

In [22]:
transcribed_bits[0][0].tokens

[50364,
 2425,
 11,
 577,
 366,
 291,
 884,
 30,
 1222,
 1315,
 307,
 3780,
 293,
 220,
 3322,
 2146,
 393,
 2903,
 293,
 3191,
 309,
 13,
 50614]

In [17]:
from transformers import AutoTokenizer

stt_tokenizer = AutoTokenizer.from_pretrained("distil-whisper/distil-large-v2")

In [23]:
stt_tokenizer.convert_ids_to_tokens(transcribed_bits[0][0].tokens)

['<|0.00|>',
 'ĠHello',
 ',',
 'Ġhow',
 'Ġare',
 'Ġyou',
 'Ġdoing',
 '?',
 'ĠMy',
 'Ġname',
 'Ġis',
 'ula',
 'Ġand',
 'Ġ',
 'the',
 'Ġguy',
 'Ġcan',
 'Ġexplain',
 'Ġand',
 'Ġfix',
 'Ġit',
 '.',
 '<|5.00|>']

In [28]:
tokens_from_ids = stt_tokenizer.convert_ids_to_tokens(transcribed_bits[0][0].tokens)
string_from_toks = stt_tokenizer.convert_tokens_to_string(tokens_from_ids)
string_to_toks = stt_tokenizer.tokenize(string_from_toks)
print(tokens_from_ids, string_from_toks, string_to_toks)

['<|0.00|>', 'ĠHello', ',', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing', '?', 'ĠMy', 'Ġname', 'Ġis', 'ula', 'Ġand', 'Ġ', 'the', 'Ġguy', 'Ġcan', 'Ġexplain', 'Ġand', 'Ġfix', 'Ġit', '.', '<|5.00|>'] <|0.00|> Hello, how are you doing? My name isula and the guy can explain and fix it.<|5.00|> ['<|0.00|>', 'ĠHello', ',', 'Ġhow', 'Ġare', 'Ġyou', 'Ġdoing', '?', 'ĠMy', 'Ġname', 'Ġis', 'ula', 'Ġand', 'Ġ', 'the', 'Ġguy', 'Ġcan', 'Ġexplain', 'Ġand', 'Ġfix', 'Ġit', '.', '<|5.00|>']


In [29]:
tokens_from_ids

['<|0.00|>',
 'ĠHello',
 ',',
 'Ġhow',
 'Ġare',
 'Ġyou',
 'Ġdoing',
 '?',
 'ĠMy',
 'Ġname',
 'Ġis',
 'ula',
 'Ġand',
 'Ġ',
 'the',
 'Ġguy',
 'Ġcan',
 'Ġexplain',
 'Ġand',
 'Ġfix',
 'Ġit',
 '.',
 '<|5.00|>']

In [34]:
all_match = all(a == b for a, b in zip(tokens_from_ids, string_to_toks)) and len(tokens_from_ids) == len(string_to_toks)
all_match

True

In [14]:
for segments in transcribed_bits:
    segment_bits = []
    for seg in segments:
        print(seg)
        segment_bits.append(seg)

In [11]:
segment_bits

[]

In [18]:
def longest_common_prefix(str1, str2):
    # Find the minimum length of the two strings
    min_length = min(len(str1), len(str2))
    
    # Iterate through characters up to the minimum length
    for i in range(min_length):
        # If we find a mismatch, return the prefix up to this point
        if str1[i] != str2[i]:
            return str1[:i]
    
    # If we reached the end of the loop without finding a mismatch,
    # the entire shorter string is a prefix of the longer one
    return str1[:min_length]

In [24]:
final_text = ""
previous_txt_seg = ""
for segments in segments_results:
    seg_text = " ".join([i.text for i in segments])
    if previous_txt_seg != "":
        confirmed_txt_seg = longest_common_prefix(previous_txt_seg, seg_text)
        final_text += confirmed_txt_seg
        previous_txt_seg = seg_text[len(confirmed_txt_seg):]
    else:
        previous_txt_seg = seg_text
    print(final_text)


 William Golding was born in Cornwall, England in 1911
 William Golding was born in Cornwall, England in 1911 and is a graduate of Oxford University. 


KeyboardInterrupt: 

In [19]:
texts = ["William Golding was born in Cornwall, England in 1911.", "William Golding was born in Cornwall, England in 1911 and is a graduate of Oxford University.  He states that he was brought up to"]
longest_common_prefix(texts[0], texts[1])

'William Golding was born in Cornwall, England in 1911'