In [1]:
import torch
import numpy as np
import gradio as gr
import IPython

from TTS.api import TTS
from pprint import pprint
from IPython.display import Audio
from langchain_community.llms import Ollama
from optimum.bettertransformer import BetterTransformer
from langchain.callbacks.manager import CallbackManager
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler
import pygame

pygame 2.5.2 (SDL 2.28.2, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
pygame.mixer.init()

In [3]:
tts = TTS("tts_models/en/vctk/vits").to('cuda')

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


In [4]:
class AudioPlaybackCallback(BaseCallbackHandler):
    def __init__(self):
        self.content = ""
        
    def on_llm_new_token(self, token: str, **kwargs):
        self.content += token
        if self.content.strip()[-1] in ['.', '?', '!', ':']:
            wav = tts.tts(self.content, speaker=tts.speakers[2], verbose=False)
            filename = "soundbites/"+ self.content[:12] + ".wav"
            tts.tts_to_file(text=self.content, speaker=tts.speakers[2], file_path=filename)
            
            if pygame.mixer.music.get_busy() == True:
                pygame.mixer.music.queue(filename)
            else:
                pygame.mixer.music.load(filename)
                pygame.mixer.music.play()
            print(self.content)
            self.content = ""

handler = AudioPlaybackCallback()

In [5]:
llm = Ollama(
    model="llama2",
    callbacks=[handler],
)

In [6]:
whisper_model_id = "distil-whisper/distil-medium.en"

whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True# , use_flash_attention_2=True
)
whisper.to("cuda")
# model = model.to_bettertransformer() # we are using optimum BetterTransformer since Flash Attention 2 isn't supported on Colab
processor = AutoProcessor.from_pretrained(whisper_model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15, #long form transcription
    batch_size=16,
    torch_dtype=torch.float16,
    device='cuda',
)

In [8]:
def transcribe_streaming(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
    return stream, pipe({"sampling_rate": sr, "raw": stream})["text"]


In [13]:
def transcribe_streaming(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    transcribed_stream = asr_pipe({"sampling_rate": sr, "raw": stream})["text"]
    return stream, transcribed_stream

def answer_questions(stream):
    if stream.strip().endswith('?'):
        llm(stream)
    else:
        llm("Say: Sorry, I can't help you with that")

with gr.Blocks() as demo:
    textbox = gr.Textbox()
    mic_transcribe = gr.Interface(
        title='My Audio Transcription App Powered by Distill Whisper',
        description="Start recording",
        fn=transcribe_streaming,
        inputs=["state", gr.Audio(sources="microphone", streaming=True)],
        outputs=["state", textbox],
        live=True,
    )
    button = gr.Button("Answer")
    button.click(answer_questions, textbox, None)

demo.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.






 > Text splitted to sentences.
["The sky appears blue because of a phenomenon called Rayleigh scattering, which occurs when light travels through the Earth's atmosphere."]
 > Processing time: 0.1654205322265625
 > Real-time factor: 0.028266605204554426
 > Text splitted to sentences.
["The sky appears blue because of a phenomenon called Rayleigh scattering, which occurs when light travels through the Earth's atmosphere."]
 > Processing time: 0.17553949356079102
 > Real-time factor: 0.030602829166788758
 The sky appears blue because of a phenomenon called Rayleigh scattering, which occurs when light travels through the Earth's atmosphere.
 > Text splitted to sentences.
['Blue light has shorter wavelengths than other colors, so it scatters more easily and is visible from a greater distance.']
 > Processing time: 0.18590044975280762
 > Real-time factor: 0.03208441544340488
 > Text splitted to sentences.
['Blue light has shorter wavelengths than other colors, so it scatters more easily and 



 > Processing time: 0.17913603782653809
 > Real-time factor: 0.026016635275550405
 This is why the sky appears blue during the daytime, when the sunlight is scattered in all directions by the tiny molecules of gases in the atmosphere.
 > Text splitted to sentences.
['The reason why the sky appears blue at sunrise and sunset is due to a different phenomenon called Mie scattering.']
 > Processing time: 0.1715083122253418
 > Real-time factor: 0.03282320410853342
 > Text splitted to sentences.
['The reason why the sky appears blue at sunrise and sunset is due to a different phenomenon called Mie scattering.']
 > Processing time: 0.1668863296508789
 > Real-time factor: 0.0325903674436896


The reason why the sky appears blue at sunrise and sunset is due to a different phenomenon called Mie scattering.
 > Text splitted to sentences.
['This occurs when light travels through a layer of atmosphere that contains particles, such as dust or water droplets, that are larger than the wavelength of th



 > Processing time: 0.20679450035095215
 > Real-time factor: 0.026503177791886535
 In this case, the light scatters in all directions, but it appears more red and orange because these colors have longer wavelengths and are scattered less than blue and violet light.
 > Text splitted to sentences.
["It's also worth noting that the color of the sky can be affected by other factors, such as pollution, dust, and smoke, which can scatter light in different ways and change the appearance of the sky."]
 > Processing time: 0.24992728233337402
 > Real-time factor: 0.028064127431408868
 > Text splitted to sentences.
["It's also worth noting that the color of the sky can be affected by other factors, such as pollution, dust, and smoke, which can scatter light in different ways and change the appearance of the sky."]
 > Processing time: 0.2117013931274414
 > Real-time factor: 0.023865111035072002


It's also worth noting that the color of the sky can be affected by other factors, such as pollution,



 > Processing time: 0.1595158576965332
 > Real-time factor: 0.03624613213322916
 However, under normal conditions, the blue color of the sky is due to Rayleigh scattering.




 > Text splitted to sentences.
['There are many new and exciting developments under the sun, and here are a few examples:']




 > Processing time: 0.2076575756072998
 > Real-time factor: 0.04872775351333391
 > Text splitted to sentences.
['There are many new and exciting developments under the sun, and here are a few examples:']
 > Processing time: 0.18135499954223633
 > Real-time factor: 0.043262914790400626

There are many new and exciting developments under the sun, and here are a few examples:
 > Text splitted to sentences.
['1.']
 > Processing time: 0.1886143684387207
 > Real-time factor: 0.18876846514496148
 > Text splitted to sentences.
['1.']
 > Processing time: 0.25141239166259766
 > Real-time factor: 0.25760423959852596


1.
 > Text splitted to sentences.
['Space Exploration:']
 > Processing time: 0.21174073219299316
 > Real-time factor: 0.1265966145568194
 > Text splitted to sentences.
['Space Exploration:']
 > Processing time: 0.2980656623840332
 > Real-time factor: 0.18595370800045077
 Space Exploration:
 > Text splitted to sentences.
['With the ongoing space race between nations, there have been 



 Private companies like SpaceX and Blue Origin are leading the charge in developing reusable rockets for space travel, while NASA continues to push the boundaries of space exploration with its Artemis program.
 > Text splitted to sentences.
['2.']
 > Processing time: 0.20374655723571777
 > Real-time factor: 0.20631023085266242
 > Text splitted to sentences.
['2.']
 > Processing time: 0.15029454231262207
 > Real-time factor: 0.1658323988187208

2.
 > Text splitted to sentences.
['Artificial Intelligence:']
 > Processing time: 0.21162843704223633
 > Real-time factor: 0.10781901656149055
 > Text splitted to sentences.
['Artificial Intelligence:']
 > Processing time: 0.14447355270385742
 > Real-time factor: 0.07973673000400622
 Artificial Intelligence:
 > Text splitted to sentences.
['AI has made tremendous progress in recent years, with applications in various fields such as healthcare, finance, and transportation.']
 > Processing time: 0.20152735710144043
 > Real-time factor: 0.028221714



 Machine learning algorithms are becoming more sophisticated, enabling self-driving cars, personalized medicine, and efficient supply chain management.
 > Text splitted to sentences.
['3.']
 > Processing time: 0.18009114265441895
 > Real-time factor: 0.17034187094757797
 > Text splitted to sentences.
['3.']
 > Processing time: 0.0647573471069336
 > Real-time factor: 0.06557216677571114

3.
 > Text splitted to sentences.
['Quantum Computing:']
 > Processing time: 0.17195558547973633
 > Real-time factor: 0.11842893115405378
 > Text splitted to sentences.
['Quantum Computing:']
 > Processing time: 0.21113181114196777
 > Real-time factor: 0.1489841409267918
 Quantum Computing:
 > Text splitted to sentences.
['Quantum computing is a rapidly growing field that has the potential to revolutionize computing as we know it.']
 > Processing time: 0.21406006813049316
 > Real-time factor: 0.043376199293094525
 > Text splitted to sentences.
['Quantum computing is a rapidly growing field that has the 



 These computers can perform certain calculations much faster than classical computers, which could lead to breakthroughs in fields such as cryptography, drug discovery, and climate modeling.
 > Text splitted to sentences.
['4.']
 > Processing time: 0.11007142066955566
 > Real-time factor: 0.11278228744255123
 > Text splitted to sentences.
['4.']




 > Processing time: 0.19362854957580566
 > Real-time factor: 0.20323255512883256

4.
 > Text splitted to sentences.
['5G Networks:']
 > Processing time: 0.10798883438110352
 > Real-time factor: 0.0743738692561011
 > Text splitted to sentences.
['5G Networks:']
 > Processing time: 0.20234251022338867
 > Real-time factor: 0.13825149821596802
 5G Networks:
 > Text splitted to sentences.
['The rollout of 5G networks is underway, providing faster data transfer speeds and lower latency compared to previous generations of wireless technology.']
 > Processing time: 0.22819733619689941
 > Real-time factor: 0.03185297821800385
 > Text splitted to sentences.
['The rollout of 5G networks is underway, providing faster data transfer speeds and lower latency compared to previous generations of wireless technology.']
 > Processing time: 0.18493175506591797
 > Real-time factor: 0.024655032886738724
 The rollout of 5G networks is underway, providing faster data transfer speeds and lower latency compared



 > Processing time: 0.23807001113891602
 > Real-time factor: 0.04244924752242446
 > Text splitted to sentences.
['This will enable new use cases such as remote healthcare services, autonomous vehicles, and smart cities.']
 > Processing time: 0.1405503749847412
 > Real-time factor: 0.0250609374467391
 This will enable new use cases such as remote healthcare services, autonomous vehicles, and smart cities.
 > Text splitted to sentences.
['5.']
 > Processing time: 0.21460914611816406
 > Real-time factor: 0.19651709600936537
 > Text splitted to sentences.
['5.']
 > Processing time: 0.3343501091003418
 > Real-time factor: 0.31976144629001285

5.
 > Text splitted to sentences.
['Electric Vehicles:']
 > Processing time: 0.2883138656616211
 > Real-time factor: 0.20179408131788806
 > Text splitted to sentences.
['Electric Vehicles:']
 > Processing time: 0.23966455459594727
 > Real-time factor: 0.15398028638813044
 Electric Vehicles:
 > Text splitted to sentences.
['Electric vehicles (EVs) are b



 > Processing time: 0.2247776985168457
 > Real-time factor: 0.03722770890139743
 As battery technology improves, EVs are set to become even more viable alternatives to traditional gasoline-powered vehicles.
 > Text splitted to sentences.
['6.']
 > Processing time: 0.12488913536071777
 > Real-time factor: 0.1143606908099596
 > Text splitted to sentences.
['6.']
 > Processing time: 0.17551350593566895
 > Real-time factor: 0.15260539455368693

6.
 > Text splitted to sentences.
['Virtual and Augmented Reality:']
 > Processing time: 0.21667838096618652
 > Real-time factor: 0.11171339086009197
 > Text splitted to sentences.
['Virtual and Augmented Reality:']
 > Processing time: 0.2069416046142578
 > Real-time factor: 0.09844369998585573
 Virtual and Augmented Reality:
 > Text splitted to sentences.
['VR and AR technologies are advancing rapidly, enabling new forms of entertainment, education, and communication.']
 > Processing time: 0.2578706741333008
 > Real-time factor: 0.04198204640164857



 From gaming to remote work, these technologies have the potential to transform various aspects of our lives.
 > Text splitted to sentences.
['7.']
 > Processing time: 0.24582338333129883
 > Real-time factor: 0.22273198563671678
 > Text splitted to sentences.
['7.']
 > Processing time: 0.22838139533996582
 > Real-time factor: 0.20477430738639585

7.
 > Text splitted to sentences.
['Blockchain:']
 > Processing time: 0.24328064918518066
 > Real-time factor: 0.20941358192275272
 > Text splitted to sentences.
['Blockchain:']
 > Processing time: 0.21509075164794922
 > Real-time factor: 0.1730425815031115
 Blockchain:
 > Text splitted to sentences.
['The blockchain technology behind cryptocurrencies like Bitcoin has broader applications beyond digital currency.']
 > Processing time: 0.17702221870422363
 > Real-time factor: 0.03441370364674259
 > Text splitted to sentences.
['The blockchain technology behind cryptocurrencies like Bitcoin has broader applications beyond digital currency.']
 > 



 > Processing time: 0.2314145565032959
 > Real-time factor: 0.03527465829898293
 It can securely store and transfer data, enabling new use cases such as decentralized finance (DeFi) and non-fungible tokens (NFTs).
 > Text splitted to sentences.
['8.']
 > Processing time: 0.14329004287719727
 > Real-time factor: 0.1450930127407329
 > Text splitted to sentences.
['8.']
 > Processing time: 0.22840332984924316
 > Real-time factor: 0.22089006241999173

8.
 > Text splitted to sentences.
['Synthetic Biology:']
 > Processing time: 0.22388648986816406
 > Real-time factor: 0.14713570283717864
 > Text splitted to sentences.
['Synthetic Biology:']
 > Processing time: 0.20339536666870117
 > Real-time factor: 0.12507998201263
 Synthetic Biology:
 > Text splitted to sentences.
['This field involves the design and construction of new biological systems, such as microbes that can produce biofuels or clean up environmental pollutants.']
 > Processing time: 0.19318699836730957
 > Real-time factor: 0.0250



 > Processing time: 0.19899535179138184
 > Real-time factor: 0.0360795249555977
 > Text splitted to sentences.
['Synthetic biology has the potential to revolutionize various industries, from agriculture to pharmaceuticals.']
 > Processing time: 0.1881577968597412
 > Real-time factor: 0.03341128254056576
 Synthetic biology has the potential to revolutionize various industries, from agriculture to pharmaceuticals.
 > Text splitted to sentences.
['9.']
 > Processing time: 0.13103246688842773
 > Real-time factor: 0.13113951955745423
 > Text splitted to sentences.
['9.']
 > Processing time: 0.1121530532836914
 > Real-time factor: 0.1122446815951977

9.
 > Text splitted to sentences.
['Robotics:']
 > Processing time: 0.1190803050994873
 > Real-time factor: 0.1035378835742782
 > Text splitted to sentences.
['Robotics:']
 > Processing time: 0.1753709316253662
 > Real-time factor: 0.13360036768723482
 Robotics:
 > Text splitted to sentences.
['Advances in robotics are enabling the development o



 > Processing time: 0.1973130702972412
 > Real-time factor: 0.036387270842149814
 Robots are also being used in logistics and transportation, improving efficiency and reducing costs.
 > Text splitted to sentences.
['10.']
 > Processing time: 0.19170331954956055
 > Real-time factor: 0.18965623636341575
 > Text splitted to sentences.
['10.']
 > Processing time: 0.17923474311828613
 > Real-time factor: 0.18585995512406928

10.
 > Text splitted to sentences.
['Nanotechnology:']
 > Processing time: 0.11943244934082031
 > Real-time factor: 0.08427692997840143
 > Text splitted to sentences.
['Nanotechnology:']
 > Processing time: 0.18063855171203613
 > Real-time factor: 0.13178533831559014
 Nanotechnology:
 > Text splitted to sentences.
['Researchers are making breakthroughs in nanotechnology, enabling the development of new materials with unique properties.']
 > Processing time: 0.19725942611694336
 > Real-time factor: 0.03509982525725146
 > Text splitted to sentences.
['Researchers are maki



 These materials have potential applications in fields such as medicine, energy, and electronics.
 > Text splitted to sentences.
['These are just a few examples of the exciting developments underway in various fields.']
 > Processing time: 0.212266206741333
 > Real-time factor: 0.053293744974567236
 > Text splitted to sentences.
['These are just a few examples of the exciting developments underway in various fields.']
 > Processing time: 0.19959306716918945
 > Real-time factor: 0.0489699476041551


These are just a few examples of the exciting developments underway in various fields.
 > Text splitted to sentences.
['As technology continues to evolve at an unprecedented pace, we can expect even more innovative solutions to emerge in the future.']
 > Processing time: 0.21941041946411133
 > Real-time factor: 0.03326823460490466
 > Text splitted to sentences.
['As technology continues to evolve at an unprecedented pace, we can expect even more innovative solutions to emerge in the future.'



 > Processing time: 0.22778868675231934
 > Real-time factor: 0.0347835217651568
 As technology continues to evolve at an unprecedented pace, we can expect even more innovative solutions to emerge in the future.
