In [14]:
! nvidia-smi

Thu Mar 14 00:01:43 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.01              Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4070        On  | 00000000:24:00.0  On |                  N/A |
|  0%   36C    P8               9W / 200W |   2583MiB / 12282MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [15]:
# ! mkdir soundbites

In [16]:
# ! pip install ipywidgets

In [17]:
import torch
import numpy as np
import gradio as gr
import IPython

from TTS.api import TTS
from pprint import pprint
from IPython.display import Audio
from langchain_community.llms import Ollama
from optimum.bettertransformer import BetterTransformer
from langchain.callbacks.manager import CallbackManager
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.base import BaseCallbackHandler

In [18]:
import pygame
pygame.mixer.init()

In [19]:
tts = TTS("tts_models/en/vctk/vits").to('cuda')

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


In [20]:
# wav = tts.tts(text="I'm just an AI, I don't have feelings or emotions like humans do, so I can't experience the world in the same way that you do. However, I'm here to help answer any questions you may have, provide information, and assist you in any way I can. Is there something specific you would like to know or discuss?", speaker=tts.speakers[2]) #, speaker_wav="audio/scarlett_johanson.wav", language="en")
# Audio(np.array(wav), rate=20000, autoplay=True)

In [21]:
# llm = Ollama( model="openhermes", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]), )

In [22]:
class MyStreamingCallback(BaseCallbackHandler):
    def __init__(self):
        self.content = ""
        
    def on_llm_new_token(self, token: str, **kwargs):
        self.content += token
        if self.content.strip()[-1] in ['.', '?', '!', ':']:
            wav = tts.tts(self.content, speaker=tts.speakers[2], verbose=False)
            IPython.display.display(Audio(np.array(wav), rate=20000))
            print(self.content)
            self.content = ""
            sleep(len(wav) / 20000)

handler = MyStreamingCallback()

In [23]:
class MyStreamingCallback(BaseCallbackHandler):
    def __init__(self):
        self.content = ""
        
    def on_llm_new_token(self, token: str, **kwargs):
        self.content += token
        if self.content.strip()[-1] in ['.', '?', '!', ':']:
            wav = tts.tts(self.content, speaker=tts.speakers[2], verbose=False)
            filename = "soundbites/"+ self.content[:12] + ".wav"
            tts.tts_to_file(text=self.content, speaker=tts.speakers[2], file_path=filename)
            
            if pygame.mixer.music.get_busy() == True:
                pygame.mixer.music.queue(filename)
            else:
                pygame.mixer.music.load(filename)
                pygame.mixer.music.play()
            print(self.content)
            self.content = ""

handler = MyStreamingCallback()

In [24]:
filename = "soundbites/ However, I'.wav"

In [25]:
pygame.mixer.music.get_busy()

False

In [26]:
pygame.mixer.music.load(filename)
pygame.mixer.music.play()

In [27]:
llm2 = Ollama(
    model="openhermes",
    callbacks=[handler],
)

In [28]:
llm2.invoke("Hey how are you doing?")

 > Text splitted to sentences.
["I'm doing well, thank you!"]
 > Processing time: 0.989565372467041
 > Real-time factor: 0.5999757056450246
 > Text splitted to sentences.
["I'm doing well, thank you!"]
 > Processing time: 0.27623534202575684
 > Real-time factor: 0.17233446388829612
I'm doing well, thank you!
 > Text splitted to sentences.
['How about you?']




 > Processing time: 0.32928037643432617
 > Real-time factor: 0.2699521230062794
 > Text splitted to sentences.
['How about you?']
 > Processing time: 0.1315593719482422
 > Real-time factor: 0.10994861095583461
 How about you?


"I'm doing well, thank you! How about you?"

In [29]:
# llm("How are you feeling today?")

In [30]:
whisper_model_id = "distil-whisper/distil-medium.en"

whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id, torch_dtype=torch.float16, low_cpu_mem_usage=True, use_safetensors=True# , use_flash_attention_2=True
)
whisper.to("cuda")
# model = model.to_bettertransformer() # we are using optimum BetterTransformer since Flash Attention 2 isn't supported on Colab
processor = AutoProcessor.from_pretrained(whisper_model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [31]:
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15, #long form transcription
    batch_size=16,
    torch_dtype=torch.float16,
    device='cuda',
)

In [None]:
import gradio as gr
import numpy as np

def transcribe(filepath):
    output = asr_pipe(
        filepath,
    )
    return output["text"]


def transcribe_streaming(stream, new_chunk):
    sr, y = new_chunk
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    transcribed_stream = asr_pipe({"sampling_rate": sr, "raw": stream})["text"]
    if transcribed_stream.strip().endswith('?') and len(transcribed_stream) > 30:
        for w in llm(transcribed_stream):
            self.content += transcribed_stream
            wav = tts.tts(self.content, speaker=tts.speakers[2], verbose=False)
            filename = "soundbites/"+ self.content[:12] + ".wav"
            tts.tts_to_file(text=self.content, speaker=tts.speakers[2], file_path=filename)
            
            if pygame.mixer.music.get_busy() == True:
                pygame.mixer.music.queue(filename)
            else:
                pygame.mixer.music.load(filename)
                pygame.mixer.music.play()
            print(self.content)
            self.content = ""
    
    return stream, transcribed_stream

demo = gr.Blocks()

mic_transcribe = gr.Interface(
    title='My Audio Transcription App Powered by Distill Whisper',
    description="Start recording",
    fn=transcribe_streaming,
    inputs=["state", gr.Audio(sources="microphone", streaming=True)],
    outputs=["state", "text"],
    live=True,
)


file_transcribe = gr.Interface(
    title='My Audio Transcription App Powered by Distill Whisper',
    description="Upload an audio file",
    fn=transcribe,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(),
)


gr.close_all()

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone",  "Transcribe Audio File", ],
    )

demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Traceback (most recent call last):
  File "/home/ismail/envs/hym/lib/python3.10/site-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
  File "/home/ismail/envs/hym/lib/python3.10/site-packages/gradio/route_utils.py", line 232, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/ismail/envs/hym/lib/python3.10/site-packages/gradio/blocks.py", line 1561, in process_api
    result = await self.call_function(
  File "/home/ismail/envs/hym/lib/python3.10/site-packages/gradio/blocks.py", line 1179, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/ismail/envs/hym/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/home/ismail/envs/hym/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2134, in run_sync_in_worker_thread
    return await future
  File "/home/ismail/envs/hym

In [None]:
torch.float16