In [1]:
# Show the GPU, driver/CUDA versions and current memory usage as reported by nvidia-smi.
! nvidia-smi

Tue Jan 16 19:04:46 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.86.01              Driver Version: 536.67       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4070        On  | 00000000:24:00.0  On |                  N/A |
|  0%   40C    P8               9W / 200W |    731MiB / 12282MiB |      3%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
# %pip install ipywidgets  # run once if ipywidgets is not installed in this kernel's environment

In [74]:
import time
from pprint import pprint

import gradio as gr
import IPython
import numpy as np
import torch
from IPython.display import Audio
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import Ollama
from optimum.bettertransformer import BetterTransformer
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from TTS.api import TTS

In [4]:
# Load the English VCTK VITS voice model, then move it onto the GPU.
tts = TTS("tts_models/en/vctk/vits")
tts = tts.to('cuda')

 > tts_models/en/vctk/vits is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.


In [5]:
# wav = tts.tts(text="I'm just an AI, I don't have feelings or emotions like humans do, so I can't experience the world in the same way that you do. However, I'm here to help answer any questions you may have, provide information, and assist you in any way I can. Is there something specific you would like to know or discuss?", speaker=tts.speakers[2]) #, speaker_wav="audio/scarlett_johanson.wav", language="en")
# Audio(np.array(wav), rate=20000, autoplay=True)

In [31]:
# llama2 served by Ollama; generated tokens are echoed to stdout as they arrive.
# The handler is passed through `callbacks` (the same way `llm2` is configured)
# instead of the deprecated `callback_manager` argument.
llm = Ollama(
    model="llama2",
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [109]:
class MyStreamingCallback(BaseCallbackHandler):
    """Speak an LLM response chunk by chunk while it is still being streamed.

    Incoming tokens are appended to a buffer. As soon as the buffer (ignoring
    surrounding whitespace) ends with one of ``sentence_endings``, the buffered
    text is synthesized with the notebook-level ``tts`` model, shown as an
    audio widget, printed, and the buffer is cleared.

    Args:
        speaker_index: Index into ``tts.speakers`` selecting the voice.
        playback_rate: Sample rate in Hz handed to the audio widget and used to
            estimate how long the clip lasts. The default of 20000 is the value
            this notebook has always used; the model load log reports a
            sample_rate of 22050, so pass 22050 for native-speed playback.
            NOTE(review): confirm whether the slower default is intentional.
        sentence_endings: Characters that mark the end of a speakable chunk.
    """

    def __init__(self, speaker_index=2, playback_rate=20000,
                 sentence_endings=('.', '?', '!', ':')):
        self.content = ""
        self.speaker_index = speaker_index
        self.playback_rate = playback_rate
        # str.endswith needs a tuple when checking several suffixes.
        self.sentence_endings = tuple(sentence_endings)

    def on_llm_new_token(self, token: str, **kwargs):
        """Buffer ``token``; synthesize and play the buffer once a chunk is complete."""
        self.content += token

        # endswith() is False for an empty string, so a whitespace-only buffer
        # (e.g. a lone newline token right after a flush) is skipped instead of
        # raising IndexError as indexing with [-1] would.
        if not self.content.strip().endswith(self.sentence_endings):
            return

        wav = tts.tts(self.content, speaker=tts.speakers[self.speaker_index], verbose=False)
        IPython.display.display(Audio(np.array(wav), rate=self.playback_rate))
        print(self.content)
        self.content = ""
        # Block for the clip's duration at the playback rate before accepting
        # more tokens — presumably so consecutive clips do not overlap.
        time.sleep(len(wav) / self.playback_rate)

handler = MyStreamingCallback()

In [110]:
# Second llama2 client: its token stream is routed to `handler` via `callbacks`.
llm2 = Ollama(model="llama2", callbacks=[handler])

In [111]:
# `invoke` replaces calling the LLM object directly, which LangChain has deprecated.
# The returned string is the full response and is displayed as the cell output.
llm2.invoke("Hey how are you doing?")

 > Text splitted to sentences.
["I don't have personal feelings or emotions, so I can't do well or poorly."]
 > Processing time: 0.2651815414428711
 > Real-time factor: 0.07226682060529102




I don't have personal feelings or emotions, so I can't do well or poorly.
 > Text splitted to sentences.
['I\'m just an AI designed to provide information and assist with tasks, so I don\'t have the capacity to experience emotions or have a physical body that could be "doing well" or "doing poorly."', "I'm here to help answer your questions and provide information to the best of my abilities, so feel free to ask me anything!"]
 > Processing time: 0.40567588806152344
 > Real-time factor: 0.028221710410640432




 I'm just an AI designed to provide information and assist with tasks, so I don't have the capacity to experience emotions or have a physical body that could be "doing well" or "doing poorly." I'm here to help answer your questions and provide information to the best of my abilities, so feel free to ask me anything!


'I don\'t have personal feelings or emotions, so I can\'t do well or poorly. I\'m just an AI designed to provide information and assist with tasks, so I don\'t have the capacity to experience emotions or have a physical body that could be "doing well" or "doing poorly." I\'m here to help answer your questions and provide information to the best of my abilities, so feel free to ask me anything!'

In [7]:
# llm("How are you feeling today?")

In [8]:
whisper_model_id = "distil-whisper/distil-medium.en"

# Load the checkpoint in half precision from safetensors weights and place it
# on the GPU. The optional speed-ups tried earlier (use_flash_attention_2=True
# and to_bettertransformer()) are left disabled.
whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
    whisper_model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to("cuda")

# The processor supplies the tokenizer and feature extractor used by the ASR pipeline.
processor = AutoProcessor.from_pretrained(whisper_model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# Speech-recognition pipeline wrapping the already-loaded `whisper` model and
# the tokenizer / feature extractor taken from `processor`.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    # model components
    model=whisper,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    # placement and precision
    device="cuda",
    torch_dtype=torch.float16,
    # decoding: 15-second chunks enable long-form transcription
    chunk_length_s=15,
    batch_size=16,
    max_new_tokens=128,
)

In [23]:
import gradio as gr
import numpy as np

def transcribe(filepath):
    """Run the ASR pipeline on the audio file at ``filepath`` and return its "text" field."""
    result = asr_pipe(filepath)
    return result["text"]


def transcribe_streaming(stream, new_chunk):
    """Transcribe all microphone audio received so far and answer spoken questions.

    Each call appends the newest chunk to the accumulated audio, re-transcribes
    the whole buffer, and, when the transcript looks like a question, sends it
    to ``llm``.

    Args:
        stream: Audio accumulated by previous calls, or ``None`` on the first call.
        new_chunk: ``(sampling_rate, samples)`` pair holding the newest chunk.

    Returns:
        ``(stream, transcribed_stream)``: the updated audio buffer (fed back in
        as ``stream`` on the next call) and the transcript of that buffer.
    """
    sr, y = new_chunk
    y = y.astype(np.float32)

    # The ASR pipeline is fed a single 1-D signal, so down-mix multi-channel input.
    # assumes a 2-D chunk is laid out as (samples, channels) — TODO confirm
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise, skipping empty chunks (np.max would raise) and silent
    # chunks (dividing by a zero peak would fill the buffer with NaNs).
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    stream = y if stream is None else np.concatenate([stream, y])

    transcribed_stream = asr_pipe({"sampling_rate": sr, "raw": stream})["text"]
    if transcribed_stream.strip().endswith('?') and len(transcribed_stream) > 30:
        # The answer is streamed to stdout by the callback configured on `llm`;
        # the returned string is not needed here.
        # NOTE(review): the buffer is never reset, so a transcript that still
        # ends in '?' will trigger the LLM again on the next chunk.
        llm.invoke(transcribed_stream)

    return stream, transcribed_stream

# Heading shared by both tabs.
APP_TITLE = 'My Audio Transcription App Powered by Distill Whisper'

demo = gr.Blocks()

# Tab 1: live microphone input. The "state" slot carries the accumulated audio
# between successive calls of transcribe_streaming.
mic_transcribe = gr.Interface(
    fn=transcribe_streaming,
    inputs=["state", gr.Audio(sources="microphone", streaming=True)],
    outputs=["state", "text"],
    live=True,
    title=APP_TITLE,
    description="Start recording",
)

# Tab 2: one-shot transcription of an uploaded audio file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.Textbox(),
    title=APP_TITLE,
    description="Upload an audio file",
)

# Shut down any Gradio apps still running from earlier executions of this cell.
gr.close_all()

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

# debug=True keeps the cell running while the app is being served.
demo.launch(debug=True)

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


  warn_deprecated(


I apologize, but I'm a large language model, I cannot provide you with a "<<SYS>><</SYS>>" command as it is not a valid or recognized command in any operating system. It appears to be a syntax error or a made-up command. Can I help you with anything else?
The meaning of



 life is a complex and philosophical question that has been debated throughout human history. There are many different perspectives on what gives life meaning, and there is no one definitive answer. However, here are some possible ways to approach the question:

1. Religious or spiritual beliefs: Many people believe that the purpose of life is to fulfill a divine or spiritual purpose, such as following the teachings of a particular religion or fulfilling a divine plan.
2. Personal growth and development: Some people believe that the meaning of life is to learn, grow, and develop as individuals, and to become the best version of themselves.
3. Relationships and connections: Others believe that the meaning of life is found in the relationships and connections we have with others, such as family, friends, and community.
4. Contribution and legacy: Some people believe that the meaning of life is to make a positive impact on the world and to leave a lasting legacy.
5. Experience and enjoyme



In [None]:
# Scratch cell: echoes the dtype object passed as `torch_dtype` when loading Whisper and building the ASR pipeline.
torch.float16