In [None]:
# Export build flags through the process environment
# (equivalent to CMAKE_ARGS="-DLLAMA_CUBLAS=on" in a shell).
# NOTE(review): presumably read by the llama-cpp-python build in the install cell below - confirm.
import os
os.environ.update({"CMAKE_ARGS": "-DLLAMA_CUBLAS=on"})
# Echo the value back so the cell output confirms what was set.
print(os.environ.get("CMAKE_ARGS"))

In [None]:
# llama-cpp-python, pinned to 0.2.34.
# NOTE(review): presumably compiled with the CMAKE_ARGS exported in the first cell - confirm.
!pip install llama-cpp-python==0.2.34
# Download the Q5_K_S GGUF file of stablelm-zephyr-3b into the current directory
# (symlinks into the Hugging Face cache are disabled).
!huggingface-cli download TheBloke/stablelm-zephyr-3b-GGUF stablelm-zephyr-3b.Q5_K_S.gguf --local-dir . --local-dir-use-symlinks False
# Whisper (installed from GitHub), gradio and gTTS; unlike llama-cpp-python these are not version-pinned.
!pip install -q git+https://github.com/openai/whisper.git
!pip install -q gradio
!pip install -q gTTS

In [None]:
import datetime
import os
# import time
from rich.console import Console
# Shared rich console used by later cells for the loading spinner and styled output;
# its width is fixed at 110 columns.
console = Console(width=110)

In [None]:
## Logger file
# The log file name is fixed once, when this cell runs: the current timestamp
# with the space between date and time replaced by an underscore.
tstamp = str(datetime.datetime.now()).replace(' ', '_')
logfile = f'{tstamp}_log.txt'
def writehistory(text):
    """Append ``text`` followed by a newline to the session log file.

    The file named by the module-level ``logfile`` is opened in append mode
    with UTF-8 encoding on every call, so entries accumulate in call order.

    Args:
        text: The entry to record; it is formatted into the written line.
    """
    # The ``with`` block closes the handle on exit, so no explicit close()
    # is needed afterwards.
    with open(logfile, 'a', encoding='utf-8') as f:
        f.write(f'{text}\n')

In [None]:
## Load the quantized StableLM Zephyr 3B model through llama-cpp-python
from llama_cpp import Llama

# A spinner is shown on the rich console while the weights are loading.
with console.status("Loading ✅✅✅✅ stablelm-zephyr-3b with LLAMA.CPP...", spinner="dots12"):
    llm_gpu = Llama(
        n_gpu_layers=35,  # layers to offload to the GPU, when GPU acceleration is available
        n_threads=8,      # CPU threads to use; tune to your system
        n_ctx=4096,       # max sequence length; longer contexts need much more resources
        model_path="/content/stablelm-zephyr-3b.Q5_K_S.gguf",  # the model file must be downloaded first
    )

# Record the successful load in the session log.
writehistory(f"{datetime.datetime.now()} Loaded 🧠 stablelm-zephyr-3b.Q5_K_S.gguf with GPU enabled")

In [None]:
# Smoke test: send one fixed question to the model and time the answer
prompt = "In short response, what is the capital of France?"

# Same chat markup that llm_call uses for its prompts
template = f"<|user|>\n{prompt}<|endoftext|>\n<|assistant|>"

start = datetime.datetime.now()
output = llm_gpu(
    template,
    max_tokens=512,   # generate up to 512 tokens
    temperature=0,
    echo=False,       # do not echo the prompt back in the output
    stop=["</s>"],    # example stop token - not necessarily correct for this specific model, check before using
)
delta = datetime.datetime.now() - start

# Show the question, the first completion, and the elapsed time; then log the timing.
console.print(f"[bright_green bold on black]Question: {prompt}")
console.print(output['choices'][0]['text'])
console.print(f"Completed in: [bold red]{delta}")
writehistory(f"{datetime.datetime.now()} Inference completed in: {delta}")

In [None]:
import re

def llm_call(input_text, max_tokens=200, temperature=0.1):
    """Answer ``input_text`` with the local model, in the "Tatianna" persona.

    The persona instructions and the user's text are wrapped in the
    ``<|user|> ... <|endoftext|> <|assistant|>`` markup and sent to
    ``llm_gpu``. The elapsed inference time is written to the session log
    through ``writehistory``.

    Args:
        input_text: The instruction or question to answer. ``None`` is
            treated as an empty string.
        max_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature forwarded to the model.

    Returns:
        The text of the first completion choice, or
        ``"No response generated."`` when the model returns nothing usable.
    """
    # Adjacent string literals are used instead of a backslash-continued
    # triple-quoted string so that source indentation no longer leaks into
    # the prompt as runs of spaces.
    persona = (
        "Act as Tatianna, a junior-level assistant characterized by your "
        "cheerful demeanor and unwavering helpfulness. "
        "Respond to this instruction or question, do not include information "
        "about yourself unless is part of the action or question: "
    )
    prompt = persona + (input_text or "")

    template = f"<|user|>\n{prompt}<|endoftext|>\n<|assistant|>"

    start = datetime.datetime.now()
    response = llm_gpu(
        template,  # Prompt
        temperature=temperature,
        max_tokens=max_tokens,
        stop=["</s>"],  # Example stop token - not necessarily correct for this specific model! Please check before using.
        echo=False,     # Do not echo the prompt back
    )
    delta = datetime.datetime.now() - start
    writehistory(f"{str(datetime.datetime.now())} Inference completed in: {delta}")

    # Guard against both a missing response and an empty list of choices.
    choices = response.get('choices') if response else None
    if not choices:
        return "No response generated."
    return choices[0]['text']

In [None]:
# Try the persona prompt once; as the last expression of the cell, the returned reply is the cell output.
llm_call("Hello, good afternoon, you are new here, what is your name?")

In [None]:
import warnings
from gtts import gTTS
import numpy as np
import torch
# Silence Python warnings raised from this point on.
warnings.filterwarnings("ignore")
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The Whisper model in the next cell is loaded onto this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

In [None]:
import whisper

# Load the "medium" Whisper checkpoint onto the device selected in DEVICE.
model = whisper.load_model("medium", device=DEVICE)

# Report whether the checkpoint is multilingual and how many parameters it has
# (element counts of every parameter tensor, summed).
print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} and has "
    f"{sum(np.prod(weights.shape) for weights in model.parameters()):,} parameters."
)

In [None]:
def transcribe(audio):
    """Turn a recorded question into a spoken answer.

    The recording is transcribed with the Whisper ``model``, the transcript
    is answered by ``llm_call``, and the answer is rendered to speech with
    gTTS and saved as ``Temp.mp3``.

    Args:
        audio: Path of the recorded audio file, or ``None`` / ``''`` when
            nothing has been recorded.

    Returns:
        ``('', '', None)`` when there is no audio. Otherwise the list
        ``[transcript, reply, audio_path]``, where ``audio_path`` is
        ``"Temp.mp3"``, or ``None`` when the reply is blank and there is
        nothing to synthesize.
    """
    # Nothing recorded: return empty texts and no audio file.
    if audio is None or audio == '':
        return ('', '', None)

    language = 'en'  # language used for the synthesized reply

    # Load the recording, pad or trim it to the input length Whisper expects,
    # and move its log-mel spectrogram to the model's device.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio))
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    # Half precision is requested only on CUDA so the CPU fallback chosen via
    # DEVICE keeps working. The explicit detect_language() call was removed
    # because its result was never used.
    # NOTE(review): decode() is expected to detect the language itself when
    # the options do not name one - confirm against the Whisper docs.
    options = whisper.DecodingOptions(fp16=(model.device.type == "cuda"))
    result_text = whisper.decode(model, mel, options).text

    out_result = llm_call(result_text)

    # gTTS cannot synthesize blank text; return the texts without audio
    # instead of raising.
    if not out_result.strip():
        return [result_text, out_result, None]

    gTTS(text=out_result, lang=language, slow=False).save("Temp.mp3")

    return [result_text, out_result, "Temp.mp3"]

In [None]:
# Create a 10-second silent mono MP3 (44100 Hz) named Temp.mp3, the file the
# gradio audio output below is initialised with.
# -y overwrites an existing Temp.mp3 so re-running this cell does not stop at
# ffmpeg's "Overwrite?" prompt.
!ffmpeg -y -f lavfi -i anullsrc=r=44100:cl=mono -t 10 -q:a 9 -acodec libmp3lame Temp.mp3

In [None]:
import gradio as gr

# Output widgets, in the same order as the values returned by transcribe():
# transcript, model reply, and the synthesized audio file.
output_1 = gr.Textbox(label="Speech to Text")
output_2 = gr.Textbox(label="ChatGPT Output")
output_3 = gr.Audio("Temp.mp3", autoplay=True)

# Microphone recordings are passed to transcribe() as file paths.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[output_1, output_2, output_3],
    title='Learn OpenAI Whisper: Voice Assistant - Using the StableLM Zephyr 3B model',
    live=True,
)
demo.launch(debug=True)