# Uses a frontier OpenAI model to transcribe audio to text, then summarizes the transcript for users to review using Hugging Face open-source models on Google Colab.
# I.e., Speech-to-Text Summarization

Utilizes:
- Hugging Face Open Source Models and pipelines
- OpenAI speech-to-text transcription
- Tokenizer
- Quantization
- Gradio Audio Interface with File upload

In [None]:
# Install
# Shell commands (the `!` prefix) that install the notebook's runtime dependencies.
# Quantization / model stack; transformers is pinned to an exact version.
!pip install -q --upgrade bitsandbytes accelerate transformers==4.57.6
# Gradio plus the cu124 builds of torch/torchvision/torchaudio, resolved via the PyTorch wheel index.
# NOTE(review): `dotenv` and `openai` are imported below but not installed here —
# presumably they are preinstalled in the target environment; confirm.
!pip install -q gradio torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124

In [None]:
# Imports

import os
import requests
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
from transformers import pipeline
import gradio as gr
import torch

In [None]:
# Load credentials from the environment and create the API client

load_dotenv(override=True)

openai_api_key = os.getenv("OPENAI_API_KEY")
hf_token = os.getenv("HF_TOKEN")

# Report only whether each secret is present; never echo the values themselves.
print("OpenAI API Key exists" if openai_api_key else "OpenAI API Key not set")
print("HF Token exists" if hf_token else "HF Token not set")

# Hugging Face model id used for summarization, and the OpenAI client used for transcription.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
openai = OpenAI()

In [None]:
# Authenticate with the Hugging Face Hub using the token read from the environment

login(token=hf_token, add_to_git_credential=True)


In [None]:
# Quantization

# 4-bit NF4 quantization with double quantization enabled; compute dtype is bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# prompt function

def get_prompts(transcription, meeting_description="a Denver council meeting"):
    """Build the chat messages asking the model to write minutes for a transcript.

    Args:
        transcription: Transcript text to embed in the user prompt.
        meeting_description: Short phrase describing what was recorded; it is
            inserted after "Below is an extract transcript of". The default
            reproduces the original hard-coded wording, so existing callers
            get the same prompt as before.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system message followed by the user message.
    """
    system_prompt = """
    You produce minutes of meetings from transcripts, with summary, key discussion points,
    takeaways and action items with owners, in markdown format without code blocks.
    """

    user_prompt = f"""
    Below is an extract transcript of {meeting_description}.
    Please write minutes in markdown without code blocks, including:
    - a summary with attendees, location and date
    - discussion points
    - takeaways
    - action items with owners

    Transcription:
    {transcription}
    """

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]


In [None]:
# transcribe function

# Cache for the tokenizer/model so the Llama checkpoint is loaded once per
# kernel session instead of on every Gradio request.
_LLAMA_CACHE = {}


def _load_llama():
    """Return the ``(tokenizer, model)`` pair for ``LLAMA``, loading it on first use."""
    if not _LLAMA_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(LLAMA)
        # Reuse EOS as the pad token, as the original code did.
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            LLAMA, device_map="auto", quantization_config=quant_config
        )
        _LLAMA_CACHE["tokenizer"] = tokenizer
        _LLAMA_CACHE["model"] = model
    return _LLAMA_CACHE["tokenizer"], _LLAMA_CACHE["model"]


def transcribe_audio(audio_file_path):
    """Transcribe an audio file with OpenAI and summarize it with the Llama model.

    The audio is sent to the OpenAI transcription endpoint; the resulting text
    is wrapped in the prompts from ``get_prompts`` and passed to the quantized
    Llama model, whose output is streamed to stdout while it is generated.
    (An open-source alternative — the Hugging Face
    ``automatic-speech-recognition`` pipeline with ``openai/whisper-medium.en``
    — was previously kept here as commented-out code.)

    Args:
        audio_file_path: Path to the audio file on disk (Gradio supplies this
            when the Audio component uses ``type="filepath"``).

    Returns:
        The generated minutes as a string, without the prompt and without
        special tokens.

    Raises:
        gr.Error: If no audio file was provided.
    """
    # Gradio passes None when the user submits without uploading/recording.
    if not audio_file_path:
        raise gr.Error("Please upload or record an audio file first.")

    # Frontier model transcription
    AUDIO_MODEL = "gpt-4o-mini-transcribe"

    # The context manager guarantees the file handle is closed even if the
    # API call raises.
    with open(audio_file_path, "rb") as audio_file:
        frontier_transcription = openai.audio.transcriptions.create(
            model=AUDIO_MODEL, file=audio_file, response_format="text"
        )

    # Summarize the transcript
    tokenizer, model = _load_llama()
    inputs = tokenizer.apply_chat_template(
        get_prompts(frontier_transcription),
        add_generation_prompt=True,  # open the assistant turn so the model answers
        return_tensors="pt",
        return_dict=True,  # also yields the attention mask for generate()
    ).to(model.device)
    streamer = TextStreamer(tokenizer)
    outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

    # generate() returns prompt + completion; keep only the newly generated
    # tokens so the UI shows the minutes rather than the echoed transcript.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response

In [None]:
# Build and launch the Gradio front end

# Output component is created first (same order as before), then the input.
message_output = gr.Markdown(label="Summary")
audio_input = gr.Audio(type="filepath")

view = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=[message_output],
    flagging_mode="never",
)

view.launch(share=False)
