<a href="https://colab.research.google.com/github/PmasCastro/llm_misc/blob/main/Text_to_speech_%2B_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create a transcript from an audio file

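This is a simple app that takes an uploaded audio file and converts it into text using OpenAI’s Whisper automatic speech recognition (ASR) model. The notebook also defines a summarization step that passes the transcript to a Llama 3.1 model to produce key points and main takeaways.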



In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece gradio accelerate openai httpx==0.27.2

In [None]:

## imports

import os

import gradio as gr
import requests
import torch
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
    pipeline,
)

In [None]:
## Constants

# Model name passed to openai.audio.transcriptions.create for speech-to-text.
AUDIO_MODEL = "whisper-1"
# Hugging Face model id used to load both the tokenizer and the causal LM.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
##Connect with Google Drive if needed

#drive.mount("/content/drive")
#audio_filename = "/content/drive/MyDrive/llms/[file]"

In [None]:
## Sign in to HuggingFace Hub with the HF_TOKEN value read from Colab userdata

hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
## Create the OpenAI client from the OPENAI_API_KEY secret stored in Colab
## The client is bound to the name `openai`, which generate_transcript calls.

openai_api_key = userdata.get("OPENAI_API_KEY")
openai = OpenAI(
    api_key=openai_api_key,
)

In [None]:
# Quantization settings: 4-bit NF4 weights, double quantization, bfloat16 compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized causal LM and let device_map="auto" decide where it is placed.
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:

def generate_transcript(audio_file):
    """Transcribe an audio file with the OpenAI model named by ``AUDIO_MODEL``.

    Designed to be fed by a Gradio ``gr.File(type="filepath")`` input, so a
    file can be uploaded directly from the user's computer.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        str: The transcript text (the ``text`` field of the API response),
        ready to show in a textbox or embed in a prompt.

    Raises:
        ValueError: If ``audio_file`` is empty or ``None`` (nothing uploaded).
    """
    if not audio_file:
        raise ValueError("No audio file was provided.")

    with open(audio_file, "rb") as f:  # context manager closes the file after the upload
        transcription = openai.audio.transcriptions.create(model=AUDIO_MODEL, file=f)

    # Return the plain text; the response object itself would render as its repr.
    return transcription.text

## Call the function and print the result
#print(generate_transcript(audio_filename))

In [None]:
def generate_summary(transcription, model, tokenizer, max_new_tokens=2000):
  """Summarize a transcript with a chat-tuned causal language model.

  Args:
    transcription: The transcript text, or an object exposing the text on a
      ``text`` attribute (for example an audio transcription response).
    model: Model providing ``generate``; inputs are moved to ``model.device``
      (falling back to ``"cuda"`` when the model has no ``device`` attribute).
    tokenizer: Tokenizer providing ``apply_chat_template`` and ``decode``.
    max_new_tokens: Upper bound on the number of generated tokens.

  Returns:
    str: Only the newly generated summary, without the echoed prompt or
    special tokens, stripped of surrounding whitespace.
  """
  # Accept both a plain string and a response object carrying `.text`.
  transcript_text = getattr(transcription, "text", transcription)

  system_message = "You are an assistant that produces summaries from transcripts."
  user_prompt = f"Below is an audio transcription, write me a summary with key points and main takeaways.\n{transcript_text}"

  messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_prompt}
    ]

  # add_generation_prompt=True ends the prompt with the assistant header so the
  # model answers rather than continuing the user turn; return_dict=True also
  # yields the attention mask that generate() expects.
  device = getattr(model, "device", "cuda")
  inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
  ).to(device)

  outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

  # generate() returns prompt + completion; keep only the completion tokens.
  prompt_length = inputs["input_ids"].shape[-1]
  summary_tokens = outputs[0][prompt_length:]

  return tokenizer.decode(summary_tokens, skip_special_tokens=True).strip()

In [None]:
def process_audio_file(audio_file):
    """Transcribe an audio file and summarize the transcript.

    Progress messages are printed around each stage. Any exception raised
    along the way is printed and turned into an ``"Error: ..."`` string
    instead of propagating.

    Args:
        audio_file: Audio file reference handed to ``generate_transcript``.

    Returns:
        The value returned by ``generate_summary``, or an ``"Error: ..."``
        string if either stage failed.
    """
    try:
        print("Starting transcription...")
        transcript = generate_transcript(audio_file)
        print("Transcription done.")

        print("Starting summary generation...")
        result = generate_summary(transcript, model, tokenizer)
        print("Summary generated.")
    except Exception as e:
        message = str(e)
        print("Error during processing:", message)
        return f"Error: {message}"
    else:
        return result

In [None]:
# Build and launch the Gradio UI.
# The handler is process_audio_file (transcribe, then summarize) so the result
# matches the "Summary" textbox and the "SummarizerBot" title; wiring
# generate_transcript here would only ever display the transcript.
gr.Interface(
    fn=process_audio_file,
    # gr.File is a generic file-upload component; type="filepath" hands fn a path.
    inputs=gr.File(type="filepath", label="Upload Audio File"),
    outputs=gr.Textbox(label="Summary"),
    title="SummarizerBot",
).launch(inbrowser=True)

## Free option


In [None]:
# Free alternative: run Whisper locally through Hugging Face transformers
# instead of calling the OpenAI API.
# AUDIO_MODEL ("whisper-1") is an OpenAI API model name, not a Hugging Face Hub
# id, so the local pipeline needs its own checkpoint identifier.
FREE_AUDIO_MODEL = "openai/whisper-medium"

speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    FREE_AUDIO_MODEL,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
speech_model.to("cuda")
processor = AutoProcessor.from_pretrained(FREE_AUDIO_MODEL)

# Speech-recognition pipeline built from the model and its processor parts.
pipe = pipeline(
    "automatic-speech-recognition",
    model=speech_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float16,
    device="cuda",
)

In [None]:
# Run the speech-recognition pipeline (`pipe`) on the audio file to convert the audio to text.
# NOTE(review): `audio_filename` is only assigned in the commented-out Google Drive
# cell near the top of the notebook; define it before running this cell.
result = pipe(audio_filename)

In [None]:
# Pull the transcript out of the pipeline result under its "text" key and print it.
transcription = result["text"]
print(transcription)

# Things to implement in the future:

- implement the free version of speech to text;

- clean the output to only display the actual summary;

- maybe add a function that generates a PDF file with the summary.
