<a href="https://colab.research.google.com/github/Rhuan-Messias/LLM_RAG_Study/blob/main/multimodal_LLM_study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Transcription Step

In [None]:
!pip install -q --upgrade bitsandbytes accelerate

In [None]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [None]:
# Hugging Face model id used for both the tokenizer and the causal LM below.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [None]:
# Location of the recording that the transcription step reads.
audio_filename = "/content/denver_extract.mp3"

# Fetch the Hugging Face token from Colab's userdata store and log in with it,
# also registering it as a git credential.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)


In [None]:
# Binary read handle on the recording. NOTE(review): nothing in the visible
# cells reads from or closes this handle (the pipeline below is given the
# path, not the handle) -- confirm whether a later cell still needs it.
audio_file = open(audio_filename, mode="rb")

In [None]:
from transformers import AutoModelForSpeechSeq2Seq, pipeline

# Speech-recognition pipeline backed by the English-only Whisper medium
# checkpoint, loaded in float16 with device placement left to device_map="auto".
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-medium.en",
    torch_dtype=torch.float16,
    device_map="auto",
)

# The pipeline is handed the file path directly; timestamps are requested
# together with the text, but only the text is kept.
result = pipe(audio_filename, return_timestamps=True)
transcription = result["text"]
print(transcription)

In [None]:
# Alias the transcript under the name the prompt-building cell interpolates,
# then render it as Markdown for a quick visual check.
open_source_transcription = transcription
display(Markdown(data=open_source_transcription))

## Analyzing and Reporting Step

In [None]:
# System role: fixes the output contract (minutes in markdown, no code fences).
system_message = """
You produce minutes of meetings from transcripts, with summary, key discussion
points, takeaways and action items with owners, in markdown format without code
blocks.
"""

# User role: the task description followed directly by the transcript text.
user_prompt = f"""
Below is an extract transcript of a Denver council meeting.
Please write minutes in markdown without code blocks, including:
- a summary with attendees, location and date
- discussion points
- takeaways
- action items with owners
{open_source_transcription}
"""

# Chat-format conversation consumed by tokenizer.apply_chat_template.
messages = [
    dict(role="system", content=system_message),
    dict(role="user", content=user_prompt),
]

In [None]:
# bitsandbytes settings for loading the LLM: 4-bit weights using the NF4
# quantization type with double quantization, computing in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer for the chat model; the EOS token is reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized model first so the prompt can be moved to whichever device
# device_map='auto' selected, instead of assuming a hard-coded 'cuda'.
model = AutoModelForCausalLM.from_pretrained(LLAMA, quantization_config=quant_config, device_map='auto')

# add_generation_prompt=True appends the assistant header so the model starts
# writing its reply rather than continuing the user turn. return_dict=True also
# yields the attention mask, which generate() cannot infer reliably when the
# pad token equals the EOS token.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors='pt',
    return_dict=True,
).to(model.device)

# `inputs` stays a plain tensor of prompt token ids, as before.
inputs = encoded['input_ids']

# Stream tokens to the cell output as they are generated.
streamer = TextStreamer(tokenizer)
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    streamer=streamer,
    max_new_tokens=2000,
)

In [None]:
# outputs[0] contains the prompt tokens followed by the newly generated ones.
# Drop the first inputs.shape[-1] positions and skip special tokens so that
# `response` holds only the generated text, not the echoed system prompt and
# transcript.
prompt_length = inputs.shape[-1]
response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)