# Create meeting minutes from an Audio file

I downloaded some Denver City Council meeting minutes and selected a portion of the meeting for us to transcribe. You can download it here:  
https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing

If you'd rather work with the original data, the HuggingFace dataset is [here](https://huggingface.co/datasets/huuuyeah/meetingbank) and the audio can be downloaded [here](https://huggingface.co/datasets/huuuyeah/MeetingBank_Audio/tree/main).

The goal of this product is to use the audio to generate meeting minutes, including action items.

For this project, you can either use the Denver meeting minutes, or you can record something of your own!


In [1]:
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0 openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.3/7.3 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [10]:
# imports
import torch
import threading
from openai import OpenAI
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig
import gradio as gr

In [11]:
# Constants

# Default `audio_model` for MeetingAssistant; passed as `model=` to the OpenAI transcription call.
AUDIO_MODEL = "whisper-1"
# Default `model_name` for MeetingAssistant; the Hugging Face repo id handed to `from_pretrained`.
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [12]:
# Google Colab User Data
# Ensure you have set the secrets named 'HF_TOKEN' and 'OPEN_AI' in your Google Colab
# environment; both are read here through `userdata.get`.
# NOTE: the two sign-in cells further down read these same secrets again.
hf_token = userdata.get('HF_TOKEN')
openai_api_key = userdata.get('OPEN_AI')

# Download denver_extract.mp3

You can either use the same file as me (the extract from the Denver City Council minutes), or you can try your own.

If you want to use the same as me, then please download my extract here, and put this on your Google Drive:  
https://drive.google.com/file/d/1N_kpSojRR5RYzupz6nqM8hMSoEF_R7pU/view?usp=sharing


In [6]:
# Sign in to HuggingFace Hub
# Uses the token stored in the 'HF_TOKEN' Colab secret.
# NOTE(review): the default LLM is a meta-llama repo, which is presumably gated —
# confirm your account has been granted access on the Hub before loading it.

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [13]:
# Sign in to OpenAI using Secrets in Colab
# `openai` is the module-level client that MeetingAssistant.transcribe_audio calls,
# so this cell must run before any audio is transcribed.

openai_api_key = userdata.get('OPEN_AI')
openai = OpenAI(api_key=openai_api_key)

In [15]:
class MeetingAssistant:
    """Turns a meeting recording into streamed markdown minutes.

    Audio is transcribed through the module-level ``openai`` client, and the
    transcript is summarised by a 4-bit quantized causal LM loaded from
    Hugging Face.
    """

    # Prefix of the string ``transcribe_audio`` returns when transcription
    # fails. ``process_meeting`` uses it to tell an error report apart from a
    # real transcript so the error is shown to the user instead of being
    # summarised by the LLM.
    _TRANSCRIPTION_ERROR_PREFIX = "Error during transcription: "

    def __init__(self, model_name=LLAMA, audio_model=AUDIO_MODEL):
        """Load the tokenizer and the quantized LLM.

        Args:
            model_name: Hugging Face model id of the chat model to load.
            audio_model: Model name sent to the OpenAI transcription endpoint.
        """
        # 4-bit NF4 weights with double quantization; compute in bfloat16.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4"
        )

        self.audio_model = audio_model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quant_config
        )

    def transcribe_audio(self, audio_path, progress):
        """Transcribe an audio file with the OpenAI transcription API.

        Args:
            audio_path: Path of the audio file to upload.
            progress: Callable invoked as ``progress(fraction, desc=...)``.

        Returns:
            The transcript text, or a string starting with
            ``"Error during transcription: "`` if the call failed.
        """
        progress(0.3, desc="Transcribing audio...")

        try:
            with open(audio_path, "rb") as audio_file:
                return openai.audio.transcriptions.create(
                    model=self.audio_model,
                    file=audio_file,
                    response_format="text"
                )
        except Exception as e:
            # Kept as a returned string (not a raise) for backward
            # compatibility with callers that display the result directly.
            return f"{self._TRANSCRIPTION_ERROR_PREFIX}{e}"

    def generate_minutes(self, transcription, progress):
        """Stream meeting minutes generated from a transcript.

        Args:
            transcription: Transcript text to summarise.
            progress: Callable invoked as ``progress(fraction, desc=...)``.

        Yields:
            Non-empty text chunks of the assistant's reply, in order.
        """
        progress(0.6, desc="Generating meeting minutes...")

        system_message = "You are an assistant that produces minutes of meetings from transcripts, with summary, key discussion points, takeaways and action items with owners, in markdown."
        user_prompt = f"Below is an extract transcript of a meeting. Please write minutes in markdown, including a summary with attendees, location and date; discussion points; takeaways; and action items with owners.\n{transcription}"

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_prompt}
        ]

        # add_generation_prompt=True ends the prompt with the assistant header,
        # so the model starts directly with the reply instead of having to
        # emit the header itself.
        input_ids = self.tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(self.model.device)  # follow device_map="auto" rather than assuming "cuda"

        # skip_prompt drops the echoed prompt; skip_special_tokens removes
        # markers such as the end-of-turn token, so no manual scraping of the
        # stream is needed.
        streamer = TextIteratorStreamer(
            self.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )

        # generate() blocks until done, so it runs in a worker thread while
        # this generator drains the streamer.
        thread = threading.Thread(
            target=self.model.generate,
            kwargs={
                "input_ids": input_ids,
                "max_new_tokens": 2000,
                "streamer": streamer
            }
        )
        thread.start()

        for new_text in streamer:
            # Drop only truly empty chunks; whitespace-only chunks (e.g. blank
            # lines) are meaningful in markdown and must be kept.
            if new_text:
                yield new_text

        thread.join()

    def process_meeting(self, audio_file, progress):
        """Transcribe an uploaded file and stream the minutes.

        This is a generator: every user-facing message, including validation
        and error messages, is yielded, because a value passed to ``return``
        inside a generator never reaches code that iterates over it.

        Args:
            audio_file: Path of the uploaded audio file, or ``None``.
            progress: Callable invoked as ``progress(fraction, desc=...)``.

        Yields:
            The minutes accumulated so far (growing with each chunk), or a
            single validation/error message.
        """
        progress(0.1, desc="Processing audio file...")

        # Check if a file is uploaded
        if audio_file is None:
            yield "Please upload an audio file."
            return

        try:
            # Check file format
            if not str(audio_file).lower().endswith('.mp3'):
                yield "Please upload an MP3 file."
                return

            # Get transcription
            transcription = self.transcribe_audio(audio_file, progress)
            if isinstance(transcription, str) and transcription.startswith(
                self._TRANSCRIPTION_ERROR_PREFIX
            ):
                # Surface the failure instead of asking the LLM to write
                # minutes about an error message.
                yield transcription
                return

            # Generate minutes, re-emitting the full text so far each time so
            # the output component always shows the complete document.
            accumulated_text = ""
            for chunk in self.generate_minutes(transcription, progress):
                accumulated_text += chunk
                yield accumulated_text

        except Exception as e:
            yield f"Error processing file: {str(e)}"

In [16]:
class GradioInterface:
    """Gradio front end that wires an audio upload to a MeetingAssistant."""

    def __init__(self):
        """Create the backend assistant and build the upload-to-markdown UI."""
        self.assistant = MeetingAssistant()

        # Input: an audio upload handed to the callback as a file path.
        audio_input = gr.Audio(type="filepath", label="Upload MP3 File", format="mp3")
        # Output: a markdown panel that is re-rendered on every yielded value.
        minutes_output = gr.Markdown(label="Meeting Minutes", min_height=60)

        self.iface = gr.Interface(
            fn=self.process_audio,
            inputs=audio_input,
            outputs=minutes_output,
            title="AI Meeting Assistant",
            description="Upload an audio file to transcribe and generate meeting minutes.",
            flagging_mode="never",
        )

    def process_audio(self, audio_file, progress=gr.Progress()):
        """Adapter between the UI and the backend.

        Forwards the uploaded file and the progress tracker to
        ``MeetingAssistant.process_meeting`` and yields each value it produces.
        """
        yield from self.assistant.process_meeting(audio_file, progress)

    def launch(self):
        """Start serving the interface."""
        self.iface.launch()

In [17]:
# Entry point: constructing GradioInterface also constructs MeetingAssistant
# (which loads the tokenizer and model), then launch() starts the Gradio app.
if __name__ == "__main__":
    app = GradioInterface()
    app.launch()

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://73e9ba1453a80c1a4c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
