In [1]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import whisper
import torchaudio
import librosa
import os
import re
import tqdm as notebook_tqdm

# Debugging aid: ask CUDA to run kernel launches synchronously so that a
# failure is reported at the call that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# NOTE(review): TORCH_USE_CUDA_DSA is normally a PyTorch build-time option —
# confirm that setting it as an environment variable has any effect here.
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

# Limit to 90% of total GPU memory.
# Only valid when a CUDA device exists; calling it on a CPU-only machine
# raises, which would defeat the CPU fallback selected above.
if device == 'cuda':
    torch.cuda.set_per_process_memory_fraction(0.9, device=0)


Using device: cuda


In [2]:
# Ask PyTorch's CUDA caching allocator to release cached blocks it is not
# currently using, so the memory report in the next cell starts from a clean state.
torch.cuda.empty_cache()

In [3]:
# Read the allocator counters (bytes) and convert both to MB (2**20 bytes)
# in one pass before printing them with two decimals.
allocated, reserved = (
    n_bytes / (1 << 20)
    for n_bytes in (torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
)
print(f"{allocated:.2f} MB allocated, {reserved:.2f} MB reserved")


0.00 MB allocated, 0.00 MB reserved


In [4]:
# Directory passed to Whisper as its download / cache root
custom_model_dir = "./models/whisper"

# Prefer the GPU when one is present, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the "medium" Whisper checkpoint from (or into) custom_model_dir
ASR_model = whisper.load_model('medium', download_root=custom_model_dir)

# Place the model on the selected device; as the cell's last expression
# this also echoes the module tree.
ASR_model.to(device)


Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 1024, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1024, 1024, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-23): 24 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (key): Linear(in_features=1024, out_features=1024, bias=False)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (out): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
        (mlp_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((

In [5]:
def transcribe_audio(audio_path, chunk_duration_sec=30):
    """
    Transcribe long audio using OpenAI Whisper with manual chunking.

    The file is loaded with torchaudio, down-mixed to a single channel,
    resampled to 16 kHz and cut into consecutive, non-overlapping windows of
    ``chunk_duration_sec`` seconds. Each window is transcribed independently
    with the module-level ``ASR_model`` (English forced) and the resulting
    texts are joined with single spaces.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.
        chunk_duration_sec: Window length in seconds. Must yield at least
            one sample per window.

    Returns:
        str: The concatenated transcription of all windows.

    Raises:
        ValueError: If ``chunk_duration_sec`` is too small to form a window.
    """
    target_sample_rate = 16000

    # torchaudio returns a (channels, frames) tensor
    waveform, sample_rate = torchaudio.load(audio_path)

    # Down-mix to mono. A plain squeeze() only works for single-channel
    # files: with stereo input the tensor stays 2-D, shape[0] becomes the
    # channel count and the chunk loop below would slice channels instead
    # of samples.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Resample to 16000 Hz
    if sample_rate != target_sample_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_sample_rate
        )
        waveform = resampler(waveform)
        sample_rate = target_sample_rate

    total_samples = waveform.shape[0]
    chunk_size = int(sample_rate * chunk_duration_sec)
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_duration_sec must be positive, got {chunk_duration_sec!r}"
        )

    transcriptions = []

    for start in range(0, total_samples, chunk_size):
        end = min(start + chunk_size, total_samples)

        # Whisper is fed a 32-bit float NumPy waveform
        audio_np = waveform[start:end].cpu().numpy().astype("float32")

        # Transcribe each chunk
        result = ASR_model.transcribe(audio_np, language="en")
        text = result["text"].strip()
        # Skip windows that produced no text so the joined transcript does
        # not accumulate stray double spaces.
        if text:
            transcriptions.append(text)

        torch.cuda.empty_cache()

    return " ".join(transcriptions)

In [6]:
# Local directory the BART tokenizer and model are loaded from
custom_model_dir = "./models/meetingSum-bart"

# Prefer the GPU when one is present, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Restore tokenizer and seq2seq model from the same directory
tokenizer = BartTokenizer.from_pretrained(custom_model_dir)
bart_model = BartForConditionalGeneration.from_pretrained(custom_model_dir)

# Place the model on the selected device; as the cell's last expression
# this also echoes the module tree.
bart_model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [7]:
def _split_into_token_chunks(text, token_limit):
    """Greedily pack the sentences of ``text`` into chunks of at most ``token_limit`` tokens."""

    def count_tokens(fragment):
        # Length as the model will see it (special tokens included)
        return len(tokenizer.encode(fragment, truncation=False))

    # Tokens available for content once the special tokens are accounted for
    payload = token_limit - tokenizer.num_special_tokens_to_add()
    if payload < 1:
        raise ValueError(f"max_chunk_tokens is too small: {token_limit}")

    # Step 1: sentence split; a sentence that alone exceeds the budget is cut
    # on token boundaries instead of being silently truncated later.
    pieces = []
    for sentence in re.split(r'(?<=[.!?]) +', text.strip()):
        sentence = sentence.strip()
        if not sentence:
            continue
        if count_tokens(sentence) <= token_limit:
            pieces.append(sentence)
            continue
        ids = tokenizer.encode(sentence, add_special_tokens=False, truncation=False)
        for start in range(0, len(ids), payload):
            fragment = tokenizer.decode(ids[start:start + payload]).strip()
            if fragment:
                pieces.append(fragment)

    # Step 2: greedy packing. The budget is measured on exactly the string
    # that would become the chunk (including the joining space).
    chunks = []
    current_chunk = ""
    for piece in pieces:
        candidate = f"{current_chunk} {piece}" if current_chunk else piece
        if count_tokens(candidate) <= token_limit:
            current_chunk = candidate
        else:
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = piece
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def summarize_text(text, max_chunk_tokens=1024, summary_max_length=250):
    """Summarize ``text`` chunk by chunk with the BART model.

    The text is split into sentences, the sentences are packed into chunks
    that fit the model's input size, and each chunk is summarized on its own
    with beam search using the module-level ``tokenizer`` and ``bart_model``.

    Args:
        text: The text to summarize (e.g. a meeting transcription).
        max_chunk_tokens: Upper bound on tokens per chunk. Values above the
            1024-token input size used for this model are capped to it.
        summary_max_length: Maximum number of tokens generated per chunk.

    Returns:
        list[str]: One summary per chunk, in order. Empty when ``text`` is
        empty or whitespace only.

    Raises:
        ValueError: If ``max_chunk_tokens`` leaves no room for content.
    """
    # Nothing to summarize: return early instead of asking the model to
    # invent a summary for an empty string.
    if not text or not text.strip():
        return []

    model_max_tokens = 1024  # Max input for BART
    token_limit = min(max_chunk_tokens, model_max_tokens)

    chunks = _split_into_token_chunks(text, token_limit)

    all_summaries = []
    for chunk in chunks:
        # No padding: each call encodes a single sequence, so padding it to
        # the full 1024 tokens would only add wasted compute.
        # truncation=True stays as a safety net for fragments whose
        # re-tokenization is slightly longer than the slice they came from.
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            max_length=token_limit,
            truncation=True,
        ).to(bart_model.device)

        with torch.no_grad():
            summary_ids = bart_model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=summary_max_length,
                # Never request a minimum longer than the maximum
                min_length=min(80, summary_max_length),
                length_penalty=2.0,                # Encourage longer summaries
                num_beams=4,
                no_repeat_ngram_size=3,
                early_stopping=True
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        all_summaries.append(summary.strip())

        # Free memory
        del inputs, summary_ids
        torch.cuda.empty_cache()

    return all_summaries

In [8]:
def summarize_chunks(chunks, summary_max_length=250):
    """Take a list of chunk summaries and generate a final summary.

    The summaries are joined into one text. If that text is longer than the
    model's 1024-token input, it is first condensed with ``summarize_text``
    (repeatedly, while that keeps shrinking it) so that later chunks are not
    silently cut off by truncation. The final text is then summarized once
    with the module-level ``tokenizer`` and ``bart_model``.

    Args:
        chunks: List of summary strings. A single string is also accepted
            and treated as a one-element list.
        summary_max_length: Maximum number of tokens in the final summary.

    Returns:
        str: The final summary, or ``""`` when there is nothing to summarize.
    """
    # A bare string would otherwise be joined character by character
    if isinstance(chunks, str):
        chunks = [chunks]

    # Join all non-empty chunk summaries into one text
    combined_text = " ".join(c.strip() for c in chunks if c and c.strip())
    if not combined_text:
        return ""

    model_max_tokens = 1024

    # Condense until the text fits the encoder. Stop if a pass does not
    # shrink it, and let truncation below act as the last resort.
    previous_len = None
    while True:
        n_tokens = len(tokenizer.encode(combined_text, truncation=False))
        if n_tokens <= model_max_tokens:
            break
        if previous_len is not None and n_tokens >= previous_len:
            break
        previous_len = n_tokens
        combined_text = " ".join(
            summarize_text(combined_text, summary_max_length=summary_max_length)
        )
        if not combined_text:
            return ""

    # Tokenize the combined summaries (single sequence, so no padding needed)
    inputs = tokenizer(
        combined_text,
        return_tensors="pt",
        max_length=model_max_tokens,
        truncation=True,
    ).to(bart_model.device)

    # Generate the final summary
    with torch.no_grad():
        summary_ids = bart_model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=summary_max_length,
            # Never request a minimum longer than the maximum
            min_length=min(80, summary_max_length),
            length_penalty=2.0,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True
        )

    final_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return final_summary.strip()

In [9]:
# Sample usage
audio_path = "amicorpus/ES2003a/ES2003a.Mix-Headset.wav"  # Replace with your actual audio file
transcription = transcribe_audio(audio_path)
print("Transcription:", transcription)

Transcription: Okay, well I think we're ready to begin. Right, my name's Adam Duggard, we're here because of Real Reaction. We have in the group... Ebenezer Ademusoy, would you like me to spell it out? N-E-Z-E-R Your role is? I'm the marketing expert. Next we have Tarik Rammer, T-A-R-I-K, and your role in this is industrial designer. And lastly we have Dave Cochran, and you're going to be the user interface designer. Right, this is the agenda for today's meeting. As you can see, opening, acquaintance, tool training, project plan, discussion and closing. We already got through opening and partially through acquaintance. So the reason we're here, we're going to design a new remote control as you probably all know. The very broad overview is original, trendy and user friendly. Of course, we'll have to go into a bit more detail than that. Personally I think the original is going to be a very key aspect of this design. There's a lot of remote controls out there anyway, so we're going to nee

In [10]:
# Two-stage summarization:
#   1. `summary` is the list of per-chunk summaries returned by summarize_text()
#   2. `final_summary` is the single string summarize_chunks() builds from that list
summary = summarize_text(transcription)
final_summary = summarize_chunks(summary)
print("Final Summary:", final_summary)

Final Summary: In the meeting, Adam Duggard, the group responsible for designing a new, trendy, and user-friendly remote control for an interactive television, was briefed on the design process, including functional, conceptual, and detailed design. The remote control will have an original design, a working design, and a detailed analysis of components properties, materials, and features. The team is aiming for a profit of 25 million euros, with a target selling price of 50 million euros and a goal of selling 50 million units in the next financial year. The design process will progress through functional, detailed, and conceptual phases. Tarik Rammer, T-A-R-I-K, and Dave Cochran are the industrial designer. The group agreed on ergonomics, aesthetics, and potential inclusion of speech recognition technology. The


In [13]:
def save_to_txt(transcription, summary, output_path="output.txt"):
    """Save the transcription and summary to a single text file.

    Args:
        transcription: Full transcription text.
        summary: Either a single summary string or an iterable of summary
            strings; iterable items are written one per line.
        output_path: Destination file. It is overwritten if it exists.
    """
    # A plain string must be written as-is: joining it directly would put
    # every character on its own line.
    if isinstance(summary, str):
        summary_lines = [summary]
    else:
        summary_lines = [str(part) for part in summary]

    with open(output_path, "w", encoding="utf-8") as f:
        f.write("=== TRANSCRIPTION WITH WHISPER MEDIUM===\n")
        f.write(transcription + "\n\n")
        f.write("=== SUMMARY WITH BART===\n")
        f.write("\n".join(summary_lines) + "\n")


In [14]:
# Write the transcription and the per-chunk summaries to meeting_notes.txt.
# NOTE(review): `summary` is the list returned by summarize_text(); the
# condensed `final_summary` is not written here — confirm this is intended.
save_to_txt(transcription, summary, output_path="meeting_notes.txt")