In [1]:
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import whisper
import torchaudio
import librosa
import os
import re
import tqdm as notebook_tqdm

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Check if GPU is available 
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

# Limit to 70% of total GPU memory
torch.cuda.set_per_process_memory_fraction(0.7, device=0)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [22]:
torch.cuda.empty_cache()

In [2]:
allocated = torch.cuda.memory_allocated() / 1024**2
reserved = torch.cuda.memory_reserved() / 1024**2
print(f"{allocated:.2f} MB allocated, {reserved:.2f} MB reserved")


0.00 MB allocated, 0.00 MB reserved


In [3]:
custom_model_dir = "./models/whisper"


# Load processor and model
ASR_model = whisper.load_model('small', download_root=custom_model_dir)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ASR_model.to(device)


Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-11): 12 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=False)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (out): Linear(in_features=768, out_features=768, bias=True)
        )
        (attn_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (mlp_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-0

In [4]:
def transcribe_audio(audio_path, chunk_duration_sec=30):
    """
    Transcribe long audio using OpenAI Whisper with manual chunking.
    """
    # Load and resample audio
    waveform, sample_rate = torchaudio.load(audio_path)

    # Resample to 16000 Hz
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    waveform = waveform.squeeze()  # mono
    total_samples = waveform.shape[0]
    chunk_size = int(sample_rate * chunk_duration_sec)

    transcriptions = []

    for start in range(0, total_samples, chunk_size):
        end = min(start + chunk_size, total_samples)
        chunk = waveform[start:end].cpu().numpy()

        # Whisper expects 16-bit float PCM data
        audio_np = chunk.astype("float32")

        # Transcribe each chunk
        result = ASR_model.transcribe(audio_np, language="en")
        transcriptions.append(result["text"].strip())

        torch.cuda.empty_cache()

    return " ".join(transcriptions)

In [5]:
custom_model_dir = "./models/bart"

# Load BART model and tokenizer
bart_model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(bart_model_name, cache_dir=custom_model_dir)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name, cache_dir=custom_model_dir)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bart_model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
    

In [6]:
def summarize_text(text, max_chunk_tokens=512, summary_max_length=150):
    """Summarize text using BART with chunking."""
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        tokens = tokenizer.encode(current_chunk + sentence, truncation=False)
        if len(tokens) <= max_chunk_tokens:
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence

    if current_chunk:
        chunks.append(current_chunk.strip())

    all_summaries = []
    for chunk in chunks:
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            max_length=1024,
            truncation=True,
            padding="max_length"
        ).to(device)

        with torch.no_grad():
            summary_ids = bart_model.generate(
                inputs["input_ids"],
                max_length=summary_max_length,
                min_length=30,
                num_beams=4,
                no_repeat_ngram_size=3,
                early_stopping=True
            )

        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        all_summaries.append(summary)

        # Clear memory
        del inputs
        torch.cuda.empty_cache()

    return "\n".join(all_summaries)

In [7]:
# Sample usage
audio_path = "amicorpus/ES2002a/audio/ES2002a.Mix-Headset.wav"  # Replace with your actual audio file
transcription = transcribe_audio(audio_path)
print("Transcription:", transcription)

Transcription: I think it's already on actually. God, how do I make this thing work? I've plugged it in the back. Okay, right. Okay. Right. Well, this is the kickoff meeting for our project. Okay. And this is just what we're going to be doing over the next 25 minutes. So first of all, just to kind of make sure that we all know each other, I'm Laura and I'm the project manager. Do you want to introduce yourself again? Hi, I'm David and I'm supposed to be an industrial designer. Okay. And I'm Andrew and I'm a marketing expert. I'm Greg and I'm user interface. Great. OK. So we're designing a new remote control. And oh, I have to record he's here, actually. So that's David, Andrew, and Craig, isn't it? And you all arrived on time. Yes, we did design a new remote control. As you can see, it's supposed to be original, trendy, and user-friendly. So that's kind of our brief. So there are three different stages to the design. I'm not really sure what you guys have already received. In your emai

In [26]:
summary = summarize_text(transcription)
print("Summary:", summary)

Summary: Project manager: We're designing a new remote control. You get to draw your favorite animal and sum up your favorite characteristics of it. Who would like to go first? I will go.
"I don't know what mine is, I'm going to have to think in the spotlight. Impressionist. Is that a whale? Yeah. The biggest reason is because I'm allergic to most animals"
Do you think the fact that it's going to be sold internationally will have a bearing on how we design it at all? I think it will. I'm wondering if there's like with DVD players if there are zones. Frequencies or something.
Remote control is just like getting shoelaces with shoes or something. My parents got fed up of having like four or five different remote controls for each house. One of the priorities might be to combine as many uses.
You keep losing them. Finding them is making a pain, you know. I remember when the first remote control was on a cable actually the cable between in the TV and big like buttons that sort of like like