## Install dependencies

In [1]:
!pip install -q demucs
!pip install -q transformers torchaudio librosa accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.1/87.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.7/249.7 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━

In [2]:
from demucs.apply import apply_model
from demucs.pretrained import get_model
from demucs.audio import AudioFile
import torchaudio
import torch
import os

## Noise reduction model

In [3]:
model = get_model(name='htdemucs')
# model.cpu()
model.eval()

Downloading: "https://dl.fbaipublicfiles.com/demucs/hybrid_transformer/955717e8-8726e21a.th" to /root/.cache/torch/hub/checkpoints/955717e8-8726e21a.th
100%|██████████| 80.2M/80.2M [00:00<00:00, 176MB/s] 


BagOfModels(
  (models): ModuleList(
    (0): HTDemucs(
      (encoder): ModuleList(
        (0): HEncLayer(
          (conv): Conv2d(4, 48, kernel_size=(8, 1), stride=(4, 1), padding=(2, 0))
          (norm1): Identity()
          (rewrite): Conv2d(48, 96, kernel_size=(1, 1), stride=(1, 1))
          (norm2): Identity()
          (dconv): DConv(
            (layers): ModuleList(
              (0): Sequential(
                (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(1,))
                (1): GroupNorm(1, 6, eps=1e-05, affine=True)
                (2): GELU(approximate='none')
                (3): Conv1d(6, 96, kernel_size=(1,), stride=(1,))
                (4): GroupNorm(1, 96, eps=1e-05, affine=True)
                (5): GLU(dim=1)
                (6): LayerScale()
              )
              (1): Sequential(
                (0): Conv1d(48, 6, kernel_size=(3,), stride=(1,), padding=(2,), dilation=(2,))
                (1): GroupNorm(1, 6, eps=1e-05, affine=True)
  

In [4]:
input_audio_path = "/kaggle/input/whisper-test-5/audio test timeless.unknown"
output_dir = "/kaggle/working/denoised"
os.makedirs(output_dir, exist_ok=True)

In [5]:
source = AudioFile(input_audio_path)
ref = source.read(streams=0, channels=1)
wav = ref[0]
sample_rate = source.samplerate()

In [6]:
# Step 1: Ensure wav is a tensor with shape (1, T)
if not isinstance(wav, torch.Tensor):
    wav = torch.tensor(wav)

In [7]:
if wav.ndim == 1:
    wav = wav.unsqueeze(0)
print(wav.shape)

torch.Size([1, 10129408])


In [8]:
if wav.shape[0] == 1:
    wav = torch.cat([wav, wav], dim=0)
print(wav.shape)

torch.Size([2, 10129408])


In [9]:
wav = wav.unsqueeze(0).float()
print(wav.shape)

torch.Size([1, 2, 10129408])


In [10]:
with torch.no_grad():
    sources = apply_model(model, wav)

In [11]:
vocals = sources[0][3]
vocals_path = os.path.join(output_dir, "vocals.wav")
torchaudio.save(vocals_path, vocals.cpu(), sample_rate)

print("✅ Denoising complete. Saved vocals to:", vocals_path)

✅ Denoising complete. Saved vocals to: /kaggle/working/denoised/vocals.wav


In [None]:
from IPython.display import Audio

Audio(vocals_path)

## Speech to Text 

In [12]:
from transformers import pipeline
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", return_timestamps=True)
result = asr("/kaggle/working/denoised/vocals.wav")
print(result["text"])

2025-07-01 18:40:42.500564: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751395242.722512      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751395242.785229      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


 Down the block there's an antique shop and something in my head said stop so I walked in On the counter was a cardboard box and the sign said photos, 25 cents each Black and white, saw his thirties bright And two lovers slapping on the board to their first house The kind of love that you only find once in a lifetime The kind you don't put down And that's when I called you and it's so hard to explain But in those photos I saw us instead And somehow I know that you and I would've found each other In another life, you still would've turned my head Even if we'd been on a crowded street in 1944 And you were headed off to fight in the war You still would've been mine, we would've been timeless I would've read your love letters every single night And prayed to God you'd be coming home alright And you would've been fine, we would've been timeless Cause I believe that we were supposed to find this So even in a different life, you still would've been mine We would've been timeless I had to smil

In [None]:
transcription = result["text"]

In [13]:
transcript = result['text']

## Summarization

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PegasusTokenizer, PegasusForConditionalGeneration
from textwrap import wrap
import torch

In [15]:
# Load mT5 model
mt5_model_name = "csebuetnlp/mT5_multilingual_XLSum"
mt5_tokenizer = AutoTokenizer.from_pretrained(mt5_model_name)
mt5_model = AutoModelForSeq2SeqLM.from_pretrained(mt5_model_name)

# Load Pegasus model
pegasus_model_name = "google/pegasus-multi_news"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name)

#gpu or cpu
device = "cuda" if torch.cuda.is_available() else "cpu"
mt5_model.to(device)
pegasus_model.to(device)

tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/730 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-multi_news and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

In [16]:
def chunk_transcript(transcript, max_chunk_chars=1000):
    return wrap(transcript, width=max_chunk_chars, break_long_words=False)

def summarize_with_mt5(text):
    input_ids = mt5_tokenizer(text, return_tensors="pt", truncation=True, max_length=512).input_ids.to(device)
    summary_ids = mt5_model.generate(input_ids, max_length=80, num_beams=4)
    return mt5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def summarize_all_chunks(chunks):
    chunk_summaries = []
    for i, chunk in enumerate(chunks):
        print(f"Summarizing chunk {i+1}/{len(chunks)}...")
        chunk_summary = summarize_with_mt5(chunk)
        chunk_summaries.append(chunk_summary)
    return " ".join(chunk_summaries)

def summarize_with_pegasus(text):
    inputs = pegasus_tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).input_ids.to(device)
    summary_ids = pegasus_model.generate(inputs, max_length=150, num_beams=4)
    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

def full_summary_pipeline(transcript):
    chunks = chunk_transcript(transcript)
    print(f"\n Total Chunks: {len(chunks)}")
    combined_chunk_summary = summarize_all_chunks(chunks)
    print("\n Intermediate Summary:\n", combined_chunk_summary)
    
    print("\n Generating Final Summary using PEGASUS...")
    final_summary = summarize_with_pegasus(combined_chunk_summary)
    return final_summary

final_summary = full_summary_pipeline(transcript)
print("\n✅ Final Summary:\n", final_summary)


 Total Chunks: 2
Summarizing chunk 1/2...
Summarizing chunk 2/2...

 Intermediate Summary:
 The love story of a British couple who were killed in World War Two has been told by the BBC. When I first saw your face in the 1500s off in a foreign land, I was forced to marry another man.

 Generating Final Summary using PEGASUS...

✅ Final Summary:
 – The story of a British couple who were killed in World War II has been told by the BBC. The love story of a British couple who were killed in World War II has been told by the BBC. "When I first saw your face in the 1500s off in a foreign land, I was forced to marry another man," wrote one of them in a letter to the couple's daughter. "When I first saw your face in the 1930s off in a foreign land, I was forced to marry another man."


In [None]:
# from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# # Load model and tokenizer
# model_name = "google/pegasus-large"
# tokenizer = PegasusTokenizer.from_pretrained(model_name)
# pegasus = PegasusForConditionalGeneration.from_pretrained(model_name)

# def summarize_text(text):
#     inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
#     summary_ids = pegasus.generate(inputs["input_ids"], max_length=150, min_length=15, length_penalty=2.0, num_beams=4)
#     summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
#     return summary

# # Summarize transcription
# summary = summarize_text(transcription)
# print("📌 Summary:\n", summary)


In [None]:
# from transformers import pipeline
# import textwrap

# # Load summarization pipeline
# summarizer = pipeline("summarization", model="google/pegasus-large", tokenizer="google/pegasus-large")

# # Split long text into chunks (1000 tokens ~ 3000-3500 characters)
# def split_into_chunks(text, max_chunk_size=3500):
#     return textwrap.wrap(text, width=max_chunk_size, break_long_words=False)

# # Assume 'transcript' contains your full Whisper output
# chunks = split_into_chunks(transcript)

# # Summarize each chunk
# chunk_summaries = [summarizer(chunk, max_length=120, min_length=30, do_sample=False)[0]["summary_text"] for chunk in chunks]

# # Optional: summarize all summaries into one
# final_summary = summarizer(" ".join(chunk_summaries), max_length=150, min_length=50, do_sample=False)[0]["summary_text"]

# print("Final Summary:\n", final_summary)
