## Install dependencies

In [None]:
!pip install -q demucs
!pip install -q transformers torchaudio librosa accelerate

In [None]:
from demucs.apply import apply_model
from demucs.pretrained import get_model
from demucs.audio import AudioFile
import torchaudio
import torch
import os

## Noise reduction model

In [None]:
# Fetch the pretrained "htdemucs" model and put it in evaluation mode.
# The bare `model.eval()` stays last so the notebook still echoes its return value.
model = get_model("htdemucs")
model.eval()

In [None]:
# Path of the recording to process and the directory that will receive
# the separated audio.
input_audio_path = "/kaggle/input/whisper-test-5/audio test timeless.unknown"
output_dir = "/kaggle/working/denoised"
# exist_ok=True makes re-running this cell safe when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Decode the first audio stream as mono, resampled on the fly to the sample
# rate the separation model expects. Feeding audio at any other rate makes the
# model hear the signal sped up or slowed down, which degrades the separation.
source = AudioFile(input_audio_path)
ref = source.read(streams=0, samplerate=model.samplerate, channels=1)
# Take the first (only) channel of the decoded stream.
wav = ref[0]
# The waveform now lives at the model's rate, so everything downstream
# (including the saved file) must use that rate, not the file's original one.
sample_rate = model.samplerate

In [None]:
# Step 1: make sure the waveform is a torch.Tensor; anything else is
# converted with torch.tensor, an existing tensor is kept untouched.
wav = wav if isinstance(wav, torch.Tensor) else torch.tensor(wav)

In [None]:
# A 1-D waveform gets a leading axis so it becomes (1, T); tensors that
# already have more than one dimension are left as they are.
wav = wav.unsqueeze(0) if wav.ndim == 1 else wav
print(wav.shape)

In [None]:
# When the first axis has size 1, stack the waveform on itself along that
# axis so it has two identical entries; otherwise leave it unchanged.
wav = torch.cat((wav, wav), dim=0) if wav.size(0) == 1 else wav
print(wav.shape)

In [None]:
# Prepend one more axis (indexing with None is equivalent to unsqueeze(0))
# and cast the samples to float32.
wav = wav[None, ...].float()
print(wav.shape)

In [None]:
# Run the separation on the GPU when one is available. `wav` itself stays on
# its current device; apply_model only performs the computation on `device`,
# so the cells below keep working unchanged.
separation_device = "cuda" if torch.cuda.is_available() else "cpu"
with torch.no_grad():  # inference only, no gradients needed
    sources = apply_model(model, wav, device=separation_device)

In [None]:
# apply_model returns one waveform per entry of model.sources for each batch
# item. Look the vocals stem up by name instead of relying on a fixed position,
# so the cell keeps working if a model orders its sources differently.
vocals_index = model.sources.index("vocals")
vocals = sources[0][vocals_index]  # batch item 0, vocals stem
vocals_path = os.path.join(output_dir, "vocals.wav")
# Move the tensor to the CPU before writing it to disk.
torchaudio.save(vocals_path, vocals.cpu(), sample_rate)

print("✅ Denoising complete. Saved vocals to:", vocals_path)

In [None]:
from IPython.display import Audio

# Leaving the Audio object as the cell's last expression lets the notebook
# render it — presumably as an inline player for the saved vocals file.
Audio(vocals_path)

## Speech to Text 

In [None]:
from transformers import pipeline 

In [None]:
from transformers import pipeline

# Build a Whisper large-v3 speech-recognition pipeline that also returns
# timestamps, then transcribe the separated vocals.
asr = pipeline("automatic-speech-recognition", model="openai/whisper-large-v3", return_timestamps=True)
# Reuse `vocals_path` from the separation step instead of repeating the literal
# path, so the two cannot drift apart if `output_dir` is changed.
result = asr(vocals_path)
print(result["text"])

In [None]:
# Keep the plain transcript text; the `summarize_text(transcription)` call
# in the Summarization section reads this name.
transcription = result["text"]

In [None]:
# Same text under a second name; the chunked summarization cell reads
# `transcript` rather than `transcription`.
transcript = result['text']

## Summarization

In [None]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load model and tokenizer
# Both come from the same "google/pegasus-large" checkpoint; `summarize_text`
# below uses them as module-level globals.
model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
pegasus = PegasusForConditionalGeneration.from_pretrained(model_name)

def summarize_text(text):
    """Return a summary of ``text`` generated by the module-level Pegasus model.

    Args:
        text: The text to summarize. It is tokenized with ``truncation=True``,
            so input beyond the tokenizer's length limit is dropped.

    Returns:
        The decoded summary string, or ``""`` when ``text`` is empty or
        contains only whitespace (there is nothing to summarize, and the model
        would otherwise invent text).
    """
    if not text or not text.strip():
        return ""

    inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")
    # Unpack the whole encoding so the attention mask reaches generate()
    # together with the input ids.
    summary_ids = pegasus.generate(
        **inputs,
        max_length=150,
        min_length=15,
        length_penalty=2.0,
        num_beams=4,
    )
    # Only one text was encoded, so the first sequence is the summary.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the full transcription with `summarize_text` and print the result.
summary = summarize_text(transcription)
print("📌 Summary:\n", summary)


In [None]:
from transformers import pipeline
import textwrap

# Load summarization pipeline
# Uses google/pegasus-large for both the model and the tokenizer.
# NOTE(review): this is the same checkpoint as `pegasus`/`tokenizer` loaded in
# the previous cell, so the weights are loaded a second time — confirm that is
# intended.
summarizer = pipeline("summarization", model="google/pegasus-large", tokenizer="google/pegasus-large")

# Break a long text into pieces small enough to summarize one at a time
# (the default of 3500 characters is meant to approximate ~1000 tokens).
def split_into_chunks(text, max_chunk_size=3500):
    """Split ``text`` into a list of chunks of at most ``max_chunk_size`` characters.

    Splitting happens at whitespace. A single word longer than
    ``max_chunk_size`` is kept whole rather than being cut.

    Args:
        text: The text to split.
        max_chunk_size: Maximum width of each chunk, in characters.

    Returns:
        A list of chunk strings (empty when ``text`` has no content).
    """
    wrapper = textwrap.TextWrapper(width=max_chunk_size, break_long_words=False)
    return wrapper.wrap(text)

# Assume 'transcript' contains your full Whisper output
chunks = split_into_chunks(transcript)

# Summarize each chunk. truncation=True clips a chunk that still tokenizes to
# more than the model accepts, instead of letting the call fail on it.
chunk_summaries = [
    summarizer(chunk, max_length=120, min_length=30, do_sample=False, truncation=True)[0]["summary_text"]
    for chunk in chunks
]

# Optional: summarize all summaries into one.
# With an empty transcript there are no chunk summaries, so skip the second
# pass rather than asking the model to summarize an empty string.
if chunk_summaries:
    final_summary = summarizer(
        " ".join(chunk_summaries),
        max_length=150,
        min_length=50,
        do_sample=False,
        truncation=True,  # the joined summaries of a long transcript can exceed the input limit
    )[0]["summary_text"]
else:
    final_summary = ""

print("Final Summary:\n", final_summary)
