In [1]:
import librosa
import soundfile as sf
from tqdm import tqdm
import whisper 

In [15]:
y, sr = librosa.load("test.mp3", sr=None)
chunk_length = 300 * sr  # 5 min * sample rate
# Process in chunks
for i in tqdm(range(0, len(y), chunk_length), desc="Chunking audio"):
    # Write each chunk as WAV file
    sf.write(
        f"./chunks/chunk_{i//chunk_length}.wav",
        y[i:i+chunk_length],
        sr,
        format='WAV'  # Explicit format
    )

Chunking audio: 100%|██████████| 2/2 [00:00<00:00,  5.08it/s]


In [None]:
model = whisper.load_model("base")

In [17]:
import os 
chunks_folder = "./chunks/"
# Transcribe each chunk
output_file = "transcription.txt"

chunk_files = sorted([f for f in os.listdir(chunks_folder) if f.endswith('.wav')])

with open(output_file, "w", encoding="utf-8") as f:
    for chunk_file in tqdm(chunk_files, desc="Transcribing chunks"):
        result = model.transcribe(os.path.join(chunks_folder, chunk_file))
        f.write(f"Transcription for {chunk_file}:\n")
        f.write(result['text'] + "\n\n")
        print(f"Processed {chunk_file}")

print(f"Transcription saved to {output_file}")

Transcribing chunks:  50%|█████     | 1/2 [00:34<00:34, 34.01s/it]

Processed chunk_0.wav


Transcribing chunks: 100%|██████████| 2/2 [00:38<00:00, 19.01s/it]

Processed chunk_1.wav
Transcription saved to transcription.txt





In [20]:
import torch
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


In [23]:
with open("transcription.txt", "r", encoding="utf-8") as f:
    content = f.read()

In [24]:
sections = {}
current_chunk = []
for line in content.splitlines():
    if line.startswith("Transcription for"):
        current_chunk = line.split(" ")[-1].replace(":", "").replace(".wav", "")
        sections[current_chunk] = ""
    else:
        if current_chunk and line.strip():
            sections[current_chunk] += line.strip() + " "

In [27]:
# Group into 5-minute sections 
grouped = {}
chunks_per_5min = 10  # adjust if your chunks are different
sorted_chunks = sorted(sections.keys(), key=lambda x: int(x.split("_")[-1]))

In [29]:
max_input_length = 1024  # tokens, but roughly ~1024 words for BART
for i in range(0, len(sorted_chunks), chunks_per_5min):
    group_id = i // chunks_per_5min + 1
    start_min = (i * 30) // 60
    end_min = ((i + chunks_per_5min) * 30) // 60
    section_label = f"Section {group_id}: Minute {start_min}–{end_min}"

    text_block = " ".join(sections[ch] for ch in sorted_chunks[i:i+chunks_per_5min])
    # Truncate text_block if too long
    text_block = text_block[:4000]  # 4000 chars is usually safe for BART

    summary = summarizer(text_block, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
    grouped[section_label] = summary

In [31]:
for sec, summ in grouped.items():
    print(f"\n=== {sec} ===\n {summary} \n")


=== Section 1: Minute 0–5 ===
 You need to put in 10,000 hours of work to become an expert at a thing. I think a lot of people in the beginner stage get paralyzed by the choice. I wouldn't say I hate teaching, I tolerate teaching, but it's not like the act of teaching that I like. 



In [32]:
with open("summary.txt", "w", encoding="utf-8") as f:
    for sec, summ in grouped.items():
        f.write(f"=== {sec} ===\n{summ}\n\n")
print("Summaries saved to summary.txt")

Summaries saved to summary.txt
