In [None]:
# Installs
!pip -q install -U youtube-transcript-api transformers sentencepiece accelerate

# Imports
import re
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
    CouldNotRetrieveTranscript,
    RequestBlocked,
    IpBlocked,
)
from transformers import pipeline, AutoTokenizer

In [None]:
# ---------------- Config ----------------
YOUTUBE_URL = "https://www.youtube.com/watch?v=A4OmtyaBHFE"  # <- change to your video if needed
PREFERRED_LANGS = ["en", "en-US", "en-GB"]

# Summarizer settings (BART has ~1024 token limit; keep headroom)
MODEL_ID = "facebook/bart-large-cnn"
MAX_INPUT_TOKENS = 900          # per chunk; <1024 for BART
TARGET_SUMMARY_RATIO = 0.25     # per-chunk compression target
FINAL_RATIO = 0.35              # final merge compression

# Decoding constraints to reduce boilerplate/hallucinations
DECODE_KW = dict(
    do_sample=False,
    num_beams=4,
    no_repeat_ngram_size=3,
    early_stopping=True,
)

In [None]:
# ---------------- Helpers ----------------
def extract_video_id(url: str) -> str:
    p = urlparse(url)
    if p.netloc in {"youtu.be"}:
        return p.path.lstrip("/")
    if p.path == "/watch":
        return parse_qs(p.query).get("v", [""])[0]
    parts = [seg for seg in p.path.split("/") if seg]
    return parts[-1] if parts else ""

def fetch_transcript_raw(video_id: str, langs=PREFERRED_LANGS) -> list[dict]:
    """
    Uses the NEW instance-based API:
      yta = YouTubeTranscriptApi(); fetched = yta.fetch(video_id, ...); fetched.to_raw_data()
    Falls back to listing/translation if needed.
    """
    yta = YouTubeTranscriptApi()
    try:
        fetched = yta.fetch(video_id, languages=list(langs))
        return fetched.to_raw_data()
    except NoTranscriptFound:
        tlist = yta.list(video_id)
        try:
            tr = tlist.find_transcript(list(langs))
            return tr.fetch().to_raw_data()
        except NoTranscriptFound:
            first = next(iter(tlist))  # raises StopIteration if none at all
            return first.translate("en").fetch().to_raw_data()

def clean_transcript(text: str) -> str:
    # Remove obvious noise/URLs; keep content intact
    text = re.sub(r"https?://\S+|www\.\S+", " ", text)
    text = re.sub(r"\[[^\]]*\]", " ", text)                 # [music], [applause], etc.
    text = re.sub(r"\s{2,}", " ", text).strip()
    return text

def sent_split(text: str) -> list[str]:
    ABBR = ["Mr.", "Ms.", "Mrs.", "Dr.", "Prof.", "Sr.", "Jr.",
            "St.", "vs.", "etc.", "e.g.", "i.e."]
    MASK = "§DOT§"  # unlikely to appear in real text

    # mask abbreviation periods
    for ab in ABBR:
        text = text.replace(ab, ab.replace(".", MASK))

    # split on sentence enders; keep it simple & robust
    parts = re.split(r'(?<=[.!?])\s+', text)

    # unmask and clean
    parts = [p.replace(MASK, ".").strip() for p in parts if p.strip()]
    return parts


def sentence_chunks_by_tokens(sents, tokenizer, max_tokens):
    buf, toks = [], 0
    for s in sents:
        ids = tokenizer.encode(s, add_special_tokens=False)
        if toks + len(ids) > max_tokens and buf:
            yield " ".join(buf)
            buf, toks = [s], len(ids)
        else:
            buf.append(s)
            toks += len(ids)
    if buf:
        yield " ".join(buf)

def dynamic_lengths(token_count: int, ratio: float, floor=40, ceil=240):
    tgt = max(floor, min(ceil, int(token_count * ratio)))
    mx = min(ceil, max(tgt + 40, int(tgt * 1.4)))
    return max(20, min(tgt, mx - 10)), mx

In [13]:
# ---------------- Main ----------------
video_id = extract_video_id(YOUTUBE_URL)
assert video_id, "Could not parse a YouTube video ID."

# 1) Get transcript (raw dicts with {text, start, duration})
try:
    raw_snippets = fetch_transcript_raw(video_id)
    if not raw_snippets:
        raise SystemExit("Transcript fetched but empty.")
except TranscriptsDisabled:
    raise SystemExit("Subtitles are disabled for this video.")
except (RequestBlocked, IpBlocked):
    raise SystemExit("YouTube is blocking this IP. Try locally or with a residential proxy.")
except CouldNotRetrieveTranscript as e:
    raise SystemExit(f"Could not retrieve transcript: {e}")
except NoTranscriptFound:
    raise SystemExit("No transcript exists in any language for this video.")

# 2) Print the FULL transcript (time-coded) and as one big block
print("=============== FULL TRANSCRIPT (time-coded) ===============\n")
for s in raw_snippets:
    start = s.get("start", 0.0)
    dur = s.get("duration", 0.0)
    txt = s.get("text", "")
    print(f"[{start:.2f}s → {start+dur:.2f}s] {txt}")

full_transcript_text = " ".join(s["text"] for s in raw_snippets if s.get("text"))
print("\n=============== FULL TRANSCRIPT (single block) ===============\n")
print(full_transcript_text)

# Also save to file (optional)
with open("transcript_full.txt", "w", encoding="utf-8") as f:
    f.write(full_transcript_text)

# 3) Summarize to a FULL narrative summary  (robust version)
clean_text = clean_transcript(full_transcript_text)

# Use the SAME tokenizer as the model and pass it into the pipeline
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
summarizer = pipeline(
    "summarization",
    model=MODEL_ID,
    tokenizer=tokenizer,
    framework="pt",
    device_map="auto",
)

def safe_summarize(txt, min_len, max_len):
    # guard against empty / ultra-short / corrupted inputs
    if not txt or txt.strip() == "":
        return ""
    # hard cap to the model's context length to avoid bad ids
    return summarizer(
        txt,
        min_length=min_len,
        max_length=max_len,
        truncation=True,            # <— important
        **DECODE_KW,
    )[0]["summary_text"]

# Split into token-aware sentence chunks
sents = sent_split(clean_text)
chunks = list(sentence_chunks_by_tokens(sents, tokenizer, MAX_INPUT_TOKENS))

chunk_summaries = []
for ch in chunks:
    # sanitize weird control chars that can create odd token ids
    ch = ch.encode("utf-8", "ignore").decode("utf-8")
    tl = len(tokenizer.encode(ch, add_special_tokens=False))
    if tl == 0:
        continue
    mn, mx = dynamic_lengths(tl, TARGET_SUMMARY_RATIO, floor=50, ceil=220)
    try:
        out = safe_summarize(ch, mn, mx)
    except IndexError:
        # Rarely BART hits an embedding OOR; retry with a smaller model
        fallback = "sshleifer/distilbart-cnn-12-6"
        fb_tok = AutoTokenizer.from_pretrained(fallback, use_fast=True)
        fb_pipe = pipeline("summarization", model=fallback, tokenizer=fb_tok, framework="pt", device_map="auto")
        out = fb_pipe(ch, min_length=max(30, int(mn*0.8)), max_length=max(60, int(mx*0.8)), truncation=True, **DECODE_KW)[0]["summary_text"]
    chunk_summaries.append(out)

# Final synthesis
merge_text = " ".join(chunk_summaries)
merge_text = merge_text.encode("utf-8", "ignore").decode("utf-8")
mtoks = len(tokenizer.encode(merge_text, add_special_tokens=False))
m_min, m_max = dynamic_lengths(mtoks, FINAL_RATIO, floor=80, ceil=260)
try:
    final_summary = safe_summarize(merge_text, m_min, m_max)
except IndexError:
    # fallback once more for the merge step
    fallback = "sshleifer/distilbart-cnn-12-6"
    fb_tok = AutoTokenizer.from_pretrained(fallback, use_fast=True)
    fb_pipe = pipeline("summarization", model=fallback, tokenizer=fb_tok, framework="pt", device_map="auto")
    final_summary = fb_pipe(merge_text, min_length=max(60, int(m_min*0.8)), max_length=max(140, int(m_max*0.8)), truncation=True, **DECODE_KW)[0]["summary_text"]

print("\n=============== FULL SUMMARIZED TEXT ===============\n")
print(final_summary)

with open("summary_full.txt", "w", encoding="utf-8") as f:
    f.write(final_summary)
print("\nFiles saved: transcript_full.txt, summary_full.txt")



[1.44s → 6.90s] for Germany it's the end of an era and
[4.86s → 9.66s] as Europe's biggest economy there are
[6.90s → 11.22s] some huge challenges ahead
[9.66s → 13.50s] from its increasingly complex
[11.22s → 16.20s] relationship with China to climate
[13.50s → 19.74s] change all eyes will be on how Germany's
[16.20s → 21.60s] new leaders grapple with these issues
[19.74s → 23.28s] but there's one German industry that
[21.60s → 25.38s] reveals a lot about the country's
[23.28s → 26.82s] prospects
[25.38s → 28.86s] it's cars
[26.82s → 30.66s] traditionally the car industry here has
[28.86s → 32.88s] been very powerful there have been open
[30.66s → 33.96s] doors to German Ministries to the
[32.88s → 36.54s] Chancery
[33.96s → 38.88s] Germany's car industry is a vital part
[36.54s → 41.46s] of its economy with links to government
[38.88s → 43.44s] that go back decades
[41.46s → 45.78s] how this world renowned motor industry
[43.44s → 47.40s] navigates the challenges ahead could
[45.78s

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Device set to use cpu




Germany's car industry is a vital part of its economy with links to government that go back decades. The industry has proved of Bellwether for the country's future prospects too since the 70s car exports have risen with Germany's wealth and influence manufacturing is incredibly important in Germany it represents around 18 of GDP and of course the car sector represents a very large part of that. Germany's new leader will face the same challenge as its car industry how to keep their biggest trading partner on side without ending up in its pocket.

Files saved: transcript_full.txt, summary_full.txt


In [14]:
# ---------------- SAVE TRANSCRIPT + SUMMARY TO CSV (WITH TIMESTAMPS) ----------------
import math
import pandas as pd

# 1) Per-segment transcript (start/end are in seconds)
segments_df = pd.DataFrame(
    [
        {
            "start": float(s.get("start", 0.0)),
            "end": float(s.get("start", 0.0)) + float(s.get("duration", 0.0)),
            "text": s.get("text", "").strip(),
        }
        for s in raw_snippets
    ]
)

# If your transcript source doesn't include "duration", compute it from deltas:
if "duration" not in raw_snippets[0]:
    # derive end from the next segment's start, last one gets same as start (or add a small epsilon)
    starts = segments_df["start"].tolist()
    ends = starts[1:] + [starts[-1]]
    segments_df["end"] = ends

segments_df.to_csv("transcript_timecoded.csv", index=False)
print("[CSV] Saved per-segment transcript -> transcript_timecoded.csv")

# 2) Windowed transcript + summary (timestamps)
#    Group transcript into fixed windows, then summarize each window's text.
WINDOW_SECONDS = 30  # adjust as you like (e.g., 15, 30, 60)

def summarize_short_block(txt: str):
    # use your existing tokenizer/summarizer (already created above)
    # choose dynamic lengths based on token count; keep it brief per window
    ids = tokenizer.encode(txt, add_special_tokens=False)
    if not ids:
        return ""
    mn, mx = dynamic_lengths(len(ids), ratio=0.30, floor=40, ceil=160)
    return summarizer(
        txt,
        min_length=mn,
        max_length=mx,
        truncation=True,
        **DECODE_KW,
    )[0]["summary_text"]

# assign window id per segment by its start time
segments_df["window_id"] = (segments_df["start"] // WINDOW_SECONDS).astype(int)

# aggregate text per window & compute start/end bounds
agg = (
    segments_df
    .groupby("window_id", as_index=False)
    .agg(
        window_start=("start", "min"),
        window_end=("end", "max"),
        window_text=("text", lambda xs: " ".join(t for t in xs if isinstance(t, str) and t.strip()))
    )
)

# summarize each window
summaries = []
for row in agg.itertuples(index=False):
    txt = row.window_text.strip()
    if txt:
        try:
            s = summarize_short_block(txt)
        except Exception:
            # very rare fallback—shrink the target lengths if any window trips limits
            ids = tokenizer.encode(txt, add_special_tokens=False)
            mn, mx = dynamic_lengths(len(ids), ratio=0.20, floor=30, ceil=120)
            s = summarizer(txt, min_length=mn, max_length=mx, truncation=True, **DECODE_KW)[0]["summary_text"]
    else:
        s = ""
    summaries.append(s)

agg["window_summary"] = summaries

# nice, human timestamps (HH:MM:SS)
def hhmmss(sec: float) -> str:
    sec = max(0, int(round(sec)))
    h = sec // 3600
    m = (sec % 3600) // 60
    s = sec % 60
    return f"{h:02d}:{m:02d}:{s:02d}"

agg["window_start_hms"] = agg["window_start"].apply(hhmmss)
agg["window_end_hms"]   = agg["window_end"].apply(hhmmss)

# reorder columns
agg = agg[[
    "window_id",
    "window_start", "window_end",
    "window_start_hms", "window_end_hms",
    "window_text",
    "window_summary",
]]

agg.to_csv("transcript_windows_with_summaries.csv", index=False)
print("[CSV] Saved windowed transcript + per-window summary -> transcript_windows_with_summaries.csv")


Your max_length is set to 80, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)


[CSV] Saved per-segment transcript -> transcript_timecoded.csv


Your max_length is set to 80, but your input_length is only 63. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)
Your max_length is set to 80, but your input_length is only 78. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=39)
Your max_length is set to 80, but your input_length is only 67. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=33)
Your max_length is set to 80, but your input_length is only 71. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=35)
Your max

[CSV] Saved windowed transcript + per-window summary -> transcript_windows_with_summaries.csv
