Part 2 - YouTube Transcript



In [6]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.4-py3-none-any.whl.metadata (24 kB)
Downloading youtube_transcript_api-1.2.4-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.2/485.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.2.4


In [4]:
# NOTE(review): `langchain.tools` is a Python module path, not a PyPI distribution name;
# pip presumably normalizes it to `langchain-tools`, which may not be LangChain itself — confirm.
# The `tool` decorator used in this notebook is imported from `langchain_core.tools`.
!pip install langchain.tools



In [1]:
from google.colab import userdata
import os

# Copy the Colab secret into the process environment; get_llm() checks os.environ for this key.
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

In [2]:
!pip install youtube-transcript-api langchain langchain-openai langgraph python-dotenv tiktoken



In [6]:
import asyncio
from typing import TypedDict, Annotated, Sequence, Literal
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, AIMessage
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode
from langgraph.graph.message import add_messages

In [9]:
import re

In [7]:
@tool
def extract_video_id(url: str) -> str:
    """
    Extracts the 11-character YouTube video ID from a URL.

    Recognized URL shapes: watch?v=<id>, youtu.be/<id>, embed/<id>, /shorts/<id>,
    /live/<id> and /v/<id>.

    Args:
        url (str): A YouTube URL containing a video ID.

    Returns:
        str: Extracted video ID or error message if parsing fails.
    """

    # A known marker introduces the ID. `\bv=` keeps parameters such as `rev=` from matching,
    # and `youtu\.be/` replaces the bare `be/` so unrelated path segments ending in "be/" are ignored.
    # The trailing negative lookahead rejects strings longer than 11 ID characters instead of
    # silently returning their first 11 characters as if they were a valid ID.
    pattern = (
        r'(?:\bv=|youtu\.be/|embed/|/shorts/|/live/|/v/)'
        r'([a-zA-Z0-9_-]{11})(?![a-zA-Z0-9_-])'
    )
    # `url or ""` turns a None/empty input into the normal error string rather than a TypeError.
    match = re.search(pattern, url or "")
    return match.group(1) if match else "Error: Invalid YouTube URL"

In [10]:
# Smoke-test the tool directly with a standard watch URL.
extract_video_id.run("https://www.youtube.com/watch?v=hfIUstzHs9A")

'hfIUstzHs9A'

In [11]:
# Tool registry; at this point it holds only the video-ID extraction tool.
tools = [extract_video_id]

In [12]:
from youtube_transcript_api import YouTubeTranscriptApi


@tool
def fetch_transcript(video_id: str, language: str = "en") -> str:
    """
    Fetches the transcript of a YouTube video.

    Args:
        video_id (str): The YouTube video ID (e.g., "dQw4w9WgXcQ").
        language (str): Language code for the transcript (e.g., "en", "es").

    Returns:
        str: The transcript text or an error message.
    """
    # NOTE(review): docstring kept verbatim — @tool presumably exposes it as the tool description.

    try:
        fetched = YouTubeTranscriptApi().fetch(video_id, languages=[language])
        # Flatten the snippet objects into one space-separated string.
        pieces = (part.text for part in fetched.snippets)
        return " ".join(pieces)
    except Exception as exc:
        # Failures are reported in-band as an "Error: ..." string rather than raised.
        return f"Error: {exc}"

In [13]:
# Fetch the transcript for the same video ID, relying on the default language ("en").
fetch_transcript.run("hfIUstzHs9A")

'Over the past couple of months, large language models, or LLMs, such as chatGPT, have taken the world by storm. Whether it\'s writing poetry or helping plan your upcoming vacation, we are seeing a step change in the performance of AI and its potential to drive enterprise value. My name is Kate Soule. I\'m a senior manager of business strategy at IBM Research, and today I\'m going to give a brief overview of this new field of AI that\'s emerging and how it can be used in a business setting to drive value. Now, large language models are actually a part of a different class of models called foundation models. Now, the term "foundation models" was actually first coined by a team from Stanford when they saw that the field of AI was converging to a new paradigm. Where before AI applications were being built by training, maybe a library of different AI models, where each AI model was trained on very task-specific data to perform very specific task. They predicted that we were going to start 

In [20]:
import os
import math
import re
import json
from typing import List, Dict, Any
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

# Chat model name and sampling temperature handed to ChatOpenAI in get_llm().
MODEL_NAME = "gpt-4o"
TEMPERATURE = 0.0
# Default maximum number of characters per chunk in chunk_text().
CHUNK_CHAR_SIZE = 3500
# Not sent to the API as a token limit in this cell: summarize_transcript() only derives
# the requested word range from it (value // 10 up to value // 5).
SUMMARY_MAX_TOKENS = 400
# Default number of concepts requested by extract_key_concepts() / analyze_video().
KEY_CONCEPTS_COUNT = 8
# Default number of questions requested by generate_quiz() / analyze_video().
QUIZ_QUESTIONS = 6

def get_llm():
    """Create a ChatOpenAI client configured with MODEL_NAME and TEMPERATURE.

    The API key is only checked for presence here; it is not passed to the
    client explicitly.

    Raises:
        RuntimeError: if OPENAI_API_KEY is missing or empty in the environment.
    """
    if os.environ.get("OPENAI_API_KEY"):
        return ChatOpenAI(model=MODEL_NAME, temperature=TEMPERATURE)
    raise RuntimeError("OPENAI_API_KEY not set in environment. Set it before calling get_llm().")

def chunk_text(text: str, max_chars: int = CHUNK_CHAR_SIZE) -> List[str]:
    """Split `text` into consecutive chunks of at most `max_chars` characters.

    Each chunk boundary is moved back to just after the last sentence end
    (". ") or newline inside the current window when one exists; otherwise the
    window is cut hard at `max_chars`. Chunks are stripped of surrounding
    whitespace and chunks that end up empty are dropped.

    Args:
        text: The text to split. Empty/None yields an empty list.
        max_chars: Maximum window size in characters; must be positive.

    Returns:
        The non-empty chunks, in document order.

    Raises:
        ValueError: if `max_chars` is not positive (the scan could never advance).
    """
    if not text:
        return []
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + max_chars, n)
        if end < n:
            # Only separators actually found in the window are cut candidates. A "not found"
            # result (-1) must not be turned into an offset: that would yield a cut at index 0/1,
            # producing a 1-character chunk and then a loop that never advances.
            candidates = []
            last_period = text.rfind(". ", start, end)
            if last_period != -1:
                candidates.append(last_period + 2)  # cut after the ". "
            last_newline = text.rfind("\n", start, end)
            if last_newline != -1:
                candidates.append(last_newline + 1)  # cut after the newline
            if candidates:
                # Every candidate is > start, so progress is guaranteed.
                end = max(candidates)
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks

def call_llm_prompt(prompt_messages: List[dict]) -> str:
    """Send a list of role/content dicts to the chat model and return the reply text.

    Args:
        prompt_messages: Dicts with optional "role" and "content" keys. A role of
            "system" becomes a SystemMessage; every other role (including a
            missing one) becomes a HumanMessage.

    Returns:
        The reply content as a string. An empty reply is returned as "" rather
        than as the repr of the response object.
    """
    llm = get_llm()
    messages = []
    for m in prompt_messages:
        role = m.get("role", "user")
        content = m.get("content", "")
        message_cls = SystemMessage if role == "system" else HumanMessage
        messages.append(message_cls(content=content))
    resp = llm.invoke(messages)

    if isinstance(resp, dict):
        reply = resp.get("content")
    else:
        reply = getattr(resp, "content", None)

    if reply is None:
        # No recognizable content field at all: fall back to the object's string form.
        return str(resp)
    if isinstance(reply, str):
        # Includes the empty string; callers strip/parse it and should not receive a repr.
        return reply
    if isinstance(reply, list):
        # NOTE(review): LangChain message content may be a list of str / {"text": ...} blocks
        # — confirm against the installed version. Only the textual parts are kept.
        parts = []
        for block in reply:
            if isinstance(block, str):
                parts.append(block)
            elif isinstance(block, dict) and isinstance(block.get("text"), str):
                parts.append(block["text"])
        return "".join(parts)
    return str(reply)

def summarize_transcript(transcript: str) -> str:
    """Summarize a transcript in two stages (per-chunk bullets, then one merged summary).

    The transcript is split with chunk_text(); each chunk is summarized by its
    own LLM call, and the stripped chunk summaries are then merged by one final
    LLM call. The requested word range of the final summary is derived from
    SUMMARY_MAX_TOKENS (// 10 to // 5).

    Args:
        transcript: Full transcript text.

    Returns:
        The merged summary, or "No transcript available." for an empty transcript.
    """
    if not transcript:
        return "No transcript available."

    pieces = chunk_text(transcript)
    total = len(pieces)

    def _chunk_request(position, body):
        # Prompt for a single chunk; `position` is 1-based.
        return [
            {"role": "system", "content": "You are a concise summarization assistant."},
            {
                "role": "user",
                "content": (
                    "Summarize the following portion of a lecture/transcript in 3-5 short bullet points. "
                    "Be concise and focus on the main points.\n\n"
                    f"PORTION (chunk {position}/{total}):\n{body}"
                ),
            },
        ]

    # Stage 1: one LLM call per chunk, in document order.
    partials = [
        call_llm_prompt(_chunk_request(position, body)).strip()
        for position, body in enumerate(pieces, start=1)
    ]

    # Stage 2: merge the bullet lists into a single short summary.
    combined = "\n\n".join(partials)
    low_words = SUMMARY_MAX_TOKENS // 10
    high_words = SUMMARY_MAX_TOKENS // 5
    merge_request = [
        {"role": "system", "content": "You are a helpful assistant that produces a short coherent summary."},
        {
            "role": "user",
            "content": (
                "Given the following bullet-style summaries from different parts of a lecture, "
                f"produce a single concise summary (around {low_words}–{high_words} words) "
                "that captures the overall topic, the main arguments/steps, and the important conclusions. "
                "Use 4–8 short sentences.\n\n"
                f"INPUT:\n{combined}"
            ),
        },
    ]
    return call_llm_prompt(merge_request).strip()

def extract_key_concepts(transcript: str, n_concepts:int = KEY_CONCEPTS_COUNT) -> List[Dict[str,str]]:
    """Ask the LLM for the key concepts of a transcript and parse them into dicts.

    The reply is parsed line by line. Each non-empty line is expected to look
    like "<concept>: <explanation>" or "<concept> - <explanation>", optionally
    preceded by a list marker ("1. ", "2) ", "- ", "* ") and optionally wrapped
    in markdown emphasis. Lines without a separator are kept by placing their
    first 60 characters in "concept" and the remainder in "explanation".

    Args:
        transcript: Transcript text; only the first 20000 characters are sent.
        n_concepts: Maximum number of concepts to return.

    Returns:
        Up to `n_concepts` dicts with "concept" and "explanation" keys. Empty
        when the transcript is empty or `n_concepts` is not positive (no LLM
        call is made in that case).
    """
    if not transcript or n_concepts <= 0:
        return []

    prompt = [
        {"role":"system", "content":"You are a concise educational assistant who extracts important concepts."},
        {"role":"user", "content": (
            f"From the following transcript, list the {n_concepts} most important concepts or terms a student should learn. "
            "For each concept, give a one-sentence plain-English explanation and (optionally) a short example or formula.\n\n"
            f"TRANSCRIPT:\n{(transcript[:20000] + '...') if len(transcript) > 20000 else transcript}"
        )},
    ]
    out = call_llm_prompt(prompt)

    # Leading list marker: bullets or "12." / "12)" followed by whitespace. Removing it with an
    # anchored regex (instead of stripping digits from both ends) keeps names like "Python 3" intact.
    list_marker = re.compile(r"^\s*(?:[-*•]+\s+|\d+\s*[.)]\s+)")
    # Separator: a colon, or a dash surrounded by whitespace. Requiring the whitespace keeps
    # hyphenated terms such as "Object-oriented" from being split mid-word.
    separator = re.compile(r"\s*:\s*|\s+[-–—]\s+")
    emphasis_chars = " *_`"

    concepts = []
    for raw_line in out.splitlines():
        if len(concepts) >= n_concepts:
            break
        line = raw_line.strip()
        if not line:
            continue
        body = list_marker.sub("", line, count=1)
        parts = separator.split(body, maxsplit=1)
        concept = parts[0].strip(emphasis_chars) if len(parts) == 2 else ""
        if concept:
            # lstrip handles the "**Concept:** explanation" form, where the closing
            # emphasis marker lands at the start of the explanation.
            explanation = parts[1].lstrip("*_` ").strip()
            concepts.append({"concept": concept, "explanation": explanation or "See transcript."})
        else:
            concepts.append({"concept": body[:60], "explanation": body[60:].strip() or "See transcript."})
    return concepts

def _synthesize_quiz_from_concepts(concepts: List[Dict[str,str]], n_questions: int):
    items = []
    for i in range(min(n_questions, len(concepts))):
        c = concepts[i]
        q = f"What is the best short description of \"{c['concept']}\"?"
        correct = c['explanation']
        distractors = []
        for j in range(len(concepts)):
            if j != i and len(distractors) < 3:
                distractors.append(concepts[j]['explanation'].split('.')[0])
        while len(distractors) < 3:
            distractors.append("None of the above")
        options = [correct] + distractors
        import random
        random.shuffle(options)
        answer_index = options.index(correct)
        items.append({
            "question": q,
            "options": options,
            "answer_index": answer_index,
            "explanation": correct
        })
    return items

def _load_quiz_json_array(raw: str):
    """Return the JSON array contained in `raw`, or None if no list can be parsed from it."""
    try:
        parsed = json.loads(raw.strip())
    except (ValueError, TypeError, AttributeError):
        parsed = None
    if isinstance(parsed, list):
        return parsed
    # The reply may wrap the array in prose, a code fence or an outer object:
    # take the span from the first "[" to the last "]".
    m = re.search(r"(\[.*\])", raw or "", flags=re.S)
    if not m:
        return None
    try:
        parsed = json.loads(m.group(1))
    except ValueError:
        return None
    return parsed if isinstance(parsed, list) else None


def _valid_quiz_items(parsed, n_questions: int) -> List[Dict[str,Any]]:
    """Keep up to `n_questions` well-formed quiz items from a parsed JSON list."""
    items = []
    for item in parsed or []:
        if len(items) >= n_questions:
            break
        if not isinstance(item, dict):
            continue
        if not all(key in item for key in ("question", "options", "answer_index", "explanation")):
            continue
        options = item["options"]
        if not isinstance(options, list) or len(options) != 4:
            continue
        try:
            answer_index = int(item["answer_index"])
        except (TypeError, ValueError):
            # e.g. "B" or null: skip the item instead of aborting the whole quiz.
            continue
        if not 0 <= answer_index < len(options):
            continue
        items.append({
            "question": item["question"],
            "options": options,
            "answer_index": answer_index,
            "explanation": item["explanation"]
        })
    return items


def generate_quiz(transcript: str, n_questions:int = QUIZ_QUESTIONS) -> List[Dict[str,Any]]:
    """Generate multiple-choice questions (4 options each) from a transcript.

    Strategy, in order:
      1. Ask the LLM for a JSON array and parse/validate it.
      2. If nothing valid comes back, retry once with a stricter JSON-only prompt.
      3. As a last resort, synthesize questions from extract_key_concepts().

    Args:
        transcript: Transcript text; only the first 20000 characters are sent.
        n_questions: Number of questions to request and the maximum returned.

    Returns:
        A list of dicts with "question", "options" (4 entries), "answer_index"
        (an int in 0..3) and "explanation". Empty if the transcript is empty,
        `n_questions` is not positive, or every strategy fails.
    """
    if not transcript or n_questions <= 0:
        return []

    excerpt = (transcript[:20000] + '...') if len(transcript) > 20000 else transcript

    prompt = [
        {"role":"system","content":"You are an instructor creating accurate multiple-choice quiz questions. Return ONLY valid JSON."},
        {"role":"user","content": (
            f"Create {n_questions} multiple-choice questions (4 options each) based strictly on the transcript below.\n"
            "Return EXACTLY a JSON array. Each element must be:\n"
            '{"question": "...", "options": ["A text","B text","C text","D text"], "answer_index": 0, "explanation": "..."}\n'
            "answer_index must match the correct option.\n\n"
            f"TRANSCRIPT:\n{excerpt}"
        )}
    ]
    out = call_llm_prompt(prompt)
    items = _valid_quiz_items(_load_quiz_json_array(out), n_questions)
    if items:
        return items

    # Retry once with a stricter JSON-only instruction
    retry_prompt = [
        {"role":"system","content":"You must return ONLY a pure JSON array and nothing else."},
        {"role":"user","content": (
            f"The previous output was not parseable. Now return EXACTLY a JSON array with {n_questions} objects of the form:\n"
            '{"question":"...","options":["A","B","C","D"],"answer_index":0,"explanation":"..."}\n\n'
            f"TRANSCRIPT:\n{excerpt}"
        )}
    ]
    out2 = call_llm_prompt(retry_prompt)
    print("RAW QUIZ OUTPUT (retry):\n", out2[:4000])
    # Same tolerant parsing as the first attempt (handles fenced or wrapped arrays too).
    items2 = _valid_quiz_items(_load_quiz_json_array(out2), n_questions)
    if items2:
        return items2

    # As a last resort, synthesize quiz from key concepts
    try:
        concepts = extract_key_concepts(transcript, n_concepts= max(3, n_questions))
        fallback = _synthesize_quiz_from_concepts(concepts, n_questions)
        if fallback:
            print("FALLBACK: generated quiz from key concepts.")
            return fallback
    except Exception as e:
        # Best-effort path: report the failure and fall through to the empty result.
        print("Fallback synthesis failed:", e)

    return []

def analyze_video(url: str, language:str="en", n_concepts:int=KEY_CONCEPTS_COUNT, n_quiz:int=QUIZ_QUESTIONS) -> Dict[str,Any]:
    """Run the full pipeline for one video: ID extraction, transcript, summary, concepts, quiz.

    Args:
        url: YouTube URL of the video.
        language: Transcript language code passed to fetch_transcript.
        n_concepts: Number of key concepts to extract.
        n_quiz: Number of quiz questions to generate.

    Returns:
        Dict with "video_id", "transcript", "summary", "key_concepts" and "quiz".

    Raises:
        ValueError: if no video ID can be extracted from `url`.
        RuntimeError: if the transcript cannot be fetched.
    """
    # Tool arguments are passed as a dict via invoke(): extra keyword arguments given to
    # `.run()` are not forwarded to the tool function, so `language` would be silently ignored.
    vid_id = extract_video_id.invoke({"url": url})
    # Both tools report failures as "Error: ...". Matching the colon as well avoids
    # misclassifying a valid ID or a transcript that merely begins with the word "Error".
    if vid_id.startswith("Error:"):
        raise ValueError(f"Could not extract video id: {vid_id}")
    transcript = fetch_transcript.invoke({"video_id": vid_id, "language": language})
    if transcript.startswith("Error:"):
        raise RuntimeError(f"Error fetching transcript: {transcript}")
    summary = summarize_transcript(transcript)
    concepts = extract_key_concepts(transcript, n_concepts)
    quiz = generate_quiz(transcript, n_quiz)
    return {
        "video_id": vid_id,
        "transcript": transcript,
        "summary": summary,
        "key_concepts": concepts,
        "quiz": quiz
    }

# Demo run: analyze one video and print the summary, key concepts and quiz.
if __name__ == "__main__":
    import sys

    # Treat an empty key like a missing one, matching the check in get_llm().
    if not os.environ.get("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY not set. In Colab do:\n"
              "from google.colab import userdata\n"
              "import os\n"
              "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n")
        sys.exit(1)

    def _option_label(index):
        # "A", "B", ... for valid positions; "?" instead of an IndexError (or a silently
        # wrapped negative index) when the position is not a usable option number.
        if isinstance(index, int) and 0 <= index < 26:
            return chr(ord("A") + index)
        return "?"

    test_url = "https://www.youtube.com/watch?v=x7X9w_GIm1s"
    print("Analyzing", test_url)
    result = analyze_video(test_url)
    print("\nVIDEO ID:", result["video_id"])
    print("\n--- SUMMARY ---\n", result["summary"])
    print("\n--- KEY CONCEPTS ---")
    for i,c in enumerate(result["key_concepts"],1):
        print(f"{i}. {c['concept']}: {c['explanation']}")
    print("\n--- QUIZ ---")
    for i,q in enumerate(result["quiz"],1):
        print(f"\nQ{i}: {q['question']}")
        for idx,opt in enumerate(q['options']):
            print(f"  {_option_label(idx)}. {opt}")
        answer_index = q['answer_index']
        # Only label the answer when it points at one of the printed options.
        in_range = isinstance(answer_index, int) and 0 <= answer_index < len(q['options'])
        print("Answer:", _option_label(answer_index) if in_range else "?")
        print("Explanation:", q["explanation"])

Analyzing https://www.youtube.com/watch?v=x7X9w_GIm1s

VIDEO ID: x7X9w_GIm1s

--- SUMMARY ---
 Python is a high-level, interpreted programming language created by Guido van Rossum in 1991, named after Monty Python's Flying Circus. Known for its readability and simplicity, Python is popular among both beginners and advanced developers. It is widely used in server-side applications, big data analysis, and machine learning. Python supports multiple programming paradigms, including functional and object-oriented programming, and boasts a vast ecosystem of third-party libraries. Its syntax, which uses indentation for code structure, promotes efficient coding practices. The Zen of Python emphasizes readability, contributing to its widespread adoption.

--- KEY CONCEPTS ---
1. **Python Programming Language**: Python is a high-level, interpreted programming language known for its readability and simplicity, making it popular for both beginners and experienced developers.
2. **Zen of Python**: 