# Topic TimeStamps

In [87]:

from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.output_parsers import PydanticOutputParser

# -------------------------
# 1) Pydantic output schema
# -------------------------
# One subtopic entry: a short label, the time in seconds where it is discussed,
# and an optional importance tag (None when the model does not supply one).
# NOTE(review): documented with `#` comments on purpose — a class docstring would
# presumably be emitted by pydantic as the schema description and so change the
# format instructions sent to the model; confirm before adding one.
class Subtopic(BaseModel):
    subtopic: str = Field(description="Short name or description of the subtopic")
    timestamp: float = Field(description="Approx timestamp in seconds where this subtopic is discussed")
    importance: Optional[str] = Field(default=None, description="Optional importance: high/medium/low")

# One main topic: its name, the time in seconds where it starts, and the
# (required) list of Subtopic entries grouped under it.
class MainTopic(BaseModel):
    topic: str = Field(description="Main topic name or short description")
    timestamp: float = Field(description="Approx timestamp in seconds where the main topic starts")
    subtopics: List[Subtopic] = Field(description="List of subtopics under this main topic")

# Root object of the expected model output; this is the class handed to
# PydanticOutputParser below, so the model's JSON must match this shape.
class TopicsOutput(BaseModel):
    main_topics: List[MainTopic] = Field(description="List of main topics with subtopics and timestamps")

# Parser used to validate the model's raw text output against TopicsOutput
# (see the later `parser.parse(raw_output)` cells).
parser = PydanticOutputParser(pydantic_object=TopicsOutput)
# Text describing the expected output format; it is substituted for the
# {format_instructions} placeholder of the system prompt when the prompt is built.
format_instructions = parser.get_format_instructions()

# -------------------------
# 2) System message prompt
# -------------------------
# System prompt: states the extraction task and output rules.
# {format_instructions} is a template variable supplied when chat_prompt is formatted.
system_message = SystemMessagePromptTemplate.from_template(
    """You are an expert in analyzing and structuring video transcripts.

You will receive a transcript of a YouTube video with timestamps.

Your task is to:
1. Extract all MAIN TOPICS discussed in the transcript.
2. For each MAIN TOPIC, list its SUBTOPICS in a hierarchical structure.
3. Always include timestamp references (in seconds) for both MAIN TOPICS and SUBTOPICS.
4. For each subtopic, optionally add an 'importance' (high/medium/low) if it is clearly emphasized.
5. Be concise and only include material that is actually discussed in the transcript.
6. Output must be valid JSON and match the schema instructions below.

REQUIRED OUTPUT FORMAT:
{format_instructions}

Transcript (will be supplied by the user below).
"""
)

# ---------------------------------------
# 3) Human prompt (we supply the transcript)
# ---------------------------------------
# Human prompt: carries the transcript itself through the {transcript} template
# variable, followed by a few reminders about timestamps and not inventing content.
human_message = HumanMessagePromptTemplate.from_template(
    """Transcript:
{transcript}

Notes:
- Use timestamps in seconds (floats allowed).
- Only include main topics and subtopics actually present in the transcript.
- If something is unclear, omit it rather than inventing timestamps.

Now extract main topics and subtopics."""
)

# Two-message chat prompt: the system message first, then the human message.
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# -------------------------
# 4) Gemini chat model config
# -------------------------
# NOTE(review): ChatGoogleGenerativeAI is imported from langchain_google_genai, not from a
# Vertex AI wrapper, so the earlier advice about GOOGLE_APPLICATION_CREDENTIALS looks stale.
# This class presumably authenticates with a Gemini API key (GOOGLE_API_KEY environment
# variable or the google_api_key argument) — confirm against the installed package version.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
# -------------------------
# 5) Runner function
# -------------------------
def extract_topics_from_transcript(transcript: str) -> tuple[str, object]:
    """Ask the chat model for the topic/subtopic outline of a transcript.

    Args:
        transcript: Transcript text. The prompts tell the model to expect
            timestamps (in seconds) embedded in it.

    Returns:
        A ``(raw_output, response_message)`` pair: the ``content`` of the
        model's reply (expected to be JSON following ``format_instructions``)
        and the full message object returned by the model. The reply is NOT
        validated here; pass ``raw_output`` to ``parser.parse`` to obtain a
        ``TopicsOutput``.
    """
    # Fill both template variables: {transcript} in the human message and
    # {format_instructions} in the system message.
    prompt = chat_prompt.format_prompt(transcript=transcript, format_instructions=format_instructions)
    messages = prompt.to_messages()  # system message followed by human message

    # invoke() is the supported chat-model entry point; the previously used
    # predict_messages() is deprecated in current LangChain releases.
    response_message = model.invoke(messages)

    raw_output = response_message.content
    return raw_output, response_message


In [88]:
from youtube_transcript_api import YouTubeTranscriptApi
import re
def load_transcript(url: str) -> str | None:
    """
    Fetch transcript for a YouTube video.
    """
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)
    if match:
        video_id = match.group(1)
        try:
            captions = YouTubeTranscriptApi().fetch(video_id,languages=['en','hi']).snippets
            data = [f"{item.text} ({item.start})" for item in captions]
            return " ".join(data)
        except Exception as e:
            print(f"❌ Error fetching transcript: {e}")
            return None
from dataclasses import dataclass
@dataclass
class TimestampedSegment:
    """Represents a segment of transcript with timestamp"""
    text: str
    start_time: float
    # Start time of the following timestamp marker; None for the last marker.
    end_time: Optional[float] = None


def parse_transcript(transcript: str) -> List[TimestampedSegment]:
    """Split a ``"<text> (<seconds>) <text> (<seconds>) ..."`` string into segments.

    Args:
        transcript: Text in which every caption is followed by its start time
            in parentheses, e.g. ``"hello (0.5) world (2.0)"``.

    Returns:
        One ``TimestampedSegment`` per marker whose preceding text is
        non-empty after stripping whitespace. ``end_time`` is the value of the
        next marker in the transcript (even if that marker's own text is
        empty), or ``None`` for the final marker.
    """
    # Lazily capture the text up to the next "(number)" marker. DOTALL lets
    # "." cross line breaks so that multi-line caption text is kept whole
    # instead of being cut down to its last line.
    pattern = r'(.*?)\((\d+\.?\d*)\)'
    matches = re.findall(pattern, transcript, flags=re.DOTALL)

    segments = []
    for i, (text, timestamp) in enumerate(matches):
        text = text.strip()
        if not text:
            # Marker with no caption text in front of it: nothing to record.
            continue
        has_next = i + 1 < len(matches)
        segments.append(
            TimestampedSegment(
                text=text,
                start_time=float(timestamp),
                end_time=float(matches[i + 1][1]) if has_next else None,
            )
        )

    return segments

# Fetch the captions for the demo video; load_transcript returns None on failure.
captions = load_transcript("https://www.youtube.com/watch?v=nQa31xdXbGk")
if captions is None:
    # Stop here with a clear message instead of failing later with a
    # TypeError when None is handed to the regex in parse_transcript.
    raise RuntimeError("Could not load a transcript for the requested video.")

segments = parse_transcript(captions)

# Re-render every segment as "[<start>s] <text>" so the model sees the
# timestamp in front of the text it belongs to.
formatted = []
for segment in segments:
    formatted.append(f"[{segment.start_time}s] {segment.text}")

raw_output, response_message = extract_topics_from_transcript(" ".join(formatted))
    


In [89]:
# Validate the model's raw text against TopicsOutput and display the parsed object.
parser.parse(raw_output)

TopicsOutput(main_topics=[MainTopic(topic='Introduction to MCP Architecture', timestamp=76.4, subtopics=[Subtopic(subtopic='Simplified Architecture: Host and Server', timestamp=119.759, importance='high'), Subtopic(subtopic='Defining Host (AI Chatbot)', timestamp=154.239, importance=None), Subtopic(subtopic='Defining Server (Tool for specific tasks)', timestamp=210.879, importance=None), Subtopic(subtopic='Communication Flow Example (User to GitHub Server)', timestamp=254.08, importance=None)]), MainTopic(topic='Refining MCP Architecture: Introducing the Client', timestamp=349.919, subtopics=[Subtopic(subtopic='Host does not communicate directly with Server', timestamp=374.4, importance=None), Subtopic(subtopic='MCP Client as a Helper', timestamp=387.52, importance=None), Subtopic(subtopic="Client's role in communication", timestamp=418.479, importance=None), Subtopic(subtopic='One-on-one Client-Server Relationship', timestamp=530.959, importance=None), Subtopic(subtopic='Analogy: Phon

In [90]:
# Nicely formatted display of main topics and subtopics
for i, topics in enumerate(parser.parse(raw_output).main_topics, 1):
    print(f"\n🎯 Main Topic {i}: {topics.topic}  ⏰ {topics.timestamp}")
    print("----------------------------------------------------")

    for j, sub in enumerate(topics.subtopics, 1):
        # importance is optional; leave it off instead of printing the literal "None".
        importance = f" {sub.importance}" if sub.importance else ""
        print(f"   🔹 Subtopic {i}.{j}: {sub.subtopic}  ⏰ {sub.timestamp}{importance}")

    print("====================================================")



🎯 Main Topic 1: Introduction to MCP Architecture  ⏰ 76.4
----------------------------------------------------
   🔹 Subtopic 1.1: Simplified Architecture: Host and Server  ⏰ 119.759 high
   🔹 Subtopic 1.2: Defining Host (AI Chatbot)  ⏰ 154.239 None
   🔹 Subtopic 1.3: Defining Server (Tool for specific tasks)  ⏰ 210.879 None
   🔹 Subtopic 1.4: Communication Flow Example (User to GitHub Server)  ⏰ 254.08 None

🎯 Main Topic 2: Refining MCP Architecture: Introducing the Client  ⏰ 349.919
----------------------------------------------------
   🔹 Subtopic 2.1: Host does not communicate directly with Server  ⏰ 374.4 None
   🔹 Subtopic 2.2: MCP Client as a Helper  ⏰ 387.52 None
   🔹 Subtopic 2.3: Client's role in communication  ⏰ 418.479 None
   🔹 Subtopic 2.4: One-on-one Client-Server Relationship  ⏰ 530.959 None
   🔹 Subtopic 2.5: Analogy: Phone, SIM, Network  ⏰ 610.72 None
   🔹 Subtopic 2.6: Benefits of Client-Server Architecture (Decoupling, Scalability)  ⏰ 733.279 high

🎯 Main Topic 3: Re

Beyond hardwired preferences for sugar and fat--842.8
Three channels for food preference (yum, yuck, meh analysis)--864.399
Channel 1: Taste on the mouth--896.48
Sensations (palatability, consistency) and chemical sensors (bitter, sweet, umami, salty, sour)--901.6
Neural pathway: gustatory nerve to brain stem (nucleus of solitary tract) to insular cortex--980.48
Insular cortex and interoception (perception of internal body state)--999.44
Taste perception is a central brain phenomenon, not just mouth sensation--1031.039
Taste preference can be uncoupled from brain reward systems--1048.4
Channel 2: Subconscious gut signaling--1109.12
Neurons throughout the digestive tract sense mechanical and chemical properties of food--1122.799
Neuropod cells sense amino acids, sugars, and fatty acids--1175.36
Gut signals travel via noo's ganglia to the brain, triggering dopamine release--1197.36
Dopamine inspires motivation, reward, and seeking for foods--1211.36
Channel 3: Learned association and bel

In [None]:
# Echo `topics`: the loop variable of the display loop above, which is left bound
# to the last MainTopic once that loop has finished.
topics