# Topic Timestamps

In [4]:

from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.prompts import (
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.output_parsers import PydanticOutputParser

# -------------------------
# 1) Pydantic output schema
# -------------------------
# One subtopic entry, nested under a MainTopic in the structured LLM output.
# Documented with comments rather than a class docstring: pydantic includes a
# model's docstring in its generated JSON schema, and that schema is what
# parser.get_format_instructions() turns into prompt text.
class Subtopic(BaseModel):
    subtopic: str = Field(description="Short name or description of the subtopic")
    content: str = Field(description="Brief summary of the subtopic")
    # Float so fractional-second timestamps from the transcript are kept.
    timestamp: float = Field(description="Approx timestamp in seconds where this subtopic is discussed")
    # Not required: falls back to None when absent. The type is a free-form
    # string, so high/medium/low is only suggested by the description and is
    # not validated.
    importance: Optional[str] = Field(default=None, description="Optional importance: high/medium/low")

# One top-level topic of the video together with the subtopics grouped under it.
# Kept as comments (no class docstring) so the JSON schema generated from this
# model is unchanged.
class MainTopic(BaseModel):
    topic: str = Field(description="Main topic name or short description")
    content : str = Field(description="Brief summary of the main topic")
    # Float so fractional-second timestamps from the transcript are kept.
    timestamp: float = Field(description="Approx timestamp in seconds where the main topic starts")
    # Required field (no default): an empty list must be given explicitly.
    subtopics: List[Subtopic] = Field(description="List of subtopics under this main topic")

# Root object of the structured output; this is the model handed to
# PydanticOutputParser and restored from the database JSON later in the file.
# Kept as comments (no class docstring) so the generated JSON schema is unchanged.
class TopicsOutput(BaseModel):
    main_topics: List[MainTopic] = Field(description="List of main topics with subtopics and timestamps")

# Parser that turns the model's JSON reply into a validated TopicsOutput.
parser = PydanticOutputParser(pydantic_object=TopicsOutput)
# Schema instructions text; passed as the `format_instructions` variable when
# the chat prompt is formatted in extract_topics_from_transcript.
format_instructions = parser.get_format_instructions()

# -------------------------
# 2) System message prompt
# -------------------------
# System prompt: states the extraction task and the output rules. Its only
# template variable is {format_instructions}, which is filled in when the
# prompt is formatted; the transcript itself arrives in the human message.
system_message = SystemMessagePromptTemplate.from_template(
    """You are an expert in analyzing and structuring video transcripts.

You will receive a transcript of a YouTube video with timestamps.

Your task is to:
1. Extract all MAIN TOPICS discussed in the transcript.
2. For each MAIN TOPIC, list its SUBTOPICS in a hierarchical structure.
3. Always include timestamp references (in seconds) for both MAIN TOPICS and SUBTOPICS.
4. For each subtopic, optionally add an 'importance' (high/medium/low) if it is clearly emphasized.
5. Be concise and only include material that is actually discussed in the transcript.
6. Output must be valid JSON and match the schema instructions below.

REQUIRED OUTPUT FORMAT:
{format_instructions}

Transcript (will be supplied by the user below).
"""
)

# ---------------------------------------
# 3) Human prompt (we supply the transcript)
# ---------------------------------------
# User-turn template: carries the transcript through the {transcript} variable,
# followed by reminder notes about timestamps and not inventing content.
human_message = HumanMessagePromptTemplate.from_template(
    """Transcript:
{transcript}

Notes:
- Use timestamps in seconds (floats allowed).
- Only include main topics and subtopics actually present in the transcript.
- If something is unclear, omit it rather than inventing timestamps.

Now extract main topics and subtopics."""
)

# Full chat prompt: the system message first, then the human message.
# Formatting it needs both `transcript` and `format_instructions`.
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

# -------------------------
# 4) Gemini chat model config
# -------------------------
# ChatGoogleGenerativeAI is imported from langchain_google_genai; it is not a
# langchain VertexAI wrapper.
# NOTE(review): this client is normally authenticated with an API key
# (GOOGLE_API_KEY env var or the google_api_key argument) rather than
# GOOGLE_APPLICATION_CREDENTIALS — confirm against the installed package version.
# temperature=0 is passed to keep sampling randomness to a minimum.
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
# -------------------------
# 5) Runner function
# -------------------------


def _message_text(content) -> str:
    """Flatten a chat message ``content`` value into a single string.

    ``content`` may be a plain string or a list of parts. String parts are
    used as they are; dict parts contribute the value of their ``"text"`` key
    when it is a string. Any other part is skipped.
    """
    if isinstance(content, str):
        return content
    pieces = []
    for part in content:
        if isinstance(part, str):
            pieces.append(part)
        elif isinstance(part, dict) and isinstance(part.get("text"), str):
            pieces.append(part["text"])
    # Join without a separator: inserting spaces between parts could corrupt a
    # JSON document that was split in the middle of a string or number.
    return "".join(pieces)


def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present.

    Handles an optional language tag on the opening fence line, JSON that
    starts on the same line as the opening fence, and extra text after the
    closing fence (which is discarded).
    """
    text = text.strip()
    if not text.startswith("```"):
        return text

    body = text.lstrip("`")
    closing = body.rfind("```")
    if closing != -1:
        # Keep only what lies before the last closing fence.
        body = body[:closing]
    body = body.rstrip("`")

    # Drop the opening fence line only when it is a language tag such as
    # "json" (or empty), not when the JSON itself starts on that line.
    first_line, newline, remainder = body.partition("\n")
    if newline and not first_line.lstrip().startswith(("{", "[")):
        body = remainder
    return body.strip()


def extract_topics_from_transcript(transcript: str) -> TopicsOutput:
    """Ask the chat model for the topics in *transcript* and parse the reply.

    Args:
        transcript: Transcript text (with timestamps) inserted into the
            human message of ``chat_prompt``.

    Returns:
        The ``TopicsOutput`` produced by ``parser.parse`` from the model reply.

    Raises:
        Whatever ``model.invoke`` or ``parser.parse`` raise; in particular the
        parser is expected to raise when the reply does not match the schema.
    """
    prompt = chat_prompt.format_prompt(
        transcript=transcript, format_instructions=format_instructions
    )
    response_message = model.invoke(prompt.to_messages())

    # The reply content can be a string or a list of parts (strings/dicts).
    raw_output = _message_text(response_message.content)

    # Remove markdown fences like ```json ... ``` before parsing.
    return parser.parse(_strip_code_fence(raw_output))


In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
import re
def load_transcript(url: str) -> str | None:
    """
    Fetch transcript for a YouTube video.
    """
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)
    if match:
        video_id = match.group(1)
        try:
            captions = YouTubeTranscriptApi().fetch(video_id,languages=['en','hi']).snippets
            data = [f"{item.text} ({item.start})" for item in captions]
            return " ".join(data)
        except Exception as e:
            print(f"❌ Error fetching transcript: {e}")
            return None
from dataclasses import dataclass
@dataclass
class TimestampedSegment:
    """A piece of transcript text together with the time range it covers.

    Attributes:
        text: Segment text with surrounding whitespace removed.
        start_time: Timestamp attached to this text (presumably seconds, as
            emitted by the transcript source).
        end_time: Timestamp of the next entry in the transcript, or ``None``
            for the last entry.
    """
    text: str
    start_time: float
    end_time: Optional[float] = None


# Matches "<text> (<number>)" pairs. DOTALL lets the text group span line
# breaks; without it, caption text before an embedded newline is dropped.
_SEGMENT_PATTERN = re.compile(r'(.*?)\((\d+\.?\d*)\)', re.DOTALL)


def parse_transcript(transcript: str) -> List[TimestampedSegment]:
    """Split a ``"text (timestamp) text (timestamp) ..."`` string into segments.

    Args:
        transcript: Text in which every piece is followed by its numeric
            timestamp in parentheses, e.g. ``"hello (0.0) world (1.5)"``.

    Returns:
        One ``TimestampedSegment`` per non-empty piece of text, in order.
        ``end_time`` is the timestamp of the following match (even if that
        match has empty text) and ``None`` for the last match.

    Note:
        Any parenthesised number is treated as a timestamp, so caption text
        such as ``"(2019)"`` is parsed as a timestamp too.
    """
    matches = _SEGMENT_PATTERN.findall(transcript)

    segments = []
    for index, (text, timestamp) in enumerate(matches):
        text = text.strip()
        if not text:
            # A timestamp with no text before it produces no segment.
            continue
        has_next = index + 1 < len(matches)
        segments.append(
            TimestampedSegment(
                text=text,
                start_time=float(timestamp),
                end_time=float(matches[index + 1][1]) if has_next else None,
            )
        )
    return segments




In [27]:


# Fetch the captions of the demo video and run topic extraction on them.
captions = load_transcript("https://youtu.be/sBHeMcxupmE")
if captions is None:
    # load_transcript returns None when the URL has no video id or the fetch
    # fails; stop here with a clear message instead of a TypeError from
    # parse_transcript.
    raise RuntimeError("Transcript could not be loaded; cannot extract topics.")

segments = parse_transcript(captions)
# Re-render each segment as "[<start>s] <text>" for the prompt.
formatted = [f"[{segment.start_time}s] {segment.text}" for segment in segments]

output = extract_topics_from_transcript(" ".join(formatted))

In [52]:
# Nicely formatted display of main topics and subtopics
# Pretty-print every main topic followed by its numbered subtopics.
# The `content` summaries are intentionally not printed here.
thin_rule = "----------------------------------------------------"
thick_rule = "===================================================="

for topic_no, main_topic in enumerate(output.main_topics, start=1):
    print(f"\n🎯 Main Topic {topic_no}: {main_topic.topic}  ⏰ {main_topic.timestamp}")
    print(thin_rule)

    for sub_no, subtopic in enumerate(main_topic.subtopics, start=1):
        print(f"   🔹 Subtopic {topic_no}.{sub_no}: {subtopic.subtopic}  ⏰ {subtopic.timestamp} {subtopic.importance}")

    print(thick_rule)



🎯 Main Topic 1: Introduction to MCP Life Cycle  ⏰ 21.52
----------------------------------------------------
   🔹 Subtopic 1.1: Definition of MCP Life Cycle  ⏰ 82.4 None
   🔹 Subtopic 1.2: Definition of Session  ⏰ 103.119 None
   🔹 Subtopic 1.3: Three Stages of MCP Life Cycle  ⏰ 189.28 None

🎯 Main Topic 2: Stage 1: Initialization Phase  ⏰ 245.04
----------------------------------------------------
   🔹 Subtopic 2.1: First Interaction and Key Activities  ⏰ 254.48 high
   🔹 Subtopic 2.2: Step 1: Client Sends Initialize Request  ⏰ 352.479 None
   🔹 Subtopic 2.3: Step 2: Server Responds  ⏰ 444.4 None
   🔹 Subtopic 2.4: Step 3: Client Sends Initialized Notification  ⏰ 492.24 None
   🔹 Subtopic 2.5: Important Rules During Initialization  ⏰ 540.64 high
   🔹 Subtopic 2.6: Practical Demonstration of Initialization  ⏰ 647.04 None

🎯 Main Topic 3: Version and Capability Negotiation  ⏰ 908.88
----------------------------------------------------
   🔹 Subtopic 3.1: Version Negotiation  ⏰ 913.839 h

# Database Integration

In [2]:
import sqlite3
import json
from datetime import datetime

# Open (or create) the SQLite file; check_same_thread=False is passed so the
# connection object is not restricted to the thread that created it.
conn = sqlite3.connect("ragDatabase.db", check_same_thread=False)
cursor = conn.cursor()

# Schema: one table for transcript extractions per thread, one for summaries.
_CREATE_TABLE_STATEMENTS = (
    """
CREATE TABLE IF NOT EXISTS transcript_topics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    thread_id TEXT,
    transcript TEXT,
    output_json TEXT,
    created_at TIMESTAMP
)
""",
    """
CREATE TABLE IF NOT EXISTS summary (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    thread_id TEXT,
    summary TEXT,
    created_at TIMESTAMP
)
""",
)

for _ddl in _CREATE_TABLE_STATEMENTS:
    cursor.execute(_ddl)

# Single commit after both tables have been created.
conn.commit()


In [None]:
def extract_topics_with_checkpoint(transcript: str, thread_id: str) -> TopicsOutput:
    """Extract topics from *transcript* and store the result under *thread_id*.

    Args:
        transcript: Transcript in the ``"text (timestamp) ..."`` form accepted
            by ``parse_transcript``. The original string is what gets stored.
        thread_id: Identifier saved with the row so the result can be looked
            up again later.

    Returns:
        The ``TopicsOutput`` returned by ``extract_topics_from_transcript``.

    Raises:
        Whatever the extraction step or the database insert raise; a failed
        insert is rolled back.
    """
    # 1. Re-render the segments as "[<start>s] <text>" and extract topics.
    segments = parse_transcript(transcript)
    formatted = " ".join(
        f"[{segment.start_time}s] {segment.text}" for segment in segments
    )
    output = extract_topics_from_transcript(formatted)

    # 2. Save to database with thread_id.
    # Store the timestamp as explicit text. sqlite3's implicit datetime
    # adapter is deprecated since Python 3.12; isoformat(sep=" ") yields the
    # same "YYYY-MM-DD HH:MM:SS.ffffff" text that adapter wrote.
    created_at = datetime.now().isoformat(sep=" ")
    # `with conn` commits on success and rolls back if the insert fails.
    with conn:
        cursor.execute(
            "INSERT INTO transcript_topics (thread_id, transcript,output_json, created_at) VALUES (?, ?, ?, ?)",
            (thread_id, transcript, output.model_dump_json(), created_at),
        )
    return output


In [5]:
# To create a stored row first, run the call below.
# NOTE(review): it uses thread_id "test-thread-123", which differs from the
# "test-123" looked up here — confirm which one is intended.
#extract_topics_with_checkpoint(transcript=captions,thread_id="test-thread-123")
thread_id = "test-123"
cursor.execute("SELECT output_json FROM transcript_topics WHERE thread_id=?", (thread_id,))
row = cursor.fetchone()

# Reset first so a value left over from an earlier run is never mistaken for
# the result of this lookup.
topics_output = None
if row is not None:
    output_json = row[0]  # JSON string from DB
    topics_output = TopicsOutput.model_validate_json(output_json)  # restore Pydantic object
    print(topics_output)
else:
    print(f"No stored topics found for thread_id={thread_id!r}")


main_topics=[MainTopic(topic='Introduction to MCP Life Cycle', content="This section introduces the MCP Life Cycle, explaining its purpose as the 'rule book' for how the MCP architecture (host, client, server) works together during a session, step-by-step. It also briefly outlines the three main stages: Initialization, Normal Operation, and Shutdown.", timestamp=21.52, subtopics=[Subtopic(subtopic='Definition of MCP Life Cycle', content='The MCP Life Cycle describes the complete sequence of steps that govern how a host and a server establish, use, and end a connection during a session.', timestamp=82.4, importance=None), Subtopic(subtopic='Definition of Session', content='A session is defined as one continuous connection between the client and the server, illustrated with an example of Cloud Desktop connecting to a GitHub server.', timestamp=103.119, importance=None), Subtopic(subtopic='Three Stages of MCP Life Cycle', content='The MCP Life Cycle consists of three main stages: Initiali

In [6]:
# Nicely formatted display of main topics and subtopics
# Print the topics restored from the database as a numbered outline.
# The `content` summaries are intentionally not printed here.
section_rule = "----------------------------------------------------"
closing_rule = "===================================================="

for topic_no, main_topic in enumerate(topics_output.main_topics, start=1):
    print(f"\n{topic_no}: {main_topic.topic}  ⏰ {main_topic.timestamp}")
    print(section_rule)

    for sub_no, subtopic in enumerate(main_topic.subtopics, start=1):
        print(f" {topic_no}.{sub_no}: {subtopic.subtopic}  ⏰ {subtopic.timestamp} {subtopic.importance}")

    print(closing_rule)



1: Introduction to MCP Life Cycle  ⏰ 21.52
----------------------------------------------------
 1.1: Definition of MCP Life Cycle  ⏰ 82.4 None
 1.2: Definition of Session  ⏰ 103.119 None
 1.3: Three Stages of MCP Life Cycle  ⏰ 189.28 None

2: Stage 1: Initialization Phase  ⏰ 245.04
----------------------------------------------------
 2.1: First Interaction and Key Activities  ⏰ 254.48 high
 2.2: Step 1: Client Sends Initialize Request  ⏰ 352.479 None
 2.3: Step 2: Server Responds  ⏰ 444.4 None
 2.4: Step 3: Client Sends Initialized Notification  ⏰ 492.24 None
 2.5: Important Rules During Initialization  ⏰ 540.64 high
 2.6: Practical Demonstration of Initialization  ⏰ 647.04 None

3: Version and Capability Negotiation  ⏰ 908.88
----------------------------------------------------
 3.1: Version Negotiation  ⏰ 913.839 high
 3.2: Capability Negotiation Overview  ⏰ 1019.199 high
 3.3: Client Capabilities  ⏰ 1056.16 None
 3.4: Server Capabilities  ⏰ 1254.559 None
 3.5: Sub-capabilities  ⏰