In [131]:

from langchain_core.messages import HumanMessage
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi
import re      
            
            
# _-----------------------------------------------------FUNCTIONS FOR RAG----------------------------------------------

# ------------------ Transcript Loader ------------------
def load_transcript(url: str) -> str | None:
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)
    if match:
        video_id = match.group(1)
        try:
            captions = YouTubeTranscriptApi().fetch(video_id).snippets
            # join text + start_time
            data = [f"{item.text} ({item.start})" for item in captions]
            return " ".join(data)
        except Exception as e:
            print(f"Error fetching transcript: {e}")
            return None

# ------------------ Text Splitter ------------------
def text_splitter(transcript):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.create_documents([transcript])

# ------------------ Vector Store & Retriever  ------------------
def generate_embeddings(chunks):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return FAISS.from_documents(chunks, embeddings)

def retriever_docs(vector_store):
    return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

def format_docs(retrieved_docs):
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


In [132]:
# ------------------ Imports ------------------
from dotenv import load_dotenv
load_dotenv()

from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from pydantic import BaseModel, Field
from typing import TypedDict, Annotated
import re
import os
from langchain.prompts import PromptTemplate
#os.environ["LANGCHAIN_PROJECT"] = "TubeTalkAI Testing"

# ------------------ Build LLM (Gemini) ------------------
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

#--------------------Prompt Template----------------------
template = """
You are the YouTuber from the video, directly answering the viewer’s question.

Rules:
1. ONLY use the transcript provided below.
2. Give the answer in simple, clear sentences — without timestamps inside the text.
3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.
   - Do NOT round or estimate timestamps.
   - If multiple transcript parts are relevant, return the most direct one.
4. Do NOT add greetings, filler, or extra commentary.
5. If the transcript does not answer, say: "Sorry, I didn’t talk about that in this video."

Transcript:
{transcript}

Question:
{question}

Output format (for schema):
- "answer": A list of 1–3 short strings that directly answer the question (no timestamps here).
- "timestamps": The exact timestamp (in seconds) from the transcript where the answer was found.
"""




prompt = PromptTemplate(
    input_variables=["transcript", "question"],
    template=template,
)
# ------------------ Structured Output Schema ------------------
class AnsandTime(BaseModel):
    answer:str = Field(
        description="Answers to user's question (do NOT include timestamps here)"
    )
    timestamps: float = Field(
        description="The time (in seconds) from where the answer is taken"
    )

structured_model = model.with_structured_output(AnsandTime)

# ------------------ ChatState ------------------
class ChatState(TypedDict):
    messages: Annotated[list[BaseMessage], "add_messages"]

# ------------------ Chat Node ------------------
def chat_node(state: ChatState):
    # Extract user question from state
    user_message = state["messages"][-1].content  # last message is the user's input

    # Fill the prompt
    final_prompt = prompt.format(
        transcript=context,   # <-- your transcript goes here
        question=user_message
    )

    # Get structured output
    response = structured_model.invoke(final_prompt)
    ai_text = f"{response.answer}\nTimestamp: {response.timestamps}"

    return {
        "messages": [
            state["messages"][-1],  # include the HumanMessage again
            AIMessage(content=ai_text)  # add the AI reply
        ]
    }

# ------------------ Build Graph ------------------
checkpointer = InMemorySaver()

graph = StateGraph(ChatState)
graph.add_node("chat_node", chat_node)
graph.add_edge(START, "chat_node")
graph.add_edge("chat_node", END)

CONFIG = {'configurable': {'thread_id': "newthread"}}
workflow =graph.compile(checkpointer = checkpointer)

In [133]:
# ------------------ Load YouTube Transcript ------------------
youtube_input = "https://www.youtube.com/watch?v=s3KnSb9b4Pk"
youtube_captions = load_transcript(youtube_input)
print("Transcript Loaded:", youtube_captions[:200], "...")

# Split & Embed transcript
chunks = text_splitter(youtube_captions)
vector_store = generate_embeddings(chunks)
retriever = retriever_docs(vector_store)


Transcript Loaded: Hello all, my name is Krishna Nayak and (0.48) welcome to my YouTube channel. So guys, (2.56) today in this particular video, we are (4.96) going to go ahead and see the entire (7.12) road map to lear ...


In [134]:
output_dict = {"human": [], "ai": []}
CONFIG = {'configurable': {'thread_id': "newthread"}}



In [135]:
while True : 
    user_input = input("User : ")
    if user_input == 'exit':
        break
    print("user :", user_input)
    retrieved_chunks = retriever.get_relevant_documents(user_input)
    context = format_docs(retrieved_chunks)
    result = workflow.invoke(
            {'messages': [HumanMessage(content=user_input)]},
            config=CONFIG,
        )
    for msg in result['messages']:
        if isinstance(msg, HumanMessage):
            if msg.content not in output_dict['human']:
                output_dict['human'].append(msg.content)
        elif isinstance(msg, AIMessage):
            if msg.content not in output_dict['ai']:
                output_dict['ai'].append(msg.content)

    print("AI:", output_dict['ai'][-1])

user : what is this Video About?
AI: This video is about the entire road map to learn AI in 2025.
Timestamp: 7.12
user : when should I learn Modern AI
AI: Nowadays, I usually prefer the modern route.
Timestamp: 413.44
user : Should I learn modern AI After learning Traditional AI
AI: If you want to get into the coding industry, you should definitely follow the traditional route, then go to the modern route, and then to the advanced route.
Timestamp: 695.6


In [136]:
context

"traditional route wherein you're (379.199) mastering everything like data science, (381.44) machine learning, CV and LP. This will (382.96) definitely be your base and on top of (385.44) that you are adding generative AI and (388.16) agentic AI skills. Okay. Now when I'm (390.24) also talking about generative AI and (394.0) agent AI skills, I'm also talking about (395.28) developing end to-end projects, doing (397.199) the deployment in cloud using some (399.12) amazing LLM ops tools, each and (401.12) everything. Okay. Now coming to the (402.88) second part which is the modern route (405.039) right nowadays if many of the people ask (406.8) me kish (409.6) which path should I probably go ahead (411.52) and take in order to learn AI nowadays I (413.44) usually prefer them modern route. Now (416.319) what is modern route? Here I will say (418.0) them hey go ahead and learn (420.24) generative AI first. (424.0) Okay. So here you go ahead and learn (426.479) about generative AI first. In

In [137]:
user : what is this Video About
AI: This video is about AI and geometry, and how a non-AI model was already better at geometry than most humans.
Timestamp: 80.44

SyntaxError: invalid syntax (1422482209.py, line 1)

In [None]:
print(output_dict['ai'][-1])

This video is about AI and geometry, and how a non-AI model was already better at geometry than most humans.
Timestamp: 80.44


In [None]:
result['messages']

[HumanMessage(content='what is this Video About', additional_kwargs={}, response_metadata={}),
 AIMessage(content='This video is about AI and geometry, and how a non-AI model was already better at geometry than most humans.\nTimestamp: 80.44', additional_kwargs={}, response_metadata={})]