In [34]:

from langchain_core.messages import HumanMessage
import streamlit as st
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi
import re      
            
            
# _-----------------------------------------------------FUNCTIONS FOR RAG----------------------------------------------

# ------------------ Transcript Loader ------------------
def load_transcript(url: str) -> str | None:
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)
    if match:
        video_id = match.group(1)
        try:
            captions = YouTubeTranscriptApi().fetch(video_id).snippets
            # join text + start_time
            data = [f"{item.text} ({item.start})" for item in captions]
            return " ".join(data)
        except Exception as e:
            print(f"Error fetching transcript: {e}")
            return None

# ------------------ Text Splitter ------------------
def text_splitter(transcript):
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return splitter.create_documents([transcript])

# ------------------ Vector Store & Retriever  ------------------
def generate_embeddings(chunks):
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return FAISS.from_documents(chunks, embeddings)

def retriever_docs(vector_store):
    return vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

def format_docs(retrieved_docs):
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


In [39]:
# ------------------ Imports ------------------
from dotenv import load_dotenv
load_dotenv()

from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from pydantic import BaseModel, Field
from typing import TypedDict, Annotated
import re
import os
from langchain.prompts import PromptTemplate
#os.environ["LANGCHAIN_PROJECT"] = "TubeTalkAI Testing"

# ------------------ Build LLM (Gemini) ------------------
model = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

#--------------------Prompt Template----------------------
template = """
You are a helpful assistant.
Answer ONLY from the provided transcript context and also mention the time given in brackets.
If the context is insufficient, just say you don't know.

Context:
{transcript}

Question: {question}
"""

prompt = PromptTemplate(
    input_variables=["transcript", "question"],
    template=template,
)
# ------------------ Structured Output Schema ------------------
class AnsandTime(BaseModel):
    answer:str = Field(
        description="Answers to user's question (do NOT include timestamps here)"
    )
    timestamps: float = Field(
        description="The time (in seconds) from where the answer is taken"
    )

structured_model = model.with_structured_output(AnsandTime)

# ------------------ ChatState ------------------
class ChatState(TypedDict):
    messages: Annotated[list[BaseMessage], "add_messages"]

# ------------------ Chat Node ------------------
def chat_node(state: ChatState):
    # Extract user question from state
    user_message = state["messages"][-1].content  # last message is the user's input

    # Fill the prompt
    final_prompt = prompt.format(
        transcript=youtube_captions,   # <-- your transcript goes here
        question=user_message
    )

    # Get structured output
    response = structured_model.invoke(final_prompt)
    ai_text = f"Answer: {response.answer}\nTimestamp: {response.timestamps}"

    return {
        "messages": [
            state["messages"][-1],  # include the HumanMessage again
            AIMessage(content=ai_text)  # add the AI reply
        ]
    }

# ------------------ Build Graph ------------------
checkpointer = InMemorySaver()

graph = StateGraph(ChatState)
graph.add_node("chat_node", chat_node)
graph.add_edge(START, "chat_node")
graph.add_edge("chat_node", END)

CONFIG = {'configurable': {'thread_id': "newthread"}}
workflow =graph.compile(checkpointer = checkpointer)
output_dict = {"human": [], "ai": []}


In [40]:
# ------------------ Load YouTube Transcript ------------------
youtube_input = "https://www.youtube.com/watch?v=s3KnSb9b4Pk"
youtube_captions = load_transcript(youtube_input)
print("Transcript Loaded:", youtube_captions[:200], "...")

# Split & Embed transcript
chunks = text_splitter(youtube_captions)
vector_store = generate_embeddings(chunks)
retriever = retriever_docs(vector_store)


Transcript Loaded: Hello all, my name is Krishna Nayak and (0.48) welcome to my YouTube channel. So guys, (2.56) today in this particular video, we are (4.96) going to go ahead and see the entire (7.12) road map to lear ...


In [46]:
user_input = "When to learn Modern AU"
CONFIG = {'configurable': {'thread_id': "newthread"}}
retrieved_chunks = retriever.get_relevant_documents(user_input)
context = format_docs(retrieved_chunks)
print("user :", user_input)


result = workflow.invoke(
        {'messages': [HumanMessage(content=user_input)]},
        config=CONFIG,
    )
for msg in result['messages']:
    if isinstance(msg, HumanMessage):
        if msg.content not in output_dict['human']:
            output_dict['human'].append(msg.content)
    elif isinstance(msg, AIMessage):
        if msg.content not in output_dict['ai']:
            output_dict['ai'].append(msg.content)

print("AI:", output_dict['ai'][-1])

user : When to learn Modern AU
AI: Answer: The modern route to learning AI involves mastering generative AI first, focusing on LLMs and building generative AI applications.  After that, add agentic AI and MCP. Finally, learn DS fundamentals. This route is suitable for freshers and experienced professionals, even those with non-technical backgrounds. (405.039-558.959)
Timestamp: 405.039


In [47]:
output_dict

{'human': ['what is this video about', 'When to learn Modern AU'],
 'ai': ['Answer: This video is about the roadmap to learn AI in 2025. It includes three different routes: traditional, modern, and advanced.  The speaker discusses what skills are needed for each route and provides free resources and videos.\nTimestamp: 0.48',
  'Answer: This video is about the roadmap to learn AI in 2025. It includes three different routes: traditional, modern, and advanced.  The speaker discusses what each route entails and provides free resources and videos.\nTimestamp: 0.48',
  'Answer: The modern route to learning AI involves mastering generative AI first, focusing on LLMs and building generative AI applications.  After that, add agentic AI and MCP. Finally, learn DS fundamentals. This route is suitable for freshers and experienced professionals, even those with non-technical backgrounds. (405.039-558.959)\nTimestamp: 405.039']}