# Testing 1

In [None]:
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from pydantic import BaseModel, Field
from typing import TypedDict, Annotated
import re
import os
from youtube_transcript_api import YouTubeTranscriptApi
# ------------------ Transcript Loader ------------------
def load_transcript(url: str) -> str | None:
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)
    if match:
        video_id = match.group(1)
        try:
            captions = YouTubeTranscriptApi().fetch(video_id).snippets
            # join text + start_time
            data = [f"{item.text} ({item.start})" for item in captions]
            return " ".join(data)
        except Exception as e:
            print(f"Error fetching transcript: {e}")
            return None

from langchain.prompts import PromptTemplate
os.environ["LANGCHAIN_PROJECT"] = "TubeTalkAI Testing"

# ------------------ Build LLM (Gemini) ------------------
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

# ------------------ Structured Output Schema ------------------
class AnsandTime(BaseModel):
    answer: list[str] = Field(
        description="Answers to user's question (do NOT include timestamps here)"
    )
    timestamps: float = Field(
        description="The time (in seconds) from where the answer is taken"
    )

structured_model = model.with_structured_output(AnsandTime)
youtube_input = "https://www.youtube.com/watch?v=s3KnSb9b4Pk"
youtube_captions = load_transcript(youtube_input)
print("Transcript Loaded")
# ------------------ ChatState ------------------
class ChatState(TypedDict):
    messages: Annotated[list[BaseMessage], "add_messages"]
   
# ------------------ Chat Node ------------------
def chat_node(state: ChatState):
   
    user_message = state["messages"][-1].content  
    response = structured_model.invoke(user_message)
    ai_text = f"{response.answer}\nTimestamp: {response.timestamps}"

    return {
        "messages": [
            state["messages"][-1],
            AIMessage(content=ai_text)
        ]
    }

# ------------------ Build Graph ------------------
checkpointer = InMemorySaver()

graph = StateGraph(ChatState)
graph.add_node("chat_node", chat_node)
graph.add_edge(START, "chat_node")
graph.add_edge("chat_node", END)

CONFIG = {'configurable': {'thread_id': "newthread"}}
workflow =graph.compile(checkpointer = checkpointer)



In [2]:
# ------------------ Chat Loop ------------------
output_dict = {
    "human": [],
    "ai": []
}
i = 0
user_inputs = ["what is this Video about" , "when to Modern AI" , "exit"]
# ---------- Chat Loop ----------
while True:
    
    user_input = user_inputs[i]
    i+=1
    print("user:" , user_input)
    if user_input.lower() in ["exit", "quit"]:
        print("Exiting chat.")
        break
    
    # First message -> prepend instructions + transcript
    user_input = f"""
    You are the YouTuber from the video, directly answering the viewer’s question.
    Rules:
    1. ONLY use the transcript provided below
    2. Give the answer in simple, clear sentences — without timestamps inside the text.
    3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.
        - Do NOT round or estimate timestamps.
        - If multiple transcript parts are relevant, return the most direct one.
    4. Do NOT add greetings, filler, or extra commentary.
    5. If the transcript does not answer, say: "Sorry, I didn’t talk about that in this video."

    Transcript:
    {youtube_captions}

    Question:
    {user_input}

    Output format (for schema):
    - "answer": A list of 1–3 short strings that directly answer the question (no timestamps here).
    - "timestamps": The exact timestamp (in seconds) from the transcript where the answer was found.
    """

    result = workflow.invoke(
        {'messages': [HumanMessage(content=user_input)]},
        config=CONFIG,
    )

    # ---------- Save messages ----------
    messages_list = result['messages']

    for msg in messages_list:
        if isinstance(msg, HumanMessage):
            output_dict['human'].append(msg.content) if msg.content not in output_dict['human'] else None
        elif isinstance(msg, AIMessage):
            output_dict['ai'].append(msg.content) if msg.content not in output_dict['ai'] else None 
    print("AI :" ,output_dict['ai'][-1])

user: what is this Video about
AI : ['This video is about the entire roadmap to learn AI in 2025.']
Timestamp: 7.12
user: when to Modern AI
AI : ['Nowadays, I usually prefer the modern route for learning AI. This route is suitable for freshers, experienced professionals (0-5 years of experience), non-technical people who want to start coding, leaders, and very experienced individuals.']
Timestamp: 413.44
user: exit
Exiting chat.


In [3]:
result['messages']

[HumanMessage(content='\n    You are the YouTuber from the video, directly answering the viewer’s question.\n    Rules:\n    1. ONLY use the transcript provided below\n    2. Give the answer in simple, clear sentences — without timestamps inside the text.\n    3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.\n        - Do NOT round or estimate timestamps.\n        - If multiple transcript parts are relevant, return the most direct one.\n    4. Do NOT add greetings, filler, or extra commentary.\n    5. If the transcript does not answer, say: "Sorry, I didn’t talk about that in this video."\n\n    Transcript:\n    Hello all, my name is Krishna Nayak and (0.48) welcome to my YouTube channel. So guys, (2.56) today in this particular video, we are (4.96) going to go ahead and see the entire (7.12) road map to learn AI in 2025 (9.679) and here I\'m going to provide you all (13.679) the free resources, videos and (15.92) materials. So please make sure that 

In [None]:
with open(r"C:\Users\prana\Desktop\PROJECTS\tubetalk.ai\testing-folder\youtube_short_rag_prompt.txt", "r") as file:
    text = file.read()

print(text)

 user_input = f"""
        You are the YouTuber from the video, directly answering the viewerâ€™s question.
        Rules:
        1. ONLY use the transcript provided below.
        2. Give the answer in simple, clear sentences â€” without timestamps inside the text.
        3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.
           - Do NOT round or estimate timestamps.
           - If multiple transcript parts are relevant, return the most direct one.
        4. Do NOT add greetings, filler, or extra commentary.
        5. If the transcript does not answer, say: "Sorry, I didnâ€™t talk about that in this video."

        Transcript:
        {youtube_captions}

        Question:
        {user_input}

        Output format (for schema):
        - "answer": A list of 1â€“3 short strings that directly answer the question (no timestamps here).
        - "timestamps": The exact timestamp (in seconds) from the transcript where the answer was found.
     

# Testing 2 Working ✅

In [1]:
from dotenv import load_dotenv
load_dotenv()
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, START, END
from langgraph.checkpoint.memory import InMemorySaver
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from pydantic import BaseModel, Field
from typing import TypedDict, Annotated
import re
import os
from youtube_transcript_api import YouTubeTranscriptApi
# ------------------ Transcript Loader ------------------
def load_transcript(url: str) -> str | None:
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})'
    match = re.search(pattern, url)
    if match:
        video_id = match.group(1)
        try:
            captions = YouTubeTranscriptApi().fetch(video_id).snippets
            # join text + start_time
            data = [f"{item.text} ({item.start})" for item in captions]
            return " ".join(data)
        except Exception as e:
            print(f"Error fetching transcript: {e}")
            return None

from langchain.prompts import PromptTemplate
os.environ["LANGCHAIN_PROJECT"] = "TubeTalkAI Testing"

# ------------------ Build LLM (Gemini) ------------------
model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

In [72]:
# ChatState to store list of messages
class ChatState(TypedDict):
    messages: Annotated[list[BaseMessage], add_messages]

#Structured Model BaseModel
class AnsandTime(BaseModel):
    answer: list[str] = Field(
        description="Answers to user's question (do NOT include timestamps here)"
    )
    timestamps: float = Field(
        description="The time (in seconds) from where the answer is taken"
    )

structured_model  = model.with_structured_output(AnsandTime)
# Node that takes history and gets next response
def chat_node(state: ChatState):
    user_message = state["messages"]
    response = structured_model.invoke(user_message)
    ai_text = f"{' '.join(response.answer)}\nTimestamp: {response.timestamps}"
    return {
        "messages": [
            state["messages"][-1],
            AIMessage(content=ai_text)
        ]
    }

# SQLite checkpoint database
checkpointer = InMemorySaver()
# Build graph
graph = StateGraph(ChatState)
graph.add_node("chat_node", chat_node)
graph.add_edge(START, "chat_node")
graph.add_edge("chat_node", END)

# Compile with checkpointing
chatbot = graph.compile(checkpointer=checkpointer)

# Config
CONFIG = {'configurable': {'thread_id': "newthread"}}

In [73]:
youtube_input = "https://www.youtube.com/watch?v=s3KnSb9b4Pk"
youtube_captions = load_transcript(youtube_input)
youtube_captions

"Hello all, my name is Krishna Nayak and (0.48) welcome to my YouTube channel. So guys, (2.56) today in this particular video, we are (4.96) going to go ahead and see the entire (7.12) road map to learn AI in 2025 (9.679) and here I'm going to provide you all (13.679) the free resources, videos and (15.92) materials. So please make sure that you (18.32) watch this video till the end. And why I (20.48) am actually making this specific video? (23.199) The reason is very simple because right (25.519) now generative AI, agentic AI that is (27.92) the talk of the town. Everybody should (30.88) at least have the idea about generative (33.84) AI application, generative agentic AI (36.079) applications, how you should go ahead (38.16) and build. Whether you are coming from a (40.0) developer background, whether you are (42.0) program managers, whether you are in the (43.44) leadership position, everybody should (46.16) know this. The reason is very simple (48.719) because here you'll be able t

In [74]:
if not checkpointer.storage:
    print("Checkpointer is empty ❌")
else:
    print("Checkpointer has data ✅")

Checkpointer is empty ❌


In [75]:
from langchain.prompts import PromptTemplate

# ------------------ Prompt Template ------------------
prompt_template = PromptTemplate(
    input_variables=["transcript", "question"],
    template="""
You are the YouTuber from the video, directly answering the viewer’s question.

Rules:
1. ONLY use the transcript provided below.
2. Give the answer in simple, clear sentences — without timestamps inside the text.
3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.
   - Do NOT round or estimate timestamps.
   - If multiple transcript parts are relevant, return the most direct one.
4. Do NOT add greetings, filler, or extra commentary.
5. If the transcript does not answer, say: "Sorry, I didn’t talk about that in this video."

Transcript:
{transcript}

Question:
{question}

Output format (for schema):
- "answer": A list of 1–3 short strings that directly answer the question (no timestamps here).
- "timestamps": The exact timestamp (in seconds) from the transcript where the answer was found.
"""
)


In [None]:
user_input = "what is this Video ABout" 
if not checkpointer.storage:
        formatted_prompt = prompt_template.format(
            transcript=youtube_captions,
            question=user_input
        )
        user_input = formatted_prompt
result = chatbot.invoke(
                    {'messages' : [HumanMessage(content=user_input)]},
                    config=CONFIG
            )  

result['messages']

[HumanMessage(content='\nYou are the YouTuber from the video, directly answering the viewer’s question.\n\nRules:\n1. ONLY use the transcript provided below.\n2. Give the answer in simple, clear sentences — without timestamps inside the text.\n3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.\n   - Do NOT round or estimate timestamps.\n   - If multiple transcript parts are relevant, return the most direct one.\n4. Do NOT add greetings, filler, or extra commentary.\n5. If the transcript does not answer, say: "Sorry, I didn’t talk about that in this video."\n\nTranscript:\nHello all, my name is Krishna Nayak and (0.48) welcome to my YouTube channel. So guys, (2.56) today in this particular video, we are (4.96) going to go ahead and see the entire (7.12) road map to learn AI in 2025 (9.679) and here I\'m going to provide you all (13.679) the free resources, videos and (15.92) materials. So please make sure that you (18.32) watch this video till the end. 

In [80]:
last_msg = result["messages"][-1]  # get the last message
if isinstance(last_msg, AIMessage):
    print("AI:", last_msg.content)


AI: This video is about the entire roadmap to learn AI in 2025.
Timestamp: 7.12


In [81]:
result['messages']

[HumanMessage(content='\nYou are the YouTuber from the video, directly answering the viewer’s question.\n\nRules:\n1. ONLY use the transcript provided below.\n2. Give the answer in simple, clear sentences — without timestamps inside the text.\n3. ALWAYS return the exact timestamp (in seconds) from the transcript line you used.\n   - Do NOT round or estimate timestamps.\n   - If multiple transcript parts are relevant, return the most direct one.\n4. Do NOT add greetings, filler, or extra commentary.\n5. If the transcript does not answer, say: "Sorry, I didn’t talk about that in this video."\n\nTranscript:\nHello all, my name is Krishna Nayak and (0.48) welcome to my YouTube channel. So guys, (2.56) today in this particular video, we are (4.96) going to go ahead and see the entire (7.12) road map to learn AI in 2025 (9.679) and here I\'m going to provide you all (13.679) the free resources, videos and (15.92) materials. So please make sure that you (18.32) watch this video till the end. 